1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10 #ifdef GFLAGS
11 #ifdef NUMA
12 #include <numa.h>
13 #endif
14 #ifndef OS_WIN
15 #include <unistd.h>
16 #endif
17 #include <fcntl.h>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <sys/types.h>
21 #ifdef __APPLE__
22 #include <mach/host_info.h>
23 #include <mach/mach_host.h>
24 #include <sys/sysctl.h>
25 #endif
26 #ifdef __FreeBSD__
27 #include <sys/sysctl.h>
28 #endif
29 #include <atomic>
30 #include <cinttypes>
31 #include <condition_variable>
32 #include <cstddef>
33 #include <iostream>
34 #include <memory>
35 #include <mutex>
36 #include <queue>
37 #include <thread>
38 #include <unordered_map>
39
40 #include "db/db_impl/db_impl.h"
41 #include "db/malloc_stats.h"
42 #include "db/version_set.h"
43 #include "monitoring/histogram.h"
44 #include "monitoring/statistics.h"
45 #include "options/cf_options.h"
46 #include "port/port.h"
47 #include "port/stack_trace.h"
48 #include "rocksdb/cache.h"
49 #include "rocksdb/convenience.h"
50 #include "rocksdb/db.h"
51 #include "rocksdb/env.h"
52 #include "rocksdb/filter_policy.h"
53 #include "rocksdb/memtablerep.h"
54 #include "rocksdb/options.h"
55 #include "rocksdb/perf_context.h"
56 #include "rocksdb/persistent_cache.h"
57 #include "rocksdb/rate_limiter.h"
58 #include "rocksdb/secondary_cache.h"
59 #include "rocksdb/slice.h"
60 #include "rocksdb/slice_transform.h"
61 #include "rocksdb/stats_history.h"
62 #include "rocksdb/table.h"
63 #include "rocksdb/utilities/backup_engine.h"
64 #include "rocksdb/utilities/object_registry.h"
65 #include "rocksdb/utilities/optimistic_transaction_db.h"
66 #include "rocksdb/utilities/options_type.h"
67 #include "rocksdb/utilities/options_util.h"
68 #ifndef ROCKSDB_LITE
69 #include "rocksdb/utilities/replayer.h"
70 #endif // ROCKSDB_LITE
71 #include "rocksdb/utilities/sim_cache.h"
72 #include "rocksdb/utilities/transaction.h"
73 #include "rocksdb/utilities/transaction_db.h"
74 #include "rocksdb/write_batch.h"
75 #include "test_util/testutil.h"
76 #include "test_util/transaction_test_util.h"
77 #include "tools/simulated_hybrid_file_system.h"
78 #include "util/cast_util.h"
79 #include "util/compression.h"
80 #include "util/crc32c.h"
81 #include "util/file_checksum_helper.h"
82 #include "util/gflags_compat.h"
83 #include "util/mutexlock.h"
84 #include "util/random.h"
85 #include "util/stderr_logger.h"
86 #include "util/string_util.h"
87 #include "util/xxhash.h"
88 #include "utilities/blob_db/blob_db.h"
89 #include "utilities/counted_fs.h"
90 #include "utilities/merge_operators.h"
91 #include "utilities/merge_operators/bytesxor.h"
92 #include "utilities/merge_operators/sortlist.h"
93 #include "utilities/persistent_cache/block_cache_tier.h"
94
95 #ifdef MEMKIND
96 #include "memory/memkind_kmem_allocator.h"
97 #endif
98
99 #ifdef OS_WIN
100 #include <io.h> // open/close
101 #endif
102
103 using GFLAGS_NAMESPACE::ParseCommandLineFlags;
104 using GFLAGS_NAMESPACE::RegisterFlagValidator;
105 using GFLAGS_NAMESPACE::SetUsageMessage;
106 using GFLAGS_NAMESPACE::SetVersionString;
107
108 #ifdef ROCKSDB_LITE
109 #define IF_ROCKSDB_LITE(Then, Else) Then
110 #else
111 #define IF_ROCKSDB_LITE(Then, Else) Else
112 #endif
113
114 DEFINE_string(
115 benchmarks,
116 "fillseq,"
117 "fillseqdeterministic,"
118 "fillsync,"
119 "fillrandom,"
120 "filluniquerandomdeterministic,"
121 "overwrite,"
122 "readrandom,"
123 "newiterator,"
124 "newiteratorwhilewriting,"
125 "seekrandom,"
126 "seekrandomwhilewriting,"
127 "seekrandomwhilemerging,"
128 "readseq,"
129 "readreverse,"
130 "compact,"
131 "compactall,"
132 "flush,"
133 IF_ROCKSDB_LITE("",
134 "compact0,"
135 "compact1,"
136 "waitforcompaction,"
137 )
138 "multireadrandom,"
139 "mixgraph,"
140 "readseq,"
141 "readtorowcache,"
142 "readtocache,"
143 "readreverse,"
144 "readwhilewriting,"
145 "readwhilemerging,"
146 "readwhilescanning,"
147 "readrandomwriterandom,"
148 "updaterandom,"
149 "xorupdaterandom,"
150 "approximatesizerandom,"
151 "randomwithverify,"
152 "fill100K,"
153 "crc32c,"
154 "xxhash,"
155 "xxhash64,"
156 "xxh3,"
157 "compress,"
158 "uncompress,"
159 "acquireload,"
160 "fillseekseq,"
161 "randomtransaction,"
162 "randomreplacekeys,"
163 "timeseries,"
164     "getmergeoperands,"
165     "readrandomoperands,"
166     "backup,"
167     "restore",
168
169 "Comma-separated list of operations to run in the specified"
170 " order. Available benchmarks:\n"
171 "\tfillseq -- write N values in sequential key"
172 " order in async mode\n"
173 "\tfillseqdeterministic -- write N values in the specified"
174 " key order and keep the shape of the LSM tree\n"
175 "\tfillrandom -- write N values in random key order in async"
176 " mode\n"
177 "\tfilluniquerandomdeterministic -- write N values in a random"
178 " key order and keep the shape of the LSM tree\n"
179 "\toverwrite -- overwrite N values in random key order in "
180 "async mode\n"
181 "\tfillsync -- write N/1000 values in random key order in "
182 "sync mode\n"
183 "\tfill100K -- write N/1000 100K values in random order in"
184 " async mode\n"
185 "\tdeleteseq -- delete N keys in sequential order\n"
186 "\tdeleterandom -- delete N keys in random order\n"
187 "\treadseq -- read N times sequentially\n"
188 "\treadtocache -- 1 thread reading database sequentially\n"
189 "\treadreverse -- read N times in reverse order\n"
190 "\treadrandom -- read N times in random order\n"
191 "\treadmissing -- read N missing keys in random order\n"
192 "\treadwhilewriting -- 1 writer, N threads doing random "
193 "reads\n"
194 "\treadwhilemerging -- 1 merger, N threads doing random "
195 "reads\n"
196 "\treadwhilescanning -- 1 thread doing full table scan, "
197 "N threads doing random reads\n"
198 "\treadrandomwriterandom -- N threads doing random-read, "
199 "random-write\n"
200 "\tupdaterandom -- N threads doing read-modify-write for random "
201 "keys\n"
202 "\txorupdaterandom -- N threads doing read-XOR-write for "
203 "random keys\n"
204 "\tappendrandom -- N threads doing read-modify-write with "
205 "growing values\n"
206 "\tmergerandom -- same as updaterandom/appendrandom using merge"
207 " operator. "
208 "Must be used with merge_operator\n"
209 "\treadrandommergerandom -- perform N random read-or-merge "
210 "operations. Must be used with merge_operator\n"
211 "\tnewiterator -- repeated iterator creation\n"
212 "\tseekrandom -- N random seeks, call Next seek_nexts times "
213 "per seek\n"
214 "\tseekrandomwhilewriting -- seekrandom and 1 thread doing "
215 "overwrite\n"
216 "\tseekrandomwhilemerging -- seekrandom and 1 thread doing "
217 "merge\n"
218 "\tcrc32c -- repeated crc32c of <block size> data\n"
219 "\txxhash -- repeated xxHash of <block size> data\n"
220 "\txxhash64 -- repeated xxHash64 of <block size> data\n"
221 "\txxh3 -- repeated XXH3 of <block size> data\n"
222 "\tacquireload -- load N*1000 times\n"
223 "\tfillseekseq -- write N values in sequential key, then read "
224 "them by seeking to each key\n"
225 "\trandomtransaction -- execute N random transactions and "
226 "verify correctness\n"
227 "\trandomreplacekeys -- randomly replaces N keys by deleting "
228 "the old version and putting the new version\n\n"
229 "\ttimeseries -- 1 writer generates time series data "
230 "and multiple readers doing random reads on id\n\n"
231 "Meta operations:\n"
232 "\tcompact -- Compact the entire DB; If multiple, randomly choose one\n"
233 "\tcompactall -- Compact the entire DB\n"
234 IF_ROCKSDB_LITE("",
235 "\tcompact0 -- compact L0 into L1\n"
236 "\tcompact1 -- compact L1 into L2\n"
237     "\twaitforcompaction -- pause until compaction is (probably) done\n"
238     )
239     "\tflush -- flush the memtable\n"
240 "\tstats -- Print DB stats\n"
241 "\tresetstats -- Reset DB stats\n"
242 "\tlevelstats -- Print the number of files and bytes per level\n"
243 "\tmemstats -- Print memtable stats\n"
244 "\tsstables -- Print sstable info\n"
245 "\theapprofile -- Dump a heap profile (if supported by this port)\n"
246 IF_ROCKSDB_LITE("",
247 "\treplay -- replay the trace file specified with trace_file\n"
248 )
249 "\tgetmergeoperands -- Insert lots of merge records which are a list of "
250 "sorted ints for a key and then compare performance of lookup for another "
251 "key by doing a Get followed by binary searching in the large sorted list "
252 "vs doing a GetMergeOperands and binary searching in the operands which "
253 "are sorted sub-lists. The MergeOperator used is sortlist.h\n"
254 "\treadrandomoperands -- read random keys using `GetMergeOperands()`. An "
255 "operation includes a rare but possible retry in case it got "
256 "`Status::Incomplete()`. This happens upon encountering more keys than "
257 "have ever been seen by the thread (or eight initially)\n"
258     "\tbackup -- Create a backup of the current DB and verify that the new backup is correct. "
259 "Rate limit can be specified through --backup_rate_limit\n"
260 "\trestore -- Restore the DB from the latest backup available, rate limit can be specified through --restore_rate_limit\n");
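// Editor's note: the default value above is itself just a comma-separated
// benchmark list, so a run selects benchmarks the same way. A minimal,
// hypothetical invocation (binary name and values are illustrative only):
//
//   ./db_bench --benchmarks="fillseq,readrandom,stats" \
//              --num=1000000 --threads=4 --value_size=100
//
// --num, --threads and --value_size are defined later in this file.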
261
262 DEFINE_int64(num, 1000000, "Number of key/values to place in database");
263
264 DEFINE_int64(numdistinct, 1000,
265 "Number of distinct keys to use. Used in RandomWithVerify to "
266 "read/write on fewer keys so that gets are more likely to find the"
267 " key and puts are more likely to update the same key");
268
269 DEFINE_int64(merge_keys, -1,
270 "Number of distinct keys to use for MergeRandom and "
271 "ReadRandomMergeRandom. "
272 "If negative, there will be FLAGS_num keys.");
273 DEFINE_int32(num_column_families, 1, "Number of Column Families to use.");
274
275 DEFINE_int32(
276 num_hot_column_families, 0,
277 "Number of Hot Column Families. If more than 0, only write to this "
278 "number of column families. After finishing all the writes to them, "
279 "create new set of column families and insert to them. Only used "
280 "when num_column_families > 1.");
281
282 DEFINE_string(column_family_distribution, "",
283 "Comma-separated list of percentages, where the ith element "
284 "indicates the probability of an op using the ith column family. "
285 "The number of elements must be `num_hot_column_families` if "
286 "specified; otherwise, it must be `num_column_families`. The "
287 "sum of elements must be 100. E.g., if `num_column_families=4`, "
288 "and `num_hot_column_families=0`, a valid list could be "
289 "\"10,20,30,40\".");
290
291 DEFINE_int64(reads, -1,
292 "Number of read operations to do. "
293 "If negative, do FLAGS_num reads.");
294
295 DEFINE_int64(deletes, -1,
296 "Number of delete operations to do. "
297 "If negative, do FLAGS_num deletions.");
298
299 DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality");
300
301 DEFINE_int64(seed, 0,
302 "Seed base for random number generators. "
303 "When 0 it is derived from the current time.");
304 static int64_t seed_base;
305
306 DEFINE_int32(threads, 1, "Number of concurrent threads to run.");
307
308 DEFINE_int32(duration, 0,
309 "Time in seconds for the random-ops tests to run."
310 " When 0 then num & reads determine the test duration");
311
312 DEFINE_string(value_size_distribution_type, "fixed",
313 "Value size distribution type: fixed, uniform, normal");
314
315 DEFINE_int32(value_size, 100, "Size of each value in fixed distribution");
316 static unsigned int value_size = 100;
317
318 DEFINE_int32(value_size_min, 100, "Min size of random value");
319
320 DEFINE_int32(value_size_max, 102400, "Max size of random value");
321
322 DEFINE_int32(seek_nexts, 0,
323 "How many times to call Next() after Seek() in "
324 "fillseekseq, seekrandom, seekrandomwhilewriting and "
325 "seekrandomwhilemerging");
326
327 DEFINE_bool(reverse_iterator, false,
328 "When true use Prev rather than Next for iterators that do "
329 "Seek and then Next");
330
331 DEFINE_bool(auto_prefix_mode, false, "Set auto_prefix_mode for seek benchmark");
332
333 DEFINE_int64(max_scan_distance, 0,
334 "Used to define iterate_upper_bound (or iterate_lower_bound "
335 "if FLAGS_reverse_iterator is set to true) when value is nonzero");
336
337 DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator");
338
339 DEFINE_int64(batch_size, 1, "Batch size");
340
341 static bool ValidateKeySize(const char* /*flagname*/, int32_t /*value*/) {
342 return true;
343 }
344
345 static bool ValidateUint32Range(const char* flagname, uint64_t value) {
346 if (value > std::numeric_limits<uint32_t>::max()) {
347 fprintf(stderr, "Invalid value for --%s: %lu, overflow\n", flagname,
348 (unsigned long)value);
349 return false;
350 }
351 return true;
352 }
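// Editor's note: validators such as ValidateUint32Range above are attached to
// a flag via gflags' RegisterFlagValidator. A sketch of the pattern, which is
// used for --subcompactions further below:
//
//   static const bool dummy __attribute__((__unused__)) =
//       RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range);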
353
354 DEFINE_int32(key_size, 16, "size of each key");
355
356 DEFINE_int32(user_timestamp_size, 0,
357 "number of bytes in a user-defined timestamp");
358
359 DEFINE_int32(num_multi_db, 0,
360 "Number of DBs used in the benchmark. 0 means single DB.");
361
362 DEFINE_double(compression_ratio, 0.5,
363 "Arrange to generate values that shrink to this fraction of "
364 "their original size after compression");
365
366 DEFINE_double(
367 overwrite_probability, 0.0,
368 "Used in 'filluniquerandom' benchmark: for each write operation, "
369 "we give a probability to perform an overwrite instead. The key used for "
370 "the overwrite is randomly chosen from the last 'overwrite_window_size' "
371 "keys previously inserted into the DB. "
372 "Valid overwrite_probability values: [0.0, 1.0].");
373
374 DEFINE_uint32(overwrite_window_size, 1,
375 "Used in 'filluniquerandom' benchmark. For each write operation,"
376 " when the overwrite_probability flag is set by the user, the "
377 "key used to perform an overwrite is randomly chosen from the "
378 "last 'overwrite_window_size' keys previously inserted into DB. "
379 "Warning: large values can affect throughput. "
380 "Valid overwrite_window_size values: [1, kMaxUint32].");
381
382 DEFINE_uint64(
383 disposable_entries_delete_delay, 0,
384 "Minimum delay in microseconds for the series of Deletes "
385 "to be issued. When 0 the insertion of the last disposable entry is "
386 "immediately followed by the issuance of the Deletes. "
387 "(only compatible with fillanddeleteuniquerandom benchmark).");
388
389 DEFINE_uint64(disposable_entries_batch_size, 0,
390 "Number of consecutively inserted disposable KV entries "
391 "that will be deleted after 'delete_delay' microseconds. "
392 "A series of Deletes is always issued once all the "
393 "disposable KV entries it targets have been inserted "
394 "into the DB. When 0 no deletes are issued and a "
395 "regular 'filluniquerandom' benchmark occurs. "
396 "(only compatible with fillanddeleteuniquerandom benchmark)");
397
398 DEFINE_int32(disposable_entries_value_size, 64,
399 "Size of the values (in bytes) of the entries targeted by "
400 "selective deletes. "
401 "(only compatible with fillanddeleteuniquerandom benchmark)");
402
403 DEFINE_uint64(
404 persistent_entries_batch_size, 0,
405 "Number of KV entries being inserted right before the deletes "
406 "targeting the disposable KV entries are issued. These "
407 "persistent keys are not targeted by the deletes, and will always "
408 "remain valid in the DB. (only compatible with "
409 "--benchmarks='fillanddeleteuniquerandom' "
410     "and used when --disposable_entries_batch_size is > 0).");
411
412 DEFINE_int32(persistent_entries_value_size, 64,
413 "Size of the values (in bytes) of the entries not targeted by "
414 "deletes. (only compatible with "
415 "--benchmarks='fillanddeleteuniquerandom' "
416     "and used when --disposable_entries_batch_size is > 0).");
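// Editor's sketch of how the disposable/persistent knobs above compose in the
// fillanddeleteuniquerandom benchmark (values below are hypothetical):
//
//   ./db_bench --benchmarks=fillanddeleteuniquerandom --num=1000000 \
//       --disposable_entries_batch_size=100 \
//       --disposable_entries_delete_delay=1000 \
//       --persistent_entries_batch_size=10
//
// Per the flag descriptions, each batch of 100 disposable entries is followed
// by 10 persistent entries, and the Deletes targeting the disposable batch are
// issued no sooner than 1000 microseconds after its last entry was inserted.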
417
418 DEFINE_double(read_random_exp_range, 0.0,
419 "Read random's key will be generated using distribution of "
420 "num * exp(-r) where r is uniform number from 0 to this value. "
421 "The larger the number is, the more skewed the reads are. "
422 "Only used in readrandom and multireadrandom benchmarks.");
423
424 DEFINE_bool(histogram, false, "Print histogram of operation timings");
425
426 DEFINE_bool(confidence_interval_only, false,
427 "Print 95% confidence interval upper and lower bounds only for "
428 "aggregate stats.");
429
430 DEFINE_bool(enable_numa, false,
431 "Make operations aware of NUMA architecture and bind memory "
432 "and cpus corresponding to nodes together. In NUMA, memory "
433 "in same node as CPUs are closer when compared to memory in "
434 "other nodes. Reads can be faster when the process is bound to "
435 "CPU and memory of same node. Use \"$numactl --hardware\" command "
436 "to see NUMA memory architecture.");
437
438 DEFINE_int64(db_write_buffer_size,
439 ROCKSDB_NAMESPACE::Options().db_write_buffer_size,
440 "Number of bytes to buffer in all memtables before compacting");
441
442 DEFINE_bool(cost_write_buffer_to_cache, false,
443     "The memory usage of memtables is charged to the block cache");
444
445 DEFINE_int64(arena_block_size, ROCKSDB_NAMESPACE::Options().arena_block_size,
446 "The size, in bytes, of one block in arena memory allocation.");
447
448 DEFINE_int64(write_buffer_size, ROCKSDB_NAMESPACE::Options().write_buffer_size,
449 "Number of bytes to buffer in memtable before compacting");
450
451 DEFINE_int32(max_write_buffer_number,
452 ROCKSDB_NAMESPACE::Options().max_write_buffer_number,
453 "The number of in-memory memtables. Each memtable is of size"
454 " write_buffer_size bytes.");
455
456 DEFINE_int32(min_write_buffer_number_to_merge,
457 ROCKSDB_NAMESPACE::Options().min_write_buffer_number_to_merge,
458     "The minimum number of write buffers that will be merged together "
459     "before writing to storage. This is cheap because it is an "
460     "in-memory merge. If this feature is not enabled, then all these "
461 "write buffers are flushed to L0 as separate files and this "
462 "increases read amplification because a get request has to check"
463 " in all of these files. Also, an in-memory merge may result in"
464 " writing less data to storage if there are duplicate records "
465 " in each of these individual write buffers.");
466
467 DEFINE_int32(max_write_buffer_number_to_maintain,
468 ROCKSDB_NAMESPACE::Options().max_write_buffer_number_to_maintain,
469 "The total maximum number of write buffers to maintain in memory "
470 "including copies of buffers that have already been flushed. "
471 "Unlike max_write_buffer_number, this parameter does not affect "
472 "flushing. This controls the minimum amount of write history "
473 "that will be available in memory for conflict checking when "
474 "Transactions are used. If this value is too low, some "
475 "transactions may fail at commit time due to not being able to "
476 "determine whether there were any write conflicts. Setting this "
477 "value to 0 will cause write buffers to be freed immediately "
478 "after they are flushed. If this value is set to -1, "
479 "'max_write_buffer_number' will be used.");
480
481 DEFINE_int64(max_write_buffer_size_to_maintain,
482 ROCKSDB_NAMESPACE::Options().max_write_buffer_size_to_maintain,
483 "The total maximum size of write buffers to maintain in memory "
484 "including copies of buffers that have already been flushed. "
485 "Unlike max_write_buffer_number, this parameter does not affect "
486 "flushing. This controls the minimum amount of write history "
487 "that will be available in memory for conflict checking when "
488 "Transactions are used. If this value is too low, some "
489 "transactions may fail at commit time due to not being able to "
490 "determine whether there were any write conflicts. Setting this "
491 "value to 0 will cause write buffers to be freed immediately "
492 "after they are flushed. If this value is set to -1, "
493 "'max_write_buffer_number' will be used.");
494
495 DEFINE_int32(max_background_jobs,
496 ROCKSDB_NAMESPACE::Options().max_background_jobs,
497 "The maximum number of concurrent background jobs that can occur "
498 "in parallel.");
499
500 DEFINE_int32(num_bottom_pri_threads, 0,
501 "The number of threads in the bottom-priority thread pool (used "
502 "by universal compaction only).");
503
504 DEFINE_int32(num_high_pri_threads, 0,
505 "The maximum number of concurrent background compactions"
506 " that can occur in parallel.");
507
508 DEFINE_int32(num_low_pri_threads, 0,
509 "The maximum number of concurrent background compactions"
510 " that can occur in parallel.");
511
512 DEFINE_int32(max_background_compactions,
513 ROCKSDB_NAMESPACE::Options().max_background_compactions,
514 "The maximum number of concurrent background compactions"
515 " that can occur in parallel.");
516
517 DEFINE_uint64(subcompactions, 1,
518 "Maximum number of subcompactions to divide L0-L1 compactions "
519 "into.");
520 static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) =
521 RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range);
522
523 DEFINE_int32(max_background_flushes,
524 ROCKSDB_NAMESPACE::Options().max_background_flushes,
525 "The maximum number of concurrent background flushes"
526 " that can occur in parallel.");
527
528 static ROCKSDB_NAMESPACE::CompactionStyle FLAGS_compaction_style_e;
529 DEFINE_int32(compaction_style,
530 (int32_t)ROCKSDB_NAMESPACE::Options().compaction_style,
531 "style of compaction: level-based, universal and fifo");
532
533 static ROCKSDB_NAMESPACE::CompactionPri FLAGS_compaction_pri_e;
534 DEFINE_int32(compaction_pri,
535 (int32_t)ROCKSDB_NAMESPACE::Options().compaction_pri,
536     "priority of files for compaction: by size or by data age");
537
538 DEFINE_int32(universal_size_ratio, 0,
539 "Percentage flexibility while comparing file size "
540 "(for universal compaction only).");
541
542 DEFINE_int32(universal_min_merge_width, 0,
543 "The minimum number of files in a single compaction run "
544 "(for universal compaction only).");
545
546 DEFINE_int32(universal_max_merge_width, 0,
547 "The max number of files to compact in universal style "
548 "compaction");
549
550 DEFINE_int32(universal_max_size_amplification_percent, 0,
551 "The max size amplification for universal style compaction");
552
553 DEFINE_int32(universal_compression_size_percent, -1,
554 "The percentage of the database to compress for universal "
555 "compaction. -1 means compress everything.");
556
557 DEFINE_bool(universal_allow_trivial_move, false,
558 "Allow trivial move in universal compaction.");
559
560 DEFINE_bool(universal_incremental, false,
561 "Enable incremental compactions in universal compaction.");
562
563 DEFINE_int64(cache_size, 8 << 20, // 8MB
564 "Number of bytes to use as a cache of uncompressed data");
565
566 DEFINE_int32(cache_numshardbits, -1,
567 "Number of shards for the block cache"
568 " is 2 ** cache_numshardbits. Negative means use default settings."
569 " This is applied only if FLAGS_cache_size is non-negative.");
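// Editor's note (worked example): the shard count is 2 ** cache_numshardbits,
// so --cache_numshardbits=6 splits the block cache into 2^6 = 64 shards, while
// a negative value leaves the sharding at the library default.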
570
571 DEFINE_double(cache_high_pri_pool_ratio, 0.0,
572 "Ratio of block cache reserve for high pri blocks. "
573 "If > 0.0, we also enable "
574 "cache_index_and_filter_blocks_with_high_priority.");
575
576 DEFINE_double(cache_low_pri_pool_ratio, 0.0,
577 "Ratio of block cache reserve for low pri blocks.");
578
579 DEFINE_string(cache_type, "lru_cache", "Type of block cache.");
580
581 DEFINE_bool(use_compressed_secondary_cache, false,
582 "Use the CompressedSecondaryCache as the secondary cache.");
583
584 DEFINE_int64(compressed_secondary_cache_size, 8 << 20, // 8MB
585 "Number of bytes to use as a cache of data");
586
587 DEFINE_int32(compressed_secondary_cache_numshardbits, 6,
588 "Number of shards for the block cache"
589 " is 2 ** compressed_secondary_cache_numshardbits."
590 " Negative means use default settings."
591 " This is applied only if FLAGS_cache_size is non-negative.");
592
593 DEFINE_double(compressed_secondary_cache_high_pri_pool_ratio, 0.0,
594 "Ratio of block cache reserve for high pri blocks. "
595 "If > 0.0, we also enable "
596 "cache_index_and_filter_blocks_with_high_priority.");
597
598 DEFINE_double(compressed_secondary_cache_low_pri_pool_ratio, 0.0,
599 "Ratio of block cache reserve for low pri blocks.");
600
601 DEFINE_string(compressed_secondary_cache_compression_type, "lz4",
602 "The compression algorithm to use for large "
603 "values stored in CompressedSecondaryCache.");
604 static enum ROCKSDB_NAMESPACE::CompressionType
605 FLAGS_compressed_secondary_cache_compression_type_e =
606 ROCKSDB_NAMESPACE::kLZ4Compression;
607
608 DEFINE_uint32(
609 compressed_secondary_cache_compress_format_version, 2,
610 "compress_format_version can have two values: "
611 "compress_format_version == 1 -- decompressed size is not included"
612     " in the block header. "
613 "compress_format_version == 2 -- decompressed size is included"
614 " in the block header in varint32 format.");
615
616 DEFINE_int64(simcache_size, -1,
617 "Number of bytes to use as a simcache of "
618     "uncompressed data. Negative value disables simcache.");
619
620 DEFINE_bool(cache_index_and_filter_blocks, false,
621 "Cache index/filter blocks in block cache.");
622
623 DEFINE_bool(use_cache_jemalloc_no_dump_allocator, false,
624 "Use JemallocNodumpAllocator for block/blob cache.");
625
626 DEFINE_bool(use_cache_memkind_kmem_allocator, false,
627 "Use memkind kmem allocator for block/blob cache.");
628
629 DEFINE_bool(partition_index_and_filters, false,
630 "Partition index and filter blocks.");
631
632 DEFINE_bool(partition_index, false, "Partition index blocks");
633
634 DEFINE_bool(index_with_first_key, false, "Include first key in the index");
635
636 DEFINE_bool(
637 optimize_filters_for_memory,
638 ROCKSDB_NAMESPACE::BlockBasedTableOptions().optimize_filters_for_memory,
639 "Minimize memory footprint of filters");
640
641 DEFINE_int64(
642 index_shortening_mode, 2,
643     "mode to shorten index: 0 for no shortening; 1 for shortening "
644     "separators only; 2 for shortening separators and successor");
645
646 DEFINE_int64(metadata_block_size,
647 ROCKSDB_NAMESPACE::BlockBasedTableOptions().metadata_block_size,
648 "Max partition size when partitioning index/filters");
649
650 // The default reduces the overhead of reading time with flash. With HDD, which
651 // offers much less throughput, however, this number is better set to 1.
652 DEFINE_int32(ops_between_duration_checks, 1000,
653 "Check duration limit every x ops");
654
655 DEFINE_bool(pin_l0_filter_and_index_blocks_in_cache, false,
656 "Pin index/filter blocks of L0 files in block cache.");
657
658 DEFINE_bool(
659 pin_top_level_index_and_filter, false,
660 "Pin top-level index of partitioned index/filter blocks in block cache.");
661
662 DEFINE_int32(block_size,
663 static_cast<int32_t>(
664 ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_size),
665 "Number of bytes in a block.");
666
667 DEFINE_int32(format_version,
668 static_cast<int32_t>(
669 ROCKSDB_NAMESPACE::BlockBasedTableOptions().format_version),
670 "Format version of SST files.");
671
672 DEFINE_int32(block_restart_interval,
673 ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_restart_interval,
674 "Number of keys between restart points "
675 "for delta encoding of keys in data block.");
676
677 DEFINE_int32(
678 index_block_restart_interval,
679 ROCKSDB_NAMESPACE::BlockBasedTableOptions().index_block_restart_interval,
680 "Number of keys between restart points "
681 "for delta encoding of keys in index block.");
682
683 DEFINE_int32(read_amp_bytes_per_bit,
684 ROCKSDB_NAMESPACE::BlockBasedTableOptions().read_amp_bytes_per_bit,
685 "Number of bytes per bit to be used in block read-amp bitmap");
686
687 DEFINE_bool(
688 enable_index_compression,
689 ROCKSDB_NAMESPACE::BlockBasedTableOptions().enable_index_compression,
690 "Compress the index block");
691
692 DEFINE_bool(block_align,
693 ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_align,
694 "Align data blocks on page size");
695
696 DEFINE_int64(prepopulate_block_cache, 0,
697 "Pre-populate hot/warm blocks in block cache. 0 to disable and 1 "
698 "to insert during flush");
699
700 DEFINE_bool(use_data_block_hash_index, false,
701     "If true, use kDataBlockBinaryAndHash "
702     "instead of kDataBlockBinarySearch. "
703     "This is only valid when using BlockBasedTable");
704
705 DEFINE_double(data_block_hash_table_util_ratio, 0.75,
706 "util ratio for data block hash index table. "
707 "This is only valid if use_data_block_hash_index is "
708 "set to true");
709
710 DEFINE_int64(compressed_cache_size, -1,
711 "Number of bytes to use as a cache of compressed data.");
712
713 DEFINE_int64(row_cache_size, 0,
714 "Number of bytes to use as a cache of individual rows"
715 " (0 = disabled).");
716
717 DEFINE_int32(open_files, ROCKSDB_NAMESPACE::Options().max_open_files,
718 "Maximum number of files to keep open at the same time"
719 " (use default if == 0)");
720
721 DEFINE_int32(file_opening_threads,
722 ROCKSDB_NAMESPACE::Options().max_file_opening_threads,
723     "If open_files is set to -1, this option sets the number of "
724 "threads that will be used to open files during DB::Open()");
725
726 DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size");
727
728 DEFINE_int32(log_readahead_size, 0, "WAL and manifest readahead size");
729
730 DEFINE_int32(random_access_max_buffer_size, 1024 * 1024,
731     "Maximum Windows random access buffer size");
732
733 DEFINE_int32(writable_file_max_buffer_size, 1024 * 1024,
734 "Maximum write buffer for Writable File");
735
736 DEFINE_int32(bloom_bits, -1,
737     "Bloom filter bits per key. Negative means use default. "
738 "Zero disables.");
739
740 DEFINE_bool(use_ribbon_filter, false, "Use Ribbon instead of Bloom filter");
741
742 DEFINE_double(memtable_bloom_size_ratio, 0,
743 "Ratio of memtable size used for bloom filter. 0 means no bloom "
744 "filter.");
745 DEFINE_bool(memtable_whole_key_filtering, false,
746 "Try to use whole key bloom filter in memtables.");
747 DEFINE_bool(memtable_use_huge_page, false,
748 "Try to use huge page in memtables.");
749
750 DEFINE_bool(whole_key_filtering,
751 ROCKSDB_NAMESPACE::BlockBasedTableOptions().whole_key_filtering,
752 "Use whole keys (in addition to prefixes) in SST bloom filter.");
753
754 DEFINE_bool(use_existing_db, false,
755 "If true, do not destroy the existing database. If you set this "
756 "flag and also specify a benchmark that wants a fresh database, "
757 "that benchmark will fail.");
758
759 DEFINE_bool(use_existing_keys, false,
760 "If true, uses existing keys in the DB, "
761 "rather than generating new ones. This involves some startup "
762 "latency to load all keys into memory. It is supported for the "
763 "same read/overwrite benchmarks as `-use_existing_db=true`, which "
764 "must also be set for this flag to be enabled. When this flag is "
765 "set, the value for `-num` will be ignored.");
766
767 DEFINE_bool(show_table_properties, false,
768 "If true, then per-level table"
769 " properties will be printed on every stats-interval when"
770 " stats_interval is set and stats_per_interval is on.");
771
772 DEFINE_string(db, "", "Use the db with the following name.");
773
774 DEFINE_bool(progress_reports, true,
775 "If true, db_bench will report number of finished operations.");
776
777 // Read cache flags
778
779 DEFINE_string(read_cache_path, "",
780 "If not empty string, a read cache will be used in this path");
781
782 DEFINE_int64(read_cache_size, 4LL * 1024 * 1024 * 1024,
783 "Maximum size of the read cache");
784
785 DEFINE_bool(read_cache_direct_write, true,
786 "Whether to use Direct IO for writing to the read cache");
787
788 DEFINE_bool(read_cache_direct_read, true,
789 "Whether to use Direct IO for reading from read cache");
790
791 DEFINE_bool(use_keep_filter, false, "Whether to use a noop compaction filter");
792
793 static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) {
794 if (value >= 20) {
795 fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n", flagname,
796 value);
797 return false;
798 }
799 return true;
800 }
801
802 DEFINE_bool(verify_checksum, true,
803 "Verify checksum for every block read from storage");
804
805 DEFINE_int32(checksum_type,
806 ROCKSDB_NAMESPACE::BlockBasedTableOptions().checksum,
807 "ChecksumType as an int");
808
809 DEFINE_bool(statistics, false, "Database statistics");
810 DEFINE_int32(stats_level, ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers,
811 "stats level for statistics");
812 DEFINE_string(statistics_string, "", "Serialized statistics string");
813 static class std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats;
814
815 DEFINE_int64(writes, -1,
816 "Number of write operations to do. If negative, do --num reads.");
817
818 DEFINE_bool(finish_after_writes, false,
819 "Write thread terminates after all writes are finished");
820
821 DEFINE_bool(sync, false, "Sync all writes to disk");
822
823 DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
824
825 DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
826
827 DEFINE_bool(manual_wal_flush, false,
828 "If true, buffer WAL until buffer is full or a manual FlushWAL().");
829
830 DEFINE_string(wal_compression, "none",
831 "Algorithm to use for WAL compression. none to disable.");
832 static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_wal_compression_e =
833 ROCKSDB_NAMESPACE::kNoCompression;
834
835 DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL");
836
837 DEFINE_string(truth_db, "/dev/shm/truth_db/dbbench",
838 "Truth key/values used when using verify");
839
840 DEFINE_int32(num_levels, 7, "The total number of levels");
841
842 DEFINE_int64(target_file_size_base,
843 ROCKSDB_NAMESPACE::Options().target_file_size_base,
844 "Target file size at level-1");
845
846 DEFINE_int32(target_file_size_multiplier,
847 ROCKSDB_NAMESPACE::Options().target_file_size_multiplier,
848 "A multiplier to compute target level-N file size (N >= 2)");
849
850 DEFINE_uint64(max_bytes_for_level_base,
851 ROCKSDB_NAMESPACE::Options().max_bytes_for_level_base,
852 "Max bytes for level-1");
853
854 DEFINE_bool(level_compaction_dynamic_level_bytes, false,
855 "Whether level size base is dynamic");
856
857 DEFINE_double(max_bytes_for_level_multiplier, 10,
858 "A multiplier to compute max bytes for level-N (N >= 2)");
859
860 static std::vector<int> FLAGS_max_bytes_for_level_multiplier_additional_v;
861 DEFINE_string(max_bytes_for_level_multiplier_additional, "",
862 "A vector that specifies additional fanout per level");
863
864 DEFINE_int32(level0_stop_writes_trigger,
865 ROCKSDB_NAMESPACE::Options().level0_stop_writes_trigger,
866 "Number of files in level-0 that will trigger put stop.");
867
868 DEFINE_int32(level0_slowdown_writes_trigger,
869 ROCKSDB_NAMESPACE::Options().level0_slowdown_writes_trigger,
870 "Number of files in level-0 that will slow down writes.");
871
872 DEFINE_int32(level0_file_num_compaction_trigger,
873 ROCKSDB_NAMESPACE::Options().level0_file_num_compaction_trigger,
874 "Number of files in level-0 when compactions start.");
875
876 DEFINE_uint64(periodic_compaction_seconds,
877 ROCKSDB_NAMESPACE::Options().periodic_compaction_seconds,
878 "Files older than this will be picked up for compaction and"
879 " rewritten to the same level");
880
881 DEFINE_uint64(ttl_seconds, ROCKSDB_NAMESPACE::Options().ttl, "Set options.ttl");
882
883 static bool ValidateInt32Percent(const char* flagname, int32_t value) {
884 if (value <= 0 || value >= 100) {
885     fprintf(stderr, "Invalid value for --%s: %d, must be 0 < pct < 100\n", flagname,
886 value);
887 return false;
888 }
889 return true;
890 }
891 DEFINE_int32(readwritepercent, 90,
892 "Ratio of reads to reads/writes (expressed as percentage) for "
893 "the ReadRandomWriteRandom workload. The default value 90 means "
894 "90% operations out of all reads and writes operations are "
895 "reads. In other words, 9 gets for every 1 put.");
896
897 DEFINE_int32(mergereadpercent, 70,
898 "Ratio of merges to merges&reads (expressed as percentage) for "
899 "the ReadRandomMergeRandom workload. The default value 70 means "
900 "70% out of all read and merge operations are merges. In other "
901 "words, 7 merges for every 3 gets.");
902
903 DEFINE_int32(deletepercent, 2,
904 "Percentage of deletes out of reads/writes/deletes (used in "
905 "RandomWithVerify only). RandomWithVerify "
906 "calculates writepercent as (100 - FLAGS_readwritepercent - "
907 "deletepercent), so deletepercent must be smaller than (100 - "
908 "FLAGS_readwritepercent)");
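// Editor's note (worked example): with the defaults above, RandomWithVerify
// computes writepercent = 100 - readwritepercent - deletepercent
//                       = 100 - 90 - 2 = 8,
// i.e. roughly 90% gets, 2% deletes and 8% puts.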
909
910 DEFINE_bool(optimize_filters_for_hits,
911 ROCKSDB_NAMESPACE::Options().optimize_filters_for_hits,
912     "Optimizes bloom filters for workloads where most lookups return "
913 "a value. For now this doesn't create bloom filters for the max "
914 "level of the LSM to reduce metadata that should fit in RAM. ");
915
916 DEFINE_bool(paranoid_checks, ROCKSDB_NAMESPACE::Options().paranoid_checks,
917 "RocksDB will aggressively check consistency of the data.");
918
919 DEFINE_bool(force_consistency_checks,
920 ROCKSDB_NAMESPACE::Options().force_consistency_checks,
921 "Runs consistency checks on the LSM every time a change is "
922 "applied.");
923
924 DEFINE_bool(check_flush_compaction_key_order,
925 ROCKSDB_NAMESPACE::Options().check_flush_compaction_key_order,
926 "During flush or compaction, check whether keys inserted to "
927 "output files are in order.");
928
929 DEFINE_uint64(delete_obsolete_files_period_micros, 0,
930 "Ignored. Left here for backward compatibility");
931
932 DEFINE_int64(writes_before_delete_range, 0,
933 "Number of writes before DeleteRange is called regularly.");
934
935 DEFINE_int64(writes_per_range_tombstone, 0,
936 "Number of writes between range tombstones");
937
938 DEFINE_int64(range_tombstone_width, 100, "Number of keys in tombstone's range");
939
940 DEFINE_int64(max_num_range_tombstones, 0,
941 "Maximum number of range tombstones to insert.");
942
943 DEFINE_bool(expand_range_tombstones, false,
944 "Expand range tombstone into sequential regular tombstones.");
945
946 #ifndef ROCKSDB_LITE
947 // Transactions Options
948 DEFINE_bool(optimistic_transaction_db, false,
949 "Open a OptimisticTransactionDB instance. "
950 "Required for randomtransaction benchmark.");
951
952 DEFINE_bool(transaction_db, false,
953 "Open a TransactionDB instance. "
954 "Required for randomtransaction benchmark.");
955
956 DEFINE_uint64(transaction_sets, 2,
957 "Number of keys each transaction will "
958 "modify (use in RandomTransaction only). Max: 9999");
959
960 DEFINE_bool(transaction_set_snapshot, false,
961 "Setting to true will have each transaction call SetSnapshot()"
962 " upon creation.");
963
964 DEFINE_int32(transaction_sleep, 0,
965 "Max microseconds to sleep in between "
966 "reading and writing a value (used in RandomTransaction only). ");
967
968 DEFINE_uint64(transaction_lock_timeout, 100,
969 "If using a transaction_db, specifies the lock wait timeout in"
970 " milliseconds before failing a transaction waiting on a lock");
971 DEFINE_string(
972 options_file, "",
973 "The path to a RocksDB options file. If specified, then db_bench will "
974 "run with the RocksDB options in the default column family of the "
975 "specified options file. "
976 "Note that with this setting, db_bench will ONLY accept the following "
977 "RocksDB options related command-line arguments, all other arguments "
978 "that are related to RocksDB options will be ignored:\n"
979 "\t--use_existing_db\n"
980 "\t--use_existing_keys\n"
981 "\t--statistics\n"
982 "\t--row_cache_size\n"
983 "\t--row_cache_numshardbits\n"
984 "\t--enable_io_prio\n"
985 "\t--dump_malloc_stats\n"
986 "\t--num_multi_db\n");
987
988 // FIFO Compaction Options
989 DEFINE_uint64(fifo_compaction_max_table_files_size_mb, 0,
990 "The limit of total table file sizes to trigger FIFO compaction");
991
992 DEFINE_bool(fifo_compaction_allow_compaction, true,
993 "Allow compaction in FIFO compaction.");
994
995 DEFINE_uint64(fifo_compaction_ttl, 0, "TTL for the SST Files in seconds.");
996
997 DEFINE_uint64(fifo_age_for_warm, 0, "age_for_warm for FIFO compaction.");
998
999 // Stacked BlobDB Options
1000 DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Open a BlobDB instance.");
1001
1002 DEFINE_bool(
1003 blob_db_enable_gc,
1004 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection,
1005 "[Stacked BlobDB] Enable BlobDB garbage collection.");
1006
1007 DEFINE_double(
1008 blob_db_gc_cutoff,
1009 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff,
1010 "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection.");
1011
1012 DEFINE_bool(blob_db_is_fifo,
1013 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().is_fifo,
1014 "[Stacked BlobDB] Enable FIFO eviction strategy in BlobDB.");
1015
1016 DEFINE_uint64(blob_db_max_db_size,
1017 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().max_db_size,
1018 "[Stacked BlobDB] Max size limit of the directory where blob "
1019 "files are stored.");
1020
1021 DEFINE_uint64(blob_db_max_ttl_range, 0,
1022 "[Stacked BlobDB] TTL range to generate BlobDB data (in "
1023 "seconds). 0 means no TTL.");
1024
1025 DEFINE_uint64(
1026 blob_db_ttl_range_secs,
1027 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().ttl_range_secs,
1028 "[Stacked BlobDB] TTL bucket size to use when creating blob files.");
1029
1030 DEFINE_uint64(
1031 blob_db_min_blob_size,
1032 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size,
1033 "[Stacked BlobDB] Smallest blob to store in a file. Blobs "
1034 "smaller than this will be inlined with the key in the LSM tree.");
1035
1036 DEFINE_uint64(blob_db_bytes_per_sync,
1037 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync,
1038 "[Stacked BlobDB] Bytes to sync blob file at.");
1039
1040 DEFINE_uint64(blob_db_file_size,
1041 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size,
1042 "[Stacked BlobDB] Target size of each blob file.");
1043
1044 DEFINE_string(
1045 blob_db_compression_type, "snappy",
1046 "[Stacked BlobDB] Algorithm to use to compress blobs in blob files.");
1047 static enum ROCKSDB_NAMESPACE::CompressionType
1048 FLAGS_blob_db_compression_type_e = ROCKSDB_NAMESPACE::kSnappyCompression;
1049
1050 #endif // ROCKSDB_LITE
1051
1052 // Integrated BlobDB options
1053 DEFINE_bool(
1054 enable_blob_files,
1055 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().enable_blob_files,
1056 "[Integrated BlobDB] Enable writing large values to separate blob files.");
1057
1058 DEFINE_uint64(min_blob_size,
1059 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().min_blob_size,
1060 "[Integrated BlobDB] The size of the smallest value to be stored "
1061 "separately in a blob file.");
1062
1063 DEFINE_uint64(blob_file_size,
1064 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_size,
1065 "[Integrated BlobDB] The size limit for blob files.");
1066
1067 DEFINE_string(blob_compression_type, "none",
1068 "[Integrated BlobDB] The compression algorithm to use for large "
1069 "values stored in blob files.");
1070
1071 DEFINE_bool(enable_blob_garbage_collection,
1072 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
1073 .enable_blob_garbage_collection,
1074 "[Integrated BlobDB] Enable blob garbage collection.");
1075
1076 DEFINE_double(blob_garbage_collection_age_cutoff,
1077 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
1078 .blob_garbage_collection_age_cutoff,
1079 "[Integrated BlobDB] The cutoff in terms of blob file age for "
1080 "garbage collection.");
1081
1082 DEFINE_double(blob_garbage_collection_force_threshold,
1083 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
1084 .blob_garbage_collection_force_threshold,
1085 "[Integrated BlobDB] The threshold for the ratio of garbage in "
1086 "the oldest blob files for forcing garbage collection.");
1087
1088 DEFINE_uint64(blob_compaction_readahead_size,
1089 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
1090 .blob_compaction_readahead_size,
1091 "[Integrated BlobDB] Compaction readahead for blob files.");
1092
1093 DEFINE_int32(
1094 blob_file_starting_level,
1095 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_starting_level,
1096 "[Integrated BlobDB] The starting level for blob files.");
1097
1098 DEFINE_bool(use_blob_cache, false, "[Integrated BlobDB] Enable blob cache.");
1099
1100 DEFINE_bool(
1101 use_shared_block_and_blob_cache, true,
1102 "[Integrated BlobDB] Use a shared backing cache for both block "
1103 "cache and blob cache. It only takes effect if use_blob_cache is enabled.");
1104
1105 DEFINE_uint64(
1106 blob_cache_size, 8 << 20,
1107 "[Integrated BlobDB] Number of bytes to use as a cache of blobs. It only "
1108 "takes effect if the block and blob caches are different "
1109 "(use_shared_block_and_blob_cache = false).");
1110
1111 DEFINE_int32(blob_cache_numshardbits, 6,
1112 "[Integrated BlobDB] Number of shards for the blob cache is 2 ** "
1113 "blob_cache_numshardbits. Negative means use default settings. "
1114 "It only takes effect if blob_cache_size is greater than 0, and "
1115 "the block and blob caches are different "
1116 "(use_shared_block_and_blob_cache = false).");
1117
1118 DEFINE_int32(prepopulate_blob_cache, 0,
1119 "[Integrated BlobDB] Pre-populate hot/warm blobs in blob cache. 0 "
1120 "to disable and 1 to insert during flush.");
1121
1122 #ifndef ROCKSDB_LITE
1123
1124 // Secondary DB instance Options
1125 DEFINE_bool(use_secondary_db, false,
1126 "Open a RocksDB secondary instance. A primary instance can be "
1127 "running in another db_bench process.");
1128
1129 DEFINE_string(secondary_path, "",
1130 "Path to a directory used by the secondary instance to store "
1131 "private files, e.g. info log.");
1132
1133 DEFINE_int32(secondary_update_interval, 5,
1134 "Secondary instance attempts to catch up with the primary every "
1135 "secondary_update_interval seconds.");
1136
1137 #endif // ROCKSDB_LITE
1138
1139 DEFINE_bool(report_bg_io_stats, false,
1140     "Measure time spent on I/Os while in compactions. ");
1141
1142 DEFINE_bool(use_stderr_info_logger, false,
1143 "Write info logs to stderr instead of to LOG file. ");
1144
1145 #ifndef ROCKSDB_LITE
1146
1147 DEFINE_string(trace_file, "", "Trace workload to a file. ");
1148
1149 DEFINE_double(trace_replay_fast_forward, 1.0,
1150     "Fast forward trace replay, must be > 0.0.");
1151 DEFINE_int32(block_cache_trace_sampling_frequency, 1,
1152 "Block cache trace sampling frequency, termed s. It uses spatial "
1153 "downsampling and samples accesses to one out of s blocks.");
1154 DEFINE_int64(
1155 block_cache_trace_max_trace_file_size_in_bytes,
1156 uint64_t{64} * 1024 * 1024 * 1024,
1157 "The maximum block cache trace file size in bytes. Block cache accesses "
1158 "will not be logged if the trace file size exceeds this threshold. Default "
1159 "is 64 GB.");
1160 DEFINE_string(block_cache_trace_file, "", "Block cache trace file path.");
1161 DEFINE_int32(trace_replay_threads, 1,
1162     "The number of threads to replay, must be >= 1.");
1163
1164 DEFINE_bool(io_uring_enabled, true,
1165 "If true, enable the use of IO uring if the platform supports it");
1166 extern "C" bool RocksDbIOUringEnable() { return FLAGS_io_uring_enabled; }
1167 #endif // ROCKSDB_LITE
1168
1169 DEFINE_bool(adaptive_readahead, false,
1170 "carry forward internal auto readahead size from one file to next "
1171 "file at each level during iteration");
1172
1173 DEFINE_bool(rate_limit_user_ops, false,
1174 "When true use Env::IO_USER priority level to charge internal rate "
1175 "limiter for reads associated with user operations.");
1176
1177 DEFINE_bool(file_checksum, false,
1178 "When true use FileChecksumGenCrc32cFactory for "
1179 "file_checksum_gen_factory.");
1180
1181 DEFINE_bool(rate_limit_auto_wal_flush, false,
1182 "When true use Env::IO_USER priority level to charge internal rate "
1183 "limiter for automatic WAL flush (`Options::manual_wal_flush` == "
1184 "false) after the user write operation.");
1185
1186 DEFINE_bool(async_io, false,
1187 "When set true, RocksDB does asynchronous reads for internal auto "
1188 "readahead prefetching.");
1189
1190 DEFINE_bool(optimize_multiget_for_io, true,
1191 "When set true, RocksDB does asynchronous reads for SST files in "
1192 "multiple levels for MultiGet.");
1193
1194 DEFINE_bool(charge_compression_dictionary_building_buffer, false,
1195 "Setting for "
1196 "CacheEntryRoleOptions::charged of "
1197 "CacheEntryRole::kCompressionDictionaryBuildingBuffer");
1198
1199 DEFINE_bool(charge_filter_construction, false,
1200 "Setting for "
1201 "CacheEntryRoleOptions::charged of "
1202 "CacheEntryRole::kFilterConstruction");
1203
1204 DEFINE_bool(charge_table_reader, false,
1205 "Setting for "
1206 "CacheEntryRoleOptions::charged of "
1207 "CacheEntryRole::kBlockBasedTableReader");
1208
1209 DEFINE_bool(charge_file_metadata, false,
1210 "Setting for "
1211 "CacheEntryRoleOptions::charged of "
1212 "CacheEntryRole::kFileMetadata");
1213
1214 DEFINE_bool(charge_blob_cache, false,
1215 "Setting for "
1216 "CacheEntryRoleOptions::charged of "
1217 "CacheEntryRole::kBlobCache");
1218
1219 DEFINE_uint64(backup_rate_limit, 0ull,
1220 "If non-zero, db_bench will rate limit reads and writes for DB "
1221 "backup. This "
1222 "is the global rate in ops/second.");
1223
1224 DEFINE_uint64(restore_rate_limit, 0ull,
1225 "If non-zero, db_bench will rate limit reads and writes for DB "
1226 "restore. This "
1227 "is the global rate in ops/second.");
1228
1229 DEFINE_string(backup_dir, "",
1230 "If not empty string, use the given dir for backup.");
1231
1232 DEFINE_string(restore_dir, "",
1233 "If not empty string, use the given dir for restore.");
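// Editor's sketch (hypothetical paths and rates): the backup/restore
// benchmarks named in --benchmarks combine with the four flags above, e.g.
//
//   ./db_bench --benchmarks=fillseq,backup,restore \
//       --backup_dir=/path/to/backup --restore_dir=/path/to/restore \
//       --backup_rate_limit=10000 --restore_rate_limit=10000
//
// The rate limits are expressed in ops/second per their descriptions above.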
1234
1235 DEFINE_uint64(
1236 initial_auto_readahead_size,
1237 ROCKSDB_NAMESPACE::BlockBasedTableOptions().initial_auto_readahead_size,
1238 "RocksDB does auto-readahead for iterators on noticing more than two reads "
1239 "for a table file if user doesn't provide readahead_size. The readahead "
1240 "size starts at initial_auto_readahead_size");
1241
1242 DEFINE_uint64(
1243 max_auto_readahead_size,
1244 ROCKSDB_NAMESPACE::BlockBasedTableOptions().max_auto_readahead_size,
1245 "Rocksdb implicit readahead starts at "
1246 "BlockBasedTableOptions.initial_auto_readahead_size and doubles on every "
1247     "additional read up to max_auto_readahead_size");
1248
1249 DEFINE_uint64(
1250 num_file_reads_for_auto_readahead,
1251 ROCKSDB_NAMESPACE::BlockBasedTableOptions()
1252 .num_file_reads_for_auto_readahead,
1253 "Rocksdb implicit readahead is enabled if reads are sequential and "
1254 "num_file_reads_for_auto_readahead indicates after how many sequential "
1255     "reads into that file internal auto prefetching should start.");
1256
1257 static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType(
1258 const char* ctype) {
1259 assert(ctype);
1260
1261 if (!strcasecmp(ctype, "none"))
1262 return ROCKSDB_NAMESPACE::kNoCompression;
1263 else if (!strcasecmp(ctype, "snappy"))
1264 return ROCKSDB_NAMESPACE::kSnappyCompression;
1265 else if (!strcasecmp(ctype, "zlib"))
1266 return ROCKSDB_NAMESPACE::kZlibCompression;
1267 else if (!strcasecmp(ctype, "bzip2"))
1268 return ROCKSDB_NAMESPACE::kBZip2Compression;
1269 else if (!strcasecmp(ctype, "lz4"))
1270 return ROCKSDB_NAMESPACE::kLZ4Compression;
1271 else if (!strcasecmp(ctype, "lz4hc"))
1272 return ROCKSDB_NAMESPACE::kLZ4HCCompression;
1273 else if (!strcasecmp(ctype, "xpress"))
1274 return ROCKSDB_NAMESPACE::kXpressCompression;
1275 else if (!strcasecmp(ctype, "zstd"))
1276 return ROCKSDB_NAMESPACE::kZSTD;
1277 else {
1278 fprintf(stderr, "Cannot parse compression type '%s'\n", ctype);
1279 exit(1);
1280 }
1281 }
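// Editor's note: this helper is how string-valued flags are mapped onto the
// CompressionType enum when options are assembled. A sketch of the intended
// use (FLAGS_compression_type and FLAGS_compression_type_e are defined just
// below):
//
//   FLAGS_compression_type_e =
//       StringToCompressionType(FLAGS_compression_type.c_str());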
1282
1283 static std::string ColumnFamilyName(size_t i) {
1284 if (i == 0) {
1285 return ROCKSDB_NAMESPACE::kDefaultColumnFamilyName;
1286 } else {
1287 char name[100];
1288 snprintf(name, sizeof(name), "column_family_name_%06zu", i);
1289 return std::string(name);
1290 }
1291 }
1292
1293 DEFINE_string(compression_type, "snappy",
1294 "Algorithm to use to compress the database");
1295 static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_compression_type_e =
1296 ROCKSDB_NAMESPACE::kSnappyCompression;
1297
1298 DEFINE_int64(sample_for_compression, 0, "Sample every N block for compression");
1299
1300 DEFINE_int32(compression_level, ROCKSDB_NAMESPACE::CompressionOptions().level,
1301 "Compression level. The meaning of this value is library-"
1302 "dependent. If unset, we try to use the default for the library "
1303 "specified in `--compression_type`");
1304
1305 DEFINE_int32(compression_max_dict_bytes,
1306 ROCKSDB_NAMESPACE::CompressionOptions().max_dict_bytes,
1307 "Maximum size of dictionary used to prime the compression "
1308 "library.");
1309
1310 DEFINE_int32(compression_zstd_max_train_bytes,
1311 ROCKSDB_NAMESPACE::CompressionOptions().zstd_max_train_bytes,
1312 "Maximum size of training data passed to zstd's dictionary "
1313 "trainer.");
1314
1315 DEFINE_int32(min_level_to_compress, -1,
1316 "If non-negative, compression starts"
1317 " from this level. Levels with number < min_level_to_compress are"
1318 " not compressed. Otherwise, apply compression_type to "
1319 "all levels.");
1320
1321 DEFINE_int32(compression_parallel_threads, 1,
1322 "Number of threads for parallel compression.");
1323
1324 DEFINE_uint64(compression_max_dict_buffer_bytes,
1325 ROCKSDB_NAMESPACE::CompressionOptions().max_dict_buffer_bytes,
1326 "Maximum bytes to buffer to collect samples for dictionary.");
1327
1328 DEFINE_bool(compression_use_zstd_dict_trainer,
1329 ROCKSDB_NAMESPACE::CompressionOptions().use_zstd_dict_trainer,
1330 "If true, use ZSTD_TrainDictionary() to create dictionary, else"
1331     " use ZSTD_FinalizeDictionary() to create dictionary");
1332
1333 static bool ValidateTableCacheNumshardbits(const char* flagname,
1334 int32_t value) {
1335 if (0 >= value || value >= 20) {
1336 fprintf(stderr, "Invalid value for --%s: %d, must be 0 < val < 20\n",
1337 flagname, value);
1338 return false;
1339 }
1340 return true;
1341 }
1342 DEFINE_int32(table_cache_numshardbits, 4, "");
1343
1344 #ifndef ROCKSDB_LITE
1345 DEFINE_string(env_uri, "",
1346 "URI for registry Env lookup. Mutually exclusive with --fs_uri");
1347 DEFINE_string(fs_uri, "",
1348 "URI for registry Filesystem lookup. Mutually exclusive"
1349 " with --env_uri."
1350 " Creates a default environment with the specified filesystem.");
1351 #endif // ROCKSDB_LITE
1352 DEFINE_string(simulate_hybrid_fs_file, "",
1353     "File to store metadata for the simulated hybrid FS. Empty means "
1354 "disable the feature. Now, if it is set, last_level_temperature "
1355 "is set to kWarm.");
1356 DEFINE_int32(simulate_hybrid_hdd_multipliers, 1,
1357 "In simulate_hybrid_fs_file or simulate_hdd mode, how many HDDs "
1358 "are simulated.");
1359 DEFINE_bool(simulate_hdd, false, "Simulate read/write latency on HDD.");
1360
1361 DEFINE_int64(
1362 preclude_last_level_data_seconds, 0,
1363 "Preclude the latest data from the last level. (Used for tiered storage)");
1364
1365 DEFINE_int64(preserve_internal_time_seconds, 0,
1366     "Preserve the internal time information that is stored with the SST.");
1367
1368 static std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard;
1369
1370 static ROCKSDB_NAMESPACE::Env* FLAGS_env = ROCKSDB_NAMESPACE::Env::Default();
1371
1372 DEFINE_int64(stats_interval, 0,
1373 "Stats are reported every N operations when this is greater than "
1374 "zero. When 0 the interval grows over time.");
1375
1376 DEFINE_int64(stats_interval_seconds, 0,
1377 "Report stats every N seconds. This overrides stats_interval when"
1378 " both are > 0.");
1379
1380 DEFINE_int32(stats_per_interval, 0,
1381 "Reports additional stats per interval when this is greater than "
1382 "0.");
1383
1384 DEFINE_uint64(slow_usecs, 1000000,
1385 "A message is printed for operations that take at least this "
1386 "many microseconds.");
1387
1388 DEFINE_int64(report_interval_seconds, 0,
1389 "If greater than zero, it will write simple stats in CSV format "
1390 "to --report_file every N seconds");
1391
1392 DEFINE_string(report_file, "report.csv",
1393 "Filename where some simple stats are reported to (if "
1394 "--report_interval_seconds is bigger than 0)");
1395
1396 DEFINE_int32(thread_status_per_interval, 0,
1397     "Takes and reports a snapshot of the current status of each thread"
1398 " when this is greater than 0.");
1399
1400 DEFINE_int32(perf_level, ROCKSDB_NAMESPACE::PerfLevel::kDisable,
1401 "Level of perf collection");
1402
1403 DEFINE_uint64(soft_pending_compaction_bytes_limit, 64ull * 1024 * 1024 * 1024,
1404 "Slowdown writes if pending compaction bytes exceed this number");
1405
1406 DEFINE_uint64(hard_pending_compaction_bytes_limit, 128ull * 1024 * 1024 * 1024,
1407 "Stop writes if pending compaction bytes exceed this number");
1408
1409 DEFINE_uint64(delayed_write_rate, 8388608u,
1410 "Limited bytes allowed to DB when soft_rate_limit or "
1411 "level0_slowdown_writes_trigger triggers");
1412
1413 DEFINE_bool(enable_pipelined_write, true,
1414 "Allow WAL and memtable writes to be pipelined");
1415
1416 DEFINE_bool(
1417 unordered_write, false,
1418 "Enable the unordered write feature, which provides higher throughput but "
1419 "relaxes the guarantees around atomic reads and immutable snapshots");
1420
1421 DEFINE_bool(allow_concurrent_memtable_write, true,
1422 "Allow multi-writers to update mem tables in parallel.");
1423
1424 DEFINE_double(experimental_mempurge_threshold, 0.0,
1425 "Maximum useful payload ratio estimate that triggers a mempurge "
1426 "(memtable garbage collection).");
1427
1428 DEFINE_bool(inplace_update_support,
1429 ROCKSDB_NAMESPACE::Options().inplace_update_support,
1430 "Support in-place memtable update for smaller or same-size values");
1431
1432 DEFINE_uint64(inplace_update_num_locks,
1433 ROCKSDB_NAMESPACE::Options().inplace_update_num_locks,
1434 "Number of RW locks to protect in-place memtable updates");
1435
1436 DEFINE_bool(enable_write_thread_adaptive_yield, true,
1437 "Use a yielding spin loop for brief writer thread waits.");
1438
1439 DEFINE_uint64(
1440 write_thread_max_yield_usec, 100,
1441 "Maximum microseconds for enable_write_thread_adaptive_yield operation.");
1442
1443 DEFINE_uint64(write_thread_slow_yield_usec, 3,
1444 "The threshold at which a slow yield is considered a signal that "
1445 "other processes or threads want the core.");
1446
1447 DEFINE_uint64(rate_limiter_bytes_per_sec, 0, "Set options.rate_limiter value.");
1448
1449 DEFINE_int64(rate_limiter_refill_period_us, 100 * 1000,
1450 "Set refill period on rate limiter.");
1451
1452 DEFINE_bool(rate_limiter_auto_tuned, false,
1453 "Enable dynamic adjustment of rate limit according to demand for "
1454 "background I/O");
1455
1456 DEFINE_bool(sine_write_rate, false, "Use a sine wave write_rate_limit");
1457
1458 DEFINE_uint64(
1459 sine_write_rate_interval_milliseconds, 10000,
1460 "Interval of which the sine wave write_rate_limit is recalculated");
1461
1462 DEFINE_double(sine_a, 1, "A in f(x) = A sin(bx + c) + d");
1463
1464 DEFINE_double(sine_b, 1, "B in f(x) = A sin(bx + c) + d");
1465
1466 DEFINE_double(sine_c, 0, "C in f(x) = A sin(bx + c) + d");
1467
1468 DEFINE_double(sine_d, 1, "D in f(x) = A sin(bx + c) + d");
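// Illustrative note (not part of the original flag help): taken together,
// these flags describe a time-varying target rate of the form
//   f(x) = sine_a * sin(sine_b * x + sine_c) + sine_d,
// where x is derived from elapsed time. For example, with sine_a = 1000 and
// sine_d = 5000, the target rate oscillates roughly between 4000 and 6000
// (in the units of the rate being limited).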
1469
1470 DEFINE_bool(rate_limit_bg_reads, false,
1471 "Use options.rate_limiter on compaction reads");
1472
1473 DEFINE_uint64(
1474 benchmark_write_rate_limit, 0,
1475 "If non-zero, db_bench will rate-limit the writes going into RocksDB. This "
1476 "is the global rate in bytes/second.");
1477
1478 // the parameters of mix_graph
1479 DEFINE_double(keyrange_dist_a, 0.0,
1480 "The parameter 'a' of prefix average access distribution "
1481 "f(x)=a*exp(b*x)+c*exp(d*x)");
1482 DEFINE_double(keyrange_dist_b, 0.0,
1483 "The parameter 'b' of prefix average access distribution "
1484 "f(x)=a*exp(b*x)+c*exp(d*x)");
1485 DEFINE_double(keyrange_dist_c, 0.0,
1486 "The parameter 'c' of prefix average access distribution"
1487 "f(x)=a*exp(b*x)+c*exp(d*x)");
1488 DEFINE_double(keyrange_dist_d, 0.0,
1489 "The parameter 'd' of prefix average access distribution"
1490 "f(x)=a*exp(b*x)+c*exp(d*x)");
1491 DEFINE_int64(keyrange_num, 1,
1492 "The number of key ranges that are in the same prefix "
1493 "group, each prefix range will have its key access distribution");
1494 DEFINE_double(key_dist_a, 0.0,
1495 "The parameter 'a' of key access distribution model f(x)=a*x^b");
1496 DEFINE_double(key_dist_b, 0.0,
1497 "The parameter 'b' of key access distribution model f(x)=a*x^b");
1498 DEFINE_double(value_theta, 0.0,
1499 "The parameter 'theta' of Generized Pareto Distribution "
1500 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1501 // Use reasonable defaults based on the mixgraph paper
1502 DEFINE_double(value_k, 0.2615,
1503 "The parameter 'k' of Generized Pareto Distribution "
1504 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1505 // Use reasonable defaults based on the mixgraph paper
1506 DEFINE_double(value_sigma, 25.45,
1507 "The parameter 'theta' of Generized Pareto Distribution "
1508 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1509 DEFINE_double(iter_theta, 0.0,
1510 "The parameter 'theta' of Generized Pareto Distribution "
1511 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1512 // Use reasonable defaults based on the mixgraph paper
1513 DEFINE_double(iter_k, 2.517,
1514 "The parameter 'k' of Generized Pareto Distribution "
1515 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1516 // Use reasonable defaults based on the mixgraph paper
1517 DEFINE_double(iter_sigma, 14.236,
1518 "The parameter 'sigma' of Generized Pareto Distribution "
1519 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1520 DEFINE_double(mix_get_ratio, 1.0,
1521 "The ratio of Get queries of mix_graph workload");
1522 DEFINE_double(mix_put_ratio, 0.0,
1523 "The ratio of Put queries of mix_graph workload");
1524 DEFINE_double(mix_seek_ratio, 0.0,
1525 "The ratio of Seek queries of mix_graph workload");
1526 DEFINE_int64(mix_max_scan_len, 10000, "The max scan length of Iterator");
1527 DEFINE_int64(mix_max_value_size, 1024, "The max value size of this workload");
1528 DEFINE_double(
1529 sine_mix_rate_noise, 0.0,
1530 "Add the noise ratio to the sine rate, it is between 0.0 and 1.0");
1531 DEFINE_bool(sine_mix_rate, false,
1532 "Enable the sine QPS control on the mix workload");
1533 DEFINE_uint64(
1534 sine_mix_rate_interval_milliseconds, 10000,
1535 "Interval of which the sine wave read_rate_limit is recalculated");
1536 DEFINE_int64(mix_accesses, -1,
1537 "The total query accesses of mix_graph workload");
1538
1539 DEFINE_uint64(
1540 benchmark_read_rate_limit, 0,
1541 "If non-zero, db_bench will rate-limit the reads from RocksDB. This "
1542 "is the global rate in ops/second.");
1543
1544 DEFINE_uint64(max_compaction_bytes,
1545 ROCKSDB_NAMESPACE::Options().max_compaction_bytes,
1546 "Max bytes allowed in one compaction");
1547
1548 #ifndef ROCKSDB_LITE
1549 DEFINE_bool(readonly, false, "Run read only benchmarks.");
1550
1551 DEFINE_bool(print_malloc_stats, false,
1552 "Print malloc stats to stdout after benchmarks finish.");
1553 #endif // ROCKSDB_LITE
1554
1555 DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions");
1556
1557 DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds.");
1558 DEFINE_uint64(wal_size_limit_MB, 0,
1559 "Set the size limit for the WAL Files in MB.");
1560 DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size");
1561
1562 DEFINE_bool(mmap_read, ROCKSDB_NAMESPACE::Options().allow_mmap_reads,
1563 "Allow reads to occur via mmap-ing files");
1564
1565 DEFINE_bool(mmap_write, ROCKSDB_NAMESPACE::Options().allow_mmap_writes,
1566 "Allow writes to occur via mmap-ing files");
1567
1568 DEFINE_bool(use_direct_reads, ROCKSDB_NAMESPACE::Options().use_direct_reads,
1569 "Use O_DIRECT for reading data");
1570
1571 DEFINE_bool(use_direct_io_for_flush_and_compaction,
1572 ROCKSDB_NAMESPACE::Options().use_direct_io_for_flush_and_compaction,
1573 "Use O_DIRECT for background flush and compaction writes");
1574
1575 DEFINE_bool(advise_random_on_open,
1576 ROCKSDB_NAMESPACE::Options().advise_random_on_open,
1577 "Advise random access on table file open");
1578
1579 DEFINE_string(compaction_fadvice, "NORMAL",
1580 "Access pattern advice when a file is compacted");
1581 static auto FLAGS_compaction_fadvice_e =
1582 ROCKSDB_NAMESPACE::Options().access_hint_on_compaction_start;
1583
1584 DEFINE_bool(use_tailing_iterator, false,
1585 "Use tailing iterator to access a series of keys instead of get");
1586
1587 DEFINE_bool(use_adaptive_mutex, ROCKSDB_NAMESPACE::Options().use_adaptive_mutex,
1588 "Use adaptive mutex");
1589
1590 DEFINE_uint64(bytes_per_sync, ROCKSDB_NAMESPACE::Options().bytes_per_sync,
1591 "Allows OS to incrementally sync SST files to disk while they are"
1592 " being written, in the background. Issue one request for every"
1593 " bytes_per_sync written. 0 turns it off.");
1594
1595 DEFINE_uint64(wal_bytes_per_sync,
1596 ROCKSDB_NAMESPACE::Options().wal_bytes_per_sync,
1597 "Allows OS to incrementally sync WAL files to disk while they are"
1598 " being written, in the background. Issue one request for every"
1599 " wal_bytes_per_sync written. 0 turns it off.");
1600
1601 DEFINE_bool(use_single_deletes, true,
1602 "Use single deletes (used in RandomReplaceKeys only).");
1603
1604 DEFINE_double(stddev, 2000.0,
1605 "Standard deviation of normal distribution used for picking keys"
1606 " (used in RandomReplaceKeys only).");
1607
1608 DEFINE_int32(key_id_range, 100000,
1609 "Range of possible value of key id (used in TimeSeries only).");
1610
1611 DEFINE_string(expire_style, "none",
1612 "Style to remove expired time entries. Can be one of the options "
1613 "below: none (do not expired data), compaction_filter (use a "
1614 "compaction filter to remove expired data), delete (seek IDs and "
1615 "remove expired data) (used in TimeSeries only).");
1616
1617 DEFINE_uint64(
1618 time_range, 100000,
1619 "Range of timestamp that store in the database (used in TimeSeries"
1620 " only).");
1621
1622 DEFINE_int32(num_deletion_threads, 1,
1623 "Number of threads to do deletion (used in TimeSeries and delete "
1624 "expire_style only).");
1625
1626 DEFINE_int32(max_successive_merges, 0,
1627 "Maximum number of successive merge operations on a key in the "
1628 "memtable");
1629
1630 static bool ValidatePrefixSize(const char* flagname, int32_t value) {
1631 if (value < 0 || value >= 2000000000) {
1632 fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n",
1633 flagname, value);
1634 return false;
1635 }
1636 return true;
1637 }
1638
1639 DEFINE_int32(prefix_size, 0,
1640 "control the prefix size for HashSkipList and plain table");
1641 DEFINE_int64(keys_per_prefix, 0,
1642 "control average number of keys generated per prefix, 0 means no "
1643 "special handling of the prefix, i.e. use the prefix comes with "
1644 "the generated random number.");
1645 DEFINE_bool(total_order_seek, false,
1646 "Enable total order seek regardless of index format.");
1647 DEFINE_bool(prefix_same_as_start, false,
1648 "Enforce iterator to return keys with prefix same as seek key.");
1649 DEFINE_bool(
1650 seek_missing_prefix, false,
1651 "Iterator seek to keys with non-exist prefixes. Require prefix_size > 8");
1652
1653 DEFINE_int32(memtable_insert_with_hint_prefix_size, 0,
1654 "If non-zero, enable "
1655 "memtable insert with hint with the given prefix size.");
1656 DEFINE_bool(enable_io_prio, false,
1657 "Lower the background flush/compaction threads' IO priority");
1658 DEFINE_bool(enable_cpu_prio, false,
1659 "Lower the background flush/compaction threads' CPU priority");
1660 DEFINE_bool(identity_as_first_hash, false,
1661 "the first hash function of cuckoo table becomes an identity "
1662 "function. This is only valid when key is 8 bytes");
1663 DEFINE_bool(dump_malloc_stats, true, "Dump malloc stats in LOG ");
1664 DEFINE_uint64(stats_dump_period_sec,
1665 ROCKSDB_NAMESPACE::Options().stats_dump_period_sec,
1666 "Gap between printing stats to log in seconds");
1667 DEFINE_uint64(stats_persist_period_sec,
1668 ROCKSDB_NAMESPACE::Options().stats_persist_period_sec,
1669 "Gap between persisting stats in seconds");
1670 DEFINE_bool(persist_stats_to_disk,
1671 ROCKSDB_NAMESPACE::Options().persist_stats_to_disk,
1672 "whether to persist stats to disk");
1673 DEFINE_uint64(stats_history_buffer_size,
1674 ROCKSDB_NAMESPACE::Options().stats_history_buffer_size,
1675 "Max number of stats snapshots to keep in memory");
1676 DEFINE_bool(avoid_flush_during_recovery,
1677 ROCKSDB_NAMESPACE::Options().avoid_flush_during_recovery,
1678 "If true, avoids flushing the recovered WAL data where possible.");
1679 DEFINE_int64(multiread_stride, 0,
1680 "Stride length for the keys in a MultiGet batch");
1681 DEFINE_bool(multiread_batched, false, "Use the new MultiGet API");
1682
1683 DEFINE_string(memtablerep, "skip_list", "");
1684 DEFINE_int64(hash_bucket_count, 1024 * 1024, "hash bucket count");
1685 DEFINE_bool(use_plain_table, false,
1686 "if use plain table instead of block-based table format");
1687 DEFINE_bool(use_cuckoo_table, false, "If true, use the cuckoo table format");
1688 DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table.");
1689 DEFINE_bool(use_hash_search, false,
1690 "if use kHashSearch instead of kBinarySearch. "
1691 "This is valid if only we use BlockTable");
1692 DEFINE_string(merge_operator, "",
1693 "The merge operator to use with the database."
1694 "If a new merge operator is specified, be sure to use fresh"
1695 " database The possible merge operators are defined in"
1696 " utilities/merge_operators.h");
1697 DEFINE_int32(skip_list_lookahead, 0,
1698 "Used with skip_list memtablerep; try linear search first for "
1699 "this many steps from the previous position");
1700 DEFINE_bool(report_file_operations, false,
1701 "if report number of file operations");
1702 DEFINE_bool(report_open_timing, false, "if report open timing");
1703 DEFINE_int32(readahead_size, 0, "Iterator readahead size");
1704
1705 DEFINE_bool(read_with_latest_user_timestamp, true,
1706 "If true, always use the current latest timestamp for read. If "
1707 "false, choose a random timestamp from the past.");
1708
1709 #ifndef ROCKSDB_LITE
1710 DEFINE_string(secondary_cache_uri, "",
1711 "Full URI for creating a custom secondary cache object");
1712 static std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
1713 #endif // ROCKSDB_LITE
1714
1715 static const bool FLAGS_prefix_size_dummy __attribute__((__unused__)) =
1716 RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
1717
1718 static const bool FLAGS_key_size_dummy __attribute__((__unused__)) =
1719 RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize);
1720
1721 static const bool FLAGS_cache_numshardbits_dummy __attribute__((__unused__)) =
1722 RegisterFlagValidator(&FLAGS_cache_numshardbits,
1723 &ValidateCacheNumshardbits);
1724
1725 static const bool FLAGS_readwritepercent_dummy __attribute__((__unused__)) =
1726 RegisterFlagValidator(&FLAGS_readwritepercent, &ValidateInt32Percent);
1727
1728 DEFINE_int32(disable_seek_compaction, 0,
1729 "Not used, left here for backwards compatibility");
1730
1731 DEFINE_bool(allow_data_in_errors,
1732 ROCKSDB_NAMESPACE::Options().allow_data_in_errors,
1733 "If true, allow logging data, e.g. key, value in LOG files.");
1734
1735 static const bool FLAGS_deletepercent_dummy __attribute__((__unused__)) =
1736 RegisterFlagValidator(&FLAGS_deletepercent, &ValidateInt32Percent);
1737 static const bool FLAGS_table_cache_numshardbits_dummy
1738 __attribute__((__unused__)) = RegisterFlagValidator(
1739 &FLAGS_table_cache_numshardbits, &ValidateTableCacheNumshardbits);
1740
1741 DEFINE_uint32(write_batch_protection_bytes_per_key, 0,
1742 "Size of per-key-value checksum in each write batch. Currently "
1743 "only value 0 and 8 are supported.");
1744
1745 DEFINE_uint32(
1746 memtable_protection_bytes_per_key, 0,
1747 "Enable memtable per key-value checksum protection. "
1748 "Each entry in memtable will be suffixed by a per key-value checksum. "
1749 "This options determines the size of such checksums. "
1750 "Supported values: 0, 1, 2, 4, 8.");
1751
1752 DEFINE_bool(build_info, false,
1753 "Print the build info via GetRocksBuildInfoAsString");
1754
1755 DEFINE_bool(track_and_verify_wals_in_manifest, false,
1756 "If true, enable WAL tracking in the MANIFEST");
1757
1758 namespace ROCKSDB_NAMESPACE {
1759 namespace {
1760 static Status CreateMemTableRepFactory(
1761 const ConfigOptions& config_options,
1762 std::shared_ptr<MemTableRepFactory>* factory) {
1763 Status s;
1764 if (!strcasecmp(FLAGS_memtablerep.c_str(), SkipListFactory::kNickName())) {
1765 factory->reset(new SkipListFactory(FLAGS_skip_list_lookahead));
1766 #ifndef ROCKSDB_LITE
1767 } else if (!strcasecmp(FLAGS_memtablerep.c_str(), "prefix_hash")) {
1768 factory->reset(NewHashSkipListRepFactory(FLAGS_hash_bucket_count));
1769 } else if (!strcasecmp(FLAGS_memtablerep.c_str(),
1770 VectorRepFactory::kNickName())) {
1771 factory->reset(new VectorRepFactory());
1772 } else if (!strcasecmp(FLAGS_memtablerep.c_str(), "hash_linkedlist")) {
1773 factory->reset(NewHashLinkListRepFactory(FLAGS_hash_bucket_count));
1774 #endif // ROCKSDB_LITE
1775 } else {
1776 std::unique_ptr<MemTableRepFactory> unique;
1777 s = MemTableRepFactory::CreateFromString(config_options, FLAGS_memtablerep,
1778 &unique);
1779 if (s.ok()) {
1780 factory->reset(unique.release());
1781 }
1782 }
1783 return s;
1784 }
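// Illustrative usage sketch for the helper above (assumes a default
// ConfigOptions and an Options instance named options; this is how the
// factory is expected to feed Options::memtable_factory):
//
//   ConfigOptions config_options;
//   std::shared_ptr<MemTableRepFactory> factory;
//   Status s = CreateMemTableRepFactory(config_options, &factory);
//   if (s.ok()) {
//     options.memtable_factory = factory;
//   }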
1785
1786 } // namespace
1787
1788 enum DistributionType : unsigned char { kFixed = 0, kUniform, kNormal };
1789
1790 static enum DistributionType FLAGS_value_size_distribution_type_e = kFixed;
1791
1792 static enum DistributionType StringToDistributionType(const char* ctype) {
1793 assert(ctype);
1794
1795 if (!strcasecmp(ctype, "fixed"))
1796 return kFixed;
1797 else if (!strcasecmp(ctype, "uniform"))
1798 return kUniform;
1799 else if (!strcasecmp(ctype, "normal"))
1800 return kNormal;
1801
1802 fprintf(stdout, "Cannot parse distribution type '%s'\n", ctype);
1803 exit(1);
1804 }
1805
1806 class BaseDistribution {
1807 public:
1808 BaseDistribution(unsigned int _min, unsigned int _max)
1809 : min_value_size_(_min), max_value_size_(_max) {}
1810 virtual ~BaseDistribution() {}
1811
1812 unsigned int Generate() {
1813 auto val = Get();
1814 if (NeedTruncate()) {
1815 val = std::max(min_value_size_, val);
1816 val = std::min(max_value_size_, val);
1817 }
1818 return val;
1819 }
1820
1821 private:
1822 virtual unsigned int Get() = 0;
1823 virtual bool NeedTruncate() { return true; }
1824 unsigned int min_value_size_;
1825 unsigned int max_value_size_;
1826 };
1827
1828 class FixedDistribution : public BaseDistribution {
1829 public:
1830 FixedDistribution(unsigned int size)
1831 : BaseDistribution(size, size), size_(size) {}
1832
1833 private:
1834 virtual unsigned int Get() override { return size_; }
1835 virtual bool NeedTruncate() override { return false; }
1836 unsigned int size_;
1837 };
1838
1839 class NormalDistribution : public BaseDistribution,
1840 public std::normal_distribution<double> {
1841 public:
1842 NormalDistribution(unsigned int _min, unsigned int _max)
1843 : BaseDistribution(_min, _max),
1844 // 99.7% values within the range [min, max].
1845 std::normal_distribution<double>(
1846 (double)(_min + _max) / 2.0 /*mean*/,
1847 (double)(_max - _min) / 6.0 /*stddev*/),
1848 gen_(rd_()) {}
1849
1850 private:
1851 virtual unsigned int Get() override {
1852 return static_cast<unsigned int>((*this)(gen_));
1853 }
1854 std::random_device rd_;
1855 std::mt19937 gen_;
1856 };
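// Worked example for the constructor above: with _min = 100 and _max = 700
// the distribution uses mean (100 + 700) / 2 = 400 and stddev
// (700 - 100) / 6 = 100, so about 99.7% of generated sizes fall within three
// standard deviations, i.e. inside [100, 700]; the remaining tail is clamped
// by BaseDistribution::Generate().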
1857
1858 class UniformDistribution : public BaseDistribution,
1859 public std::uniform_int_distribution<unsigned int> {
1860 public:
1861 UniformDistribution(unsigned int _min, unsigned int _max)
1862 : BaseDistribution(_min, _max),
1863 std::uniform_int_distribution<unsigned int>(_min, _max),
1864 gen_(rd_()) {}
1865
1866 private:
1867 virtual unsigned int Get() override { return (*this)(gen_); }
1868 virtual bool NeedTruncate() override { return false; }
1869 std::random_device rd_;
1870 std::mt19937 gen_;
1871 };
1872
1873 // Helper for quickly generating random data.
1874 class RandomGenerator {
1875 private:
1876 std::string data_;
1877 unsigned int pos_;
1878 std::unique_ptr<BaseDistribution> dist_;
1879
1880 public:
1881 RandomGenerator() {
1882 auto max_value_size = FLAGS_value_size_max;
1883 switch (FLAGS_value_size_distribution_type_e) {
1884 case kUniform:
1885 dist_.reset(new UniformDistribution(FLAGS_value_size_min,
1886 FLAGS_value_size_max));
1887 break;
1888 case kNormal:
1889 dist_.reset(
1890 new NormalDistribution(FLAGS_value_size_min, FLAGS_value_size_max));
1891 break;
1892 case kFixed:
1893 default:
1894 dist_.reset(new FixedDistribution(value_size));
1895 max_value_size = value_size;
1896 }
1897 // We use a limited amount of data over and over again and ensure
1898 // that it is larger than the compression window (32KB), and also
1899 // large enough to serve all typical value sizes we want to write.
1900 Random rnd(301);
1901 std::string piece;
1902 while (data_.size() < (unsigned)std::max(1048576, max_value_size)) {
1903 // Add a short fragment that is as compressible as specified
1904 // by FLAGS_compression_ratio.
1905 test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
1906 data_.append(piece);
1907 }
1908 pos_ = 0;
1909 }
1910
1911 Slice Generate(unsigned int len) {
1912 assert(len <= data_.size());
1913 if (pos_ + len > data_.size()) {
1914 pos_ = 0;
1915 }
1916 pos_ += len;
1917 return Slice(data_.data() + pos_ - len, len);
1918 }
1919
1920 Slice Generate() {
1921 auto len = dist_->Generate();
1922 return Generate(len);
1923 }
1924 };
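// Usage sketch (illustrative only): a benchmark thread typically keeps one
// RandomGenerator and slices values out of its pre-filled, compressible
// buffer, e.g.
//   RandomGenerator gen;
//   Slice value = gen.Generate();     // size drawn from the distribution
//   Slice fixed = gen.Generate(100);  // exactly 100 bytes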
1925
1926 static void AppendWithSpace(std::string* str, Slice msg) {
1927 if (msg.empty()) return;
1928 if (!str->empty()) {
1929 str->push_back(' ');
1930 }
1931 str->append(msg.data(), msg.size());
1932 }
1933
1934 struct DBWithColumnFamilies {
1935 std::vector<ColumnFamilyHandle*> cfh;
1936 DB* db;
1937 #ifndef ROCKSDB_LITE
1938 OptimisticTransactionDB* opt_txn_db;
1939 #endif // ROCKSDB_LITE
1940 std::atomic<size_t> num_created; // Need to be updated after all the
1941 // new entries in cfh are set.
1942 size_t num_hot; // Number of column families to be queried at each moment.
1943 // After each CreateNewCf(), another num_hot number of new
1944 // Column families will be created and used to be queried.
1945 port::Mutex create_cf_mutex; // Only one thread can execute CreateNewCf()
1946 std::vector<int> cfh_idx_to_prob; // ith index holds probability of operating
1947 // on cfh[i].
1948
1949 DBWithColumnFamilies()
1950 : db(nullptr)
1951 #ifndef ROCKSDB_LITE
1952 ,
1953 opt_txn_db(nullptr)
1954 #endif // ROCKSDB_LITE
1955 {
1956 cfh.clear();
1957 num_created = 0;
1958 num_hot = 0;
1959 }
1960
1961 DBWithColumnFamilies(const DBWithColumnFamilies& other)
1962 : cfh(other.cfh),
1963 db(other.db),
1964 #ifndef ROCKSDB_LITE
1965 opt_txn_db(other.opt_txn_db),
1966 #endif // ROCKSDB_LITE
1967 num_created(other.num_created.load()),
1968 num_hot(other.num_hot),
1969 cfh_idx_to_prob(other.cfh_idx_to_prob) {
1970 }
1971
1972 void DeleteDBs() {
1973 std::for_each(cfh.begin(), cfh.end(),
1974 [](ColumnFamilyHandle* cfhi) { delete cfhi; });
1975 cfh.clear();
1976 #ifndef ROCKSDB_LITE
1977 if (opt_txn_db) {
1978 delete opt_txn_db;
1979 opt_txn_db = nullptr;
1980 } else {
1981 delete db;
1982 db = nullptr;
1983 }
1984 #else
1985 delete db;
1986 db = nullptr;
1987 #endif // ROCKSDB_LITE
1988 }
1989
1990 ColumnFamilyHandle* GetCfh(int64_t rand_num) {
1991 assert(num_hot > 0);
1992 size_t rand_offset = 0;
1993 if (!cfh_idx_to_prob.empty()) {
1994 assert(cfh_idx_to_prob.size() == num_hot);
1995 int sum = 0;
1996 while (sum + cfh_idx_to_prob[rand_offset] < rand_num % 100) {
1997 sum += cfh_idx_to_prob[rand_offset];
1998 ++rand_offset;
1999 }
2000 assert(rand_offset < cfh_idx_to_prob.size());
2001 } else {
2002 rand_offset = rand_num % num_hot;
2003 }
2004 return cfh[num_created.load(std::memory_order_acquire) - num_hot +
2005 rand_offset];
2006 }
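// Illustrative note on GetCfh() above: when cfh_idx_to_prob is set, e.g.
// {60, 30, 10} with num_hot = 3, rand_num % 100 is compared against the
// running sum of the probabilities, so the three hot column families are
// chosen roughly 60%, 30% and 10% of the time; otherwise the hot column
// families are chosen uniformly via rand_num % num_hot.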
2007
2008 // stage: assume CFs 0 through stage * num_hot - 1 have been created. Need
2009 // to create CFs stage * num_hot through (stage + 1) * num_hot - 1.
2010 void CreateNewCf(ColumnFamilyOptions options, int64_t stage) {
2011 MutexLock l(&create_cf_mutex);
2012 if ((stage + 1) * num_hot <= num_created) {
2013 // Already created.
2014 return;
2015 }
2016 auto new_num_created = num_created + num_hot;
2017 assert(new_num_created <= cfh.size());
2018 for (size_t i = num_created; i < new_num_created; i++) {
2019 Status s =
2020 db->CreateColumnFamily(options, ColumnFamilyName(i), &(cfh[i]));
2021 if (!s.ok()) {
2022 fprintf(stderr, "create column family error: %s\n",
2023 s.ToString().c_str());
2024 abort();
2025 }
2026 }
2027 num_created.store(new_num_created, std::memory_order_release);
2028 }
2029 };
2030
2031 // A class that reports stats to CSV file.
2032 class ReporterAgent {
2033 public:
2034 ReporterAgent(Env* env, const std::string& fname,
2035 uint64_t report_interval_secs)
2036 : env_(env),
2037 total_ops_done_(0),
2038 last_report_(0),
2039 report_interval_secs_(report_interval_secs),
2040 stop_(false) {
2041 auto s = env_->NewWritableFile(fname, &report_file_, EnvOptions());
2042 if (s.ok()) {
2043 s = report_file_->Append(Header() + "\n");
2044 }
2045 if (s.ok()) {
2046 s = report_file_->Flush();
2047 }
2048 if (!s.ok()) {
2049 fprintf(stderr, "Can't open %s: %s\n", fname.c_str(),
2050 s.ToString().c_str());
2051 abort();
2052 }
2053
2054 reporting_thread_ = port::Thread([&]() { SleepAndReport(); });
2055 }
2056
2057 ~ReporterAgent() {
2058 {
2059 std::unique_lock<std::mutex> lk(mutex_);
2060 stop_ = true;
2061 stop_cv_.notify_all();
2062 }
2063 reporting_thread_.join();
2064 }
2065
2066 // thread safe
2067 void ReportFinishedOps(int64_t num_ops) {
2068 total_ops_done_.fetch_add(num_ops);
2069 }
2070
2071 private:
2072 std::string Header() const { return "secs_elapsed,interval_qps"; }
2073 void SleepAndReport() {
2074 auto* clock = env_->GetSystemClock().get();
2075 auto time_started = clock->NowMicros();
2076 while (true) {
2077 {
2078 std::unique_lock<std::mutex> lk(mutex_);
2079 if (stop_ ||
2080 stop_cv_.wait_for(lk, std::chrono::seconds(report_interval_secs_),
2081 [&]() { return stop_; })) {
2082 // stopping
2083 break;
2084 }
2085 // else -> timeout, which means time for a report!
2086 }
2087 auto total_ops_done_snapshot = total_ops_done_.load();
2088 // round the seconds elapsed
2089 auto secs_elapsed =
2090 (clock->NowMicros() - time_started + kMicrosInSecond / 2) /
2091 kMicrosInSecond;
2092 std::string report =
2093 std::to_string(secs_elapsed) + "," +
2094 std::to_string(total_ops_done_snapshot - last_report_) + "\n";
2095 auto s = report_file_->Append(report);
2096 if (s.ok()) {
2097 s = report_file_->Flush();
2098 }
2099 if (!s.ok()) {
2100 fprintf(stderr,
2101 "Can't write to report file (%s), stopping the reporting\n",
2102 s.ToString().c_str());
2103 break;
2104 }
2105 last_report_ = total_ops_done_snapshot;
2106 }
2107 }
2108
2109 Env* env_;
2110 std::unique_ptr<WritableFile> report_file_;
2111 std::atomic<int64_t> total_ops_done_;
2112 int64_t last_report_;
2113 const uint64_t report_interval_secs_;
2114 ROCKSDB_NAMESPACE::port::Thread reporting_thread_;
2115 std::mutex mutex_;
2116 // will notify on stop
2117 std::condition_variable stop_cv_;
2118 bool stop_;
2119 };
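// Example of the CSV produced by ReporterAgent (numbers are illustrative):
// with --report_interval_seconds=10 the report file contains the header
// followed by one line per interval, e.g.
//   secs_elapsed,interval_qps
//   10,152340
//   20,149987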
2120
2121 enum OperationType : unsigned char {
2122 kRead = 0,
2123 kWrite,
2124 kDelete,
2125 kSeek,
2126 kMerge,
2127 kUpdate,
2128 kCompress,
2129 kUncompress,
2130 kCrc,
2131 kHash,
2132 kOthers
2133 };
2134
2135 static std::unordered_map<OperationType, std::string, std::hash<unsigned char>>
2136 OperationTypeString = {{kRead, "read"}, {kWrite, "write"},
2137 {kDelete, "delete"}, {kSeek, "seek"},
2138 {kMerge, "merge"}, {kUpdate, "update"},
2139 {kCompress, "compress"}, {kCompress, "uncompress"},
2140 {kCrc, "crc"}, {kHash, "hash"},
2141 {kOthers, "op"}};
2142
2143 class CombinedStats;
2144 class Stats {
2145 private:
2146 SystemClock* clock_;
2147 int id_;
2148 uint64_t start_ = 0;
2149 uint64_t sine_interval_;
2150 uint64_t finish_;
2151 double seconds_;
2152 uint64_t done_;
2153 uint64_t last_report_done_;
2154 uint64_t next_report_;
2155 uint64_t bytes_;
2156 uint64_t last_op_finish_;
2157 uint64_t last_report_finish_;
2158 std::unordered_map<OperationType, std::shared_ptr<HistogramImpl>,
2159 std::hash<unsigned char>>
2160 hist_;
2161 std::string message_;
2162 bool exclude_from_merge_;
2163 ReporterAgent* reporter_agent_ = nullptr; // does not own
2164 friend class CombinedStats;
2165
2166 public:
2167 Stats() : clock_(FLAGS_env->GetSystemClock().get()) { Start(-1); }
2168
2169 void SetReporterAgent(ReporterAgent* reporter_agent) {
2170 reporter_agent_ = reporter_agent;
2171 }
2172
2173 void Start(int id) {
2174 id_ = id;
2175 next_report_ = FLAGS_stats_interval ? FLAGS_stats_interval : 100;
2176 last_op_finish_ = start_;
2177 hist_.clear();
2178 done_ = 0;
2179 last_report_done_ = 0;
2180 bytes_ = 0;
2181 seconds_ = 0;
2182 start_ = clock_->NowMicros();
2183 sine_interval_ = clock_->NowMicros();
2184 finish_ = start_;
2185 last_report_finish_ = start_;
2186 message_.clear();
2187 // When set, stats from this thread won't be merged with others.
2188 exclude_from_merge_ = false;
2189 }
2190
2191 void Merge(const Stats& other) {
2192 if (other.exclude_from_merge_) return;
2193
2194 for (auto it = other.hist_.begin(); it != other.hist_.end(); ++it) {
2195 auto this_it = hist_.find(it->first);
2196 if (this_it != hist_.end()) {
2197 this_it->second->Merge(*(other.hist_.at(it->first)));
2198 } else {
2199 hist_.insert({it->first, it->second});
2200 }
2201 }
2202
2203 done_ += other.done_;
2204 bytes_ += other.bytes_;
2205 seconds_ += other.seconds_;
2206 if (other.start_ < start_) start_ = other.start_;
2207 if (other.finish_ > finish_) finish_ = other.finish_;
2208
2209 // Just keep the messages from one thread.
2210 if (message_.empty()) message_ = other.message_;
2211 }
2212
2213 void Stop() {
2214 finish_ = clock_->NowMicros();
2215 seconds_ = (finish_ - start_) * 1e-6;
2216 }
2217
2218 void AddMessage(Slice msg) { AppendWithSpace(&message_, msg); }
2219
2220 void SetId(int id) { id_ = id; }
2221 void SetExcludeFromMerge() { exclude_from_merge_ = true; }
2222
2223 void PrintThreadStatus() {
2224 std::vector<ThreadStatus> thread_list;
2225 FLAGS_env->GetThreadList(&thread_list);
2226
2227 fprintf(stderr, "\n%18s %10s %12s %20s %13s %45s %12s %s\n", "ThreadID",
2228 "ThreadType", "cfName", "Operation", "ElapsedTime", "Stage",
2229 "State", "OperationProperties");
2230
2231 int64_t current_time = 0;
2232 clock_->GetCurrentTime(&current_time).PermitUncheckedError();
2233 for (auto ts : thread_list) {
2234 fprintf(stderr, "%18" PRIu64 " %10s %12s %20s %13s %45s %12s",
2235 ts.thread_id,
2236 ThreadStatus::GetThreadTypeName(ts.thread_type).c_str(),
2237 ts.cf_name.c_str(),
2238 ThreadStatus::GetOperationName(ts.operation_type).c_str(),
2239 ThreadStatus::MicrosToString(ts.op_elapsed_micros).c_str(),
2240 ThreadStatus::GetOperationStageName(ts.operation_stage).c_str(),
2241 ThreadStatus::GetStateName(ts.state_type).c_str());
2242
2243 auto op_properties = ThreadStatus::InterpretOperationProperties(
2244 ts.operation_type, ts.op_properties);
2245 for (const auto& op_prop : op_properties) {
2246 fprintf(stderr, " %s %" PRIu64 " |", op_prop.first.c_str(),
2247 op_prop.second);
2248 }
2249 fprintf(stderr, "\n");
2250 }
2251 }
2252
2253 void ResetSineInterval() { sine_interval_ = clock_->NowMicros(); }
2254
2255 uint64_t GetSineInterval() { return sine_interval_; }
2256
2257 uint64_t GetStart() { return start_; }
2258
2259 void ResetLastOpTime() {
2260 // Set to now to avoid latency from calls to SleepForMicroseconds.
2261 last_op_finish_ = clock_->NowMicros();
2262 }
2263
2264 void FinishedOps(DBWithColumnFamilies* db_with_cfh, DB* db, int64_t num_ops,
2265 enum OperationType op_type = kOthers) {
2266 if (reporter_agent_) {
2267 reporter_agent_->ReportFinishedOps(num_ops);
2268 }
2269 if (FLAGS_histogram) {
2270 uint64_t now = clock_->NowMicros();
2271 uint64_t micros = now - last_op_finish_;
2272
2273 if (hist_.find(op_type) == hist_.end()) {
2274 auto hist_temp = std::make_shared<HistogramImpl>();
2275 hist_.insert({op_type, std::move(hist_temp)});
2276 }
2277 hist_[op_type]->Add(micros);
2278
2279 if (micros >= FLAGS_slow_usecs && !FLAGS_stats_interval) {
2280 fprintf(stderr, "long op: %" PRIu64 " micros%30s\r", micros, "");
2281 fflush(stderr);
2282 }
2283 last_op_finish_ = now;
2284 }
2285
2286 done_ += num_ops;
2287 if (done_ >= next_report_ && FLAGS_progress_reports) {
2288 if (!FLAGS_stats_interval) {
2289 if (next_report_ < 1000)
2290 next_report_ += 100;
2291 else if (next_report_ < 5000)
2292 next_report_ += 500;
2293 else if (next_report_ < 10000)
2294 next_report_ += 1000;
2295 else if (next_report_ < 50000)
2296 next_report_ += 5000;
2297 else if (next_report_ < 100000)
2298 next_report_ += 10000;
2299 else if (next_report_ < 500000)
2300 next_report_ += 50000;
2301 else
2302 next_report_ += 100000;
2303 fprintf(stderr, "... finished %" PRIu64 " ops%30s\r", done_, "");
2304 } else {
2305 uint64_t now = clock_->NowMicros();
2306 int64_t usecs_since_last = now - last_report_finish_;
2307
2308 // Determine whether to print status where interval is either
2309 // each N operations or each N seconds.
2310
2311 if (FLAGS_stats_interval_seconds &&
2312 usecs_since_last < (FLAGS_stats_interval_seconds * 1000000)) {
2313 // Don't check again for this many operations.
2314 next_report_ += FLAGS_stats_interval;
2315
2316 } else {
2317 fprintf(stderr,
2318 "%s ... thread %d: (%" PRIu64 ",%" PRIu64
2319 ") ops and "
2320 "(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n",
2321 clock_->TimeToString(now / 1000000).c_str(), id_,
2322 done_ - last_report_done_, done_,
2323 (done_ - last_report_done_) / (usecs_since_last / 1000000.0),
2324 done_ / ((now - start_) / 1000000.0),
2325 (now - last_report_finish_) / 1000000.0,
2326 (now - start_) / 1000000.0);
2327
2328 if (id_ == 0 && FLAGS_stats_per_interval) {
2329 std::string stats;
2330
2331 if (db_with_cfh && db_with_cfh->num_created.load()) {
2332 for (size_t i = 0; i < db_with_cfh->num_created.load(); ++i) {
2333 if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats",
2334 &stats))
2335 fprintf(stderr, "%s\n", stats.c_str());
2336 if (FLAGS_show_table_properties) {
2337 for (int level = 0; level < FLAGS_num_levels; ++level) {
2338 if (db->GetProperty(
2339 db_with_cfh->cfh[i],
2340 "rocksdb.aggregated-table-properties-at-level" +
2341 std::to_string(level),
2342 &stats)) {
2343 if (stats.find("# entries=0") == std::string::npos) {
2344 fprintf(stderr, "Level[%d]: %s\n", level,
2345 stats.c_str());
2346 }
2347 }
2348 }
2349 }
2350 }
2351 } else if (db) {
2352 if (db->GetProperty("rocksdb.stats", &stats)) {
2353 fprintf(stderr, "%s", stats.c_str());
2354 }
2355 if (db->GetProperty("rocksdb.num-running-compactions", &stats)) {
2356 fprintf(stderr, "num-running-compactions: %s\n", stats.c_str());
2357 }
2358 if (db->GetProperty("rocksdb.num-running-flushes", &stats)) {
2359 fprintf(stderr, "num-running-flushes: %s\n\n", stats.c_str());
2360 }
2361 if (FLAGS_show_table_properties) {
2362 for (int level = 0; level < FLAGS_num_levels; ++level) {
2363 if (db->GetProperty(
2364 "rocksdb.aggregated-table-properties-at-level" +
2365 std::to_string(level),
2366 &stats)) {
2367 if (stats.find("# entries=0") == std::string::npos) {
2368 fprintf(stderr, "Level[%d]: %s\n", level, stats.c_str());
2369 }
2370 }
2371 }
2372 }
2373 }
2374 }
2375
2376 next_report_ += FLAGS_stats_interval;
2377 last_report_finish_ = now;
2378 last_report_done_ = done_;
2379 }
2380 }
2381 if (id_ == 0 && FLAGS_thread_status_per_interval) {
2382 PrintThreadStatus();
2383 }
2384 fflush(stderr);
2385 }
2386 }
2387
2388 void AddBytes(int64_t n) { bytes_ += n; }
2389
2390 void Report(const Slice& name) {
2391 // Pretend at least one op was done in case we are running a benchmark
2392 // that does not call FinishedOps().
2393 if (done_ < 1) done_ = 1;
2394
2395 std::string extra;
2396 double elapsed = (finish_ - start_) * 1e-6;
2397 if (bytes_ > 0) {
2398 // Rate is computed on actual elapsed time, not the sum of per-thread
2399 // elapsed times.
2400 char rate[100];
2401 snprintf(rate, sizeof(rate), "%6.1f MB/s",
2402 (bytes_ / 1048576.0) / elapsed);
2403 extra = rate;
2404 }
2405 AppendWithSpace(&extra, message_);
2406 double throughput = (double)done_ / elapsed;
2407
2408 fprintf(stdout,
2409 "%-12s : %11.3f micros/op %ld ops/sec %.3f seconds %" PRIu64
2410 " operations;%s%s\n",
2411 name.ToString().c_str(), seconds_ * 1e6 / done_, (long)throughput,
2412 elapsed, done_, (extra.empty() ? "" : " "), extra.c_str());
2413 if (FLAGS_histogram) {
2414 for (auto it = hist_.begin(); it != hist_.end(); ++it) {
2415 fprintf(stdout, "Microseconds per %s:\n%s\n",
2416 OperationTypeString[it->first].c_str(),
2417 it->second->ToString().c_str());
2418 }
2419 }
2420 if (FLAGS_report_file_operations) {
2421 auto* counted_fs =
2422 FLAGS_env->GetFileSystem()->CheckedCast<CountedFileSystem>();
2423 assert(counted_fs);
2424 fprintf(stdout, "%s", counted_fs->PrintCounters().c_str());
2425 counted_fs->ResetCounters();
2426 }
2427 fflush(stdout);
2428 }
2429 };
2430
2431 class CombinedStats {
2432 public:
2433 void AddStats(const Stats& stat) {
2434 uint64_t total_ops = stat.done_;
2435 uint64_t total_bytes = stat.bytes_;
2436 double elapsed;
2437
2438 if (total_ops < 1) {
2439 total_ops = 1;
2440 }
2441
2442 elapsed = (stat.finish_ - stat.start_) * 1e-6;
2443 throughput_ops_.emplace_back(total_ops / elapsed);
2444
2445 if (total_bytes > 0) {
2446 double mbs = (total_bytes / 1048576.0);
2447 throughput_mbs_.emplace_back(mbs / elapsed);
2448 }
2449 }
2450
2451 void Report(const std::string& bench_name) {
2452 if (throughput_ops_.size() < 2) {
2453 // skip if there are not enough samples
2454 return;
2455 }
2456
2457 const char* name = bench_name.c_str();
2458 int num_runs = static_cast<int>(throughput_ops_.size());
2459
2460 if (throughput_mbs_.size() == throughput_ops_.size()) {
2461 fprintf(stdout,
2462 "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec; %6.1f (\xC2\xB1 "
2463 "%.1f) MB/sec\n",
2464 name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
2465 static_cast<int>(CalcConfidence95(throughput_ops_)),
2466 CalcAvg(throughput_mbs_), CalcConfidence95(throughput_mbs_));
2467 } else {
2468 fprintf(stdout, "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec\n", name,
2469 num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
2470 static_cast<int>(CalcConfidence95(throughput_ops_)));
2471 }
2472 }
2473
2474 void ReportWithConfidenceIntervals(const std::string& bench_name) {
2475 if (throughput_ops_.size() < 2) {
2476 // skip if there are not enough samples
2477 return;
2478 }
2479
2480 const char* name = bench_name.c_str();
2481 int num_runs = static_cast<int>(throughput_ops_.size());
2482
2483 int ops_avg = static_cast<int>(CalcAvg(throughput_ops_));
2484 int ops_confidence_95 = static_cast<int>(CalcConfidence95(throughput_ops_));
2485
2486 if (throughput_mbs_.size() == throughput_ops_.size()) {
2487 double mbs_avg = CalcAvg(throughput_mbs_);
2488 double mbs_confidence_95 = CalcConfidence95(throughput_mbs_);
2489 fprintf(stdout,
2490 "%s [CI95 %d runs] : (%d, %d) ops/sec; (%.1f, %.1f) MB/sec\n",
2491 name, num_runs, ops_avg - ops_confidence_95,
2492 ops_avg + ops_confidence_95, mbs_avg - mbs_confidence_95,
2493 mbs_avg + mbs_confidence_95);
2494 } else {
2495 fprintf(stdout, "%s [CI95 %d runs] : (%d, %d) ops/sec\n", name, num_runs,
2496 ops_avg - ops_confidence_95, ops_avg + ops_confidence_95);
2497 }
2498 }
2499
2500 void ReportFinal(const std::string& bench_name) {
2501 if (throughput_ops_.size() < 2) {
2502 // skip if there are not enough samples
2503 return;
2504 }
2505
2506 const char* name = bench_name.c_str();
2507 int num_runs = static_cast<int>(throughput_ops_.size());
2508
2509 if (throughput_mbs_.size() == throughput_ops_.size()) {
2510 // \xC2\xB1 is +/- character in UTF-8
2511 fprintf(stdout,
2512 "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec; %6.1f (\xC2\xB1 "
2513 "%.1f) MB/sec\n"
2514 "%s [MEDIAN %d runs] : %d ops/sec; %6.1f MB/sec\n",
2515 name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
2516 static_cast<int>(CalcConfidence95(throughput_ops_)),
2517 CalcAvg(throughput_mbs_), CalcConfidence95(throughput_mbs_), name,
2518 num_runs, static_cast<int>(CalcMedian(throughput_ops_)),
2519 CalcMedian(throughput_mbs_));
2520 } else {
2521 fprintf(stdout,
2522 "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec\n"
2523 "%s [MEDIAN %d runs] : %d ops/sec\n",
2524 name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
2525 static_cast<int>(CalcConfidence95(throughput_ops_)), name,
2526 num_runs, static_cast<int>(CalcMedian(throughput_ops_)));
2527 }
2528 }
2529
2530 private:
2531 double CalcAvg(std::vector<double>& data) {
2532 double avg = 0;
2533 for (double x : data) {
2534 avg += x;
2535 }
2536 avg = avg / data.size();
2537 return avg;
2538 }
2539
2540 // Calculates 95% CI assuming a normal distribution of samples.
2541 // Samples are not truly from a normal distribution, but this still
2542 // provides a useful approximation.
2543 double CalcConfidence95(std::vector<double>& data) {
2544 assert(data.size() > 1);
2545 double avg = CalcAvg(data);
2546 double std_error = CalcStdDev(data, avg) / std::sqrt(data.size());
2547
2548 // Z score for the 97.5 percentile
2549 // see https://en.wikipedia.org/wiki/1.96
2550 return 1.959964 * std_error;
2551 }
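// Worked example for the interval above (illustrative numbers): with 5 runs,
// an average of 100000 ops/sec and a sample standard deviation of 2000, the
// standard error is 2000 / sqrt(5) ~= 894, so the reported 95% confidence
// half-width is 1.959964 * 894 ~= 1753 ops/sec.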
2552
2553 double CalcMedian(std::vector<double>& data) {
2554 assert(data.size() > 0);
2555 std::sort(data.begin(), data.end());
2556
2557 size_t mid = data.size() / 2;
2558 if (data.size() % 2 == 1) {
2559 // Odd number of entries
2560 return data[mid];
2561 } else {
2562 // Even number of entries
2563 return (data[mid] + data[mid - 1]) / 2;
2564 }
2565 }
2566
2567 double CalcStdDev(std::vector<double>& data, double average) {
2568 assert(data.size() > 1);
2569 double squared_sum = 0.0;
2570 for (double x : data) {
2571 squared_sum += std::pow(x - average, 2);
2572 }
2573
2574 // using samples count - 1 following Bessel's correction
2575 // see https://en.wikipedia.org/wiki/Bessel%27s_correction
2576 return std::sqrt(squared_sum / (data.size() - 1));
2577 }
2578
2579 std::vector<double> throughput_ops_;
2580 std::vector<double> throughput_mbs_;
2581 };
2582
2583 class TimestampEmulator {
2584 private:
2585 std::atomic<uint64_t> timestamp_;
2586
2587 public:
2588 TimestampEmulator() : timestamp_(0) {}
2589 uint64_t Get() const { return timestamp_.load(); }
2590 void Inc() { timestamp_++; }
2591 Slice Allocate(char* scratch) {
2592 // TODO: support larger timestamp sizes
2593 assert(FLAGS_user_timestamp_size == 8);
2594 assert(scratch);
2595 uint64_t ts = timestamp_.fetch_add(1);
2596 EncodeFixed64(scratch, ts);
2597 return Slice(scratch, FLAGS_user_timestamp_size);
2598 }
2599 Slice GetTimestampForRead(Random64& rand, char* scratch) {
2600 assert(FLAGS_user_timestamp_size == 8);
2601 assert(scratch);
2602 if (FLAGS_read_with_latest_user_timestamp) {
2603 return Allocate(scratch);
2604 }
2605 // Choose a random timestamp from the past.
2606 uint64_t ts = rand.Next() % Get();
2607 EncodeFixed64(scratch, ts);
2608 return Slice(scratch, FLAGS_user_timestamp_size);
2609 }
2610 };
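// Usage sketch for the emulator above (illustrative; assumes
// --user_timestamp_size=8 and an in-scope Random64 named rand):
//   TimestampEmulator ts_emu;
//   char ts_buf[8];
//   Slice write_ts = ts_emu.Allocate(ts_buf);                  // next timestamp
//   Slice read_ts = ts_emu.GetTimestampForRead(rand, ts_buf);  // latest or a past one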
2611
2612 // State shared by all concurrent executions of the same benchmark.
2613 struct SharedState {
2614 port::Mutex mu;
2615 port::CondVar cv;
2616 int total;
2617 int perf_level;
2618 std::shared_ptr<RateLimiter> write_rate_limiter;
2619 std::shared_ptr<RateLimiter> read_rate_limiter;
2620
2621 // Each thread goes through the following states:
2622 // (1) initializing
2623 // (2) waiting for others to be initialized
2624 // (3) running
2625 // (4) done
2626
2627 long num_initialized;
2628 long num_done;
2629 bool start;
2630
2631 SharedState() : cv(&mu), perf_level(FLAGS_perf_level) {}
2632 };
2633
2634 // Per-thread state for concurrent executions of the same benchmark.
2635 struct ThreadState {
2636 int tid; // 0..n-1 when running in n threads
2637 Random64 rand; // Has different seeds for different threads
2638 Stats stats;
2639 SharedState* shared;
2640
2641 explicit ThreadState(int index, int my_seed)
2642 : tid(index), rand(seed_base + my_seed) {}
2643 };
2644
2645 class Duration {
2646 public:
2647 Duration(uint64_t max_seconds, int64_t max_ops, int64_t ops_per_stage = 0) {
2648 max_seconds_ = max_seconds;
2649 max_ops_ = max_ops;
2650 ops_per_stage_ = (ops_per_stage > 0) ? ops_per_stage : max_ops;
2651 ops_ = 0;
2652 start_at_ = FLAGS_env->NowMicros();
2653 }
2654
2655 int64_t GetStage() { return std::min(ops_, max_ops_ - 1) / ops_per_stage_; }
2656
2657 bool Done(int64_t increment) {
2658 if (increment <= 0) increment = 1; // avoid Done(0) and infinite loops
2659 ops_ += increment;
2660
2661 if (max_seconds_) {
2662 // Recheck roughly every 1000 ops (exact iff increment is a factor of 1000)
2663 auto granularity = FLAGS_ops_between_duration_checks;
2664 if ((ops_ / granularity) != ((ops_ - increment) / granularity)) {
2665 uint64_t now = FLAGS_env->NowMicros();
2666 return ((now - start_at_) / 1000000) >= max_seconds_;
2667 } else {
2668 return false;
2669 }
2670 } else {
2671 return ops_ > max_ops_;
2672 }
2673 }
2674
2675 private:
2676 uint64_t max_seconds_;
2677 int64_t max_ops_;
2678 int64_t ops_per_stage_;
2679 int64_t ops_;
2680 uint64_t start_at_;
2681 };
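// Usage sketch for Duration (illustrative; num_ops stands in for the
// per-thread operation budget): a benchmark loop runs until either the op
// budget or the time budget is exhausted, e.g.
//   Duration duration(FLAGS_duration, num_ops);
//   while (!duration.Done(1)) {
//     // perform one operation
//   }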
2682
2683 class Benchmark {
2684 private:
2685 std::shared_ptr<Cache> cache_;
2686 std::shared_ptr<Cache> compressed_cache_;
2687 std::shared_ptr<const SliceTransform> prefix_extractor_;
2688 DBWithColumnFamilies db_;
2689 std::vector<DBWithColumnFamilies> multi_dbs_;
2690 int64_t num_;
2691 int key_size_;
2692 int user_timestamp_size_;
2693 int prefix_size_;
2694 int total_thread_count_;
2695 int64_t keys_per_prefix_;
2696 int64_t entries_per_batch_;
2697 int64_t writes_before_delete_range_;
2698 int64_t writes_per_range_tombstone_;
2699 int64_t range_tombstone_width_;
2700 int64_t max_num_range_tombstones_;
2701 ReadOptions read_options_;
2702 WriteOptions write_options_;
2703 Options open_options_; // keep options around to properly destroy db later
2704 #ifndef ROCKSDB_LITE
2705 TraceOptions trace_options_;
2706 TraceOptions block_cache_trace_options_;
2707 #endif
2708 int64_t reads_;
2709 int64_t deletes_;
2710 double read_random_exp_range_;
2711 int64_t writes_;
2712 int64_t readwrites_;
2713 int64_t merge_keys_;
2714 bool report_file_operations_;
2715 bool use_blob_db_; // Stacked BlobDB
2716 bool read_operands_; // read via GetMergeOperands()
2717 std::vector<std::string> keys_;
2718
2719 class ErrorHandlerListener : public EventListener {
2720 public:
2721 #ifndef ROCKSDB_LITE
2722 ErrorHandlerListener()
2723 : mutex_(),
2724 cv_(&mutex_),
2725 no_auto_recovery_(false),
2726 recovery_complete_(false) {}
2727
2728 ~ErrorHandlerListener() override {}
2729
2730 const char* Name() const override { return kClassName(); }
2731 static const char* kClassName() { return "ErrorHandlerListener"; }
2732
2733 void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/,
2734 Status /*bg_error*/,
2735 bool* auto_recovery) override {
2736 if (*auto_recovery && no_auto_recovery_) {
2737 *auto_recovery = false;
2738 }
2739 }
2740
2741 void OnErrorRecoveryCompleted(Status /*old_bg_error*/) override {
2742 InstrumentedMutexLock l(&mutex_);
2743 recovery_complete_ = true;
2744 cv_.SignalAll();
2745 }
2746
2747 bool WaitForRecovery(uint64_t abs_time_us) {
2748 InstrumentedMutexLock l(&mutex_);
2749 if (!recovery_complete_) {
2750 cv_.TimedWait(abs_time_us);
2751 }
2752 if (recovery_complete_) {
2753 recovery_complete_ = false;
2754 return true;
2755 }
2756 return false;
2757 }
2758
2759 void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; }
2760
2761 private:
2762 InstrumentedMutex mutex_;
2763 InstrumentedCondVar cv_;
2764 bool no_auto_recovery_;
2765 bool recovery_complete_;
2766 #else // ROCKSDB_LITE
2767 bool WaitForRecovery(uint64_t /*abs_time_us*/) { return true; }
2768 void EnableAutoRecovery(bool /*enable*/) {}
2769 #endif // ROCKSDB_LITE
2770 };
2771
2772 std::shared_ptr<ErrorHandlerListener> listener_;
2773
2774 std::unique_ptr<TimestampEmulator> mock_app_clock_;
2775
2776 bool SanityCheck() {
2777 if (FLAGS_compression_ratio > 1) {
2778 fprintf(stderr, "compression_ratio should be between 0 and 1\n");
2779 return false;
2780 }
2781 return true;
2782 }
2783
2784 inline bool CompressSlice(const CompressionInfo& compression_info,
2785 const Slice& input, std::string* compressed) {
2786 constexpr uint32_t compress_format_version = 2;
2787
2788 return CompressData(input, compression_info, compress_format_version,
2789 compressed);
2790 }
2791
2792 void PrintHeader(const Options& options) {
2793 PrintEnvironment();
2794 fprintf(stdout,
2795 "Keys: %d bytes each (+ %d bytes user-defined timestamp)\n",
2796 FLAGS_key_size, FLAGS_user_timestamp_size);
2797 auto avg_value_size = FLAGS_value_size;
2798 if (FLAGS_value_size_distribution_type_e == kFixed) {
2799 fprintf(stdout,
2800 "Values: %d bytes each (%d bytes after compression)\n",
2801 avg_value_size,
2802 static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
2803 } else {
2804 avg_value_size = (FLAGS_value_size_min + FLAGS_value_size_max) / 2;
2805 fprintf(stdout,
2806 "Values: %d avg bytes each (%d bytes after compression)\n",
2807 avg_value_size,
2808 static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
2809 fprintf(stdout, "Values Distribution: %s (min: %d, max: %d)\n",
2810 FLAGS_value_size_distribution_type.c_str(), FLAGS_value_size_min,
2811 FLAGS_value_size_max);
2812 }
2813 fprintf(stdout, "Entries: %" PRIu64 "\n", num_);
2814 fprintf(stdout, "Prefix: %d bytes\n", FLAGS_prefix_size);
2815 fprintf(stdout, "Keys per prefix: %" PRIu64 "\n", keys_per_prefix_);
2816 fprintf(stdout, "RawSize: %.1f MB (estimated)\n",
2817 ((static_cast<int64_t>(FLAGS_key_size + avg_value_size) * num_) /
2818 1048576.0));
2819 fprintf(
2820 stdout, "FileSize: %.1f MB (estimated)\n",
2821 (((FLAGS_key_size + avg_value_size * FLAGS_compression_ratio) * num_) /
2822 1048576.0));
2823 fprintf(stdout, "Write rate: %" PRIu64 " bytes/second\n",
2824 FLAGS_benchmark_write_rate_limit);
2825 fprintf(stdout, "Read rate: %" PRIu64 " ops/second\n",
2826 FLAGS_benchmark_read_rate_limit);
2827 if (FLAGS_enable_numa) {
2828 fprintf(stderr, "Running in NUMA enabled mode.\n");
2829 #ifndef NUMA
2830 fprintf(stderr, "NUMA is not defined in the system.\n");
2831 exit(1);
2832 #else
2833 if (numa_available() == -1) {
2834 fprintf(stderr, "NUMA is not supported by the system.\n");
2835 exit(1);
2836 }
2837 #endif
2838 }
2839
2840 auto compression = CompressionTypeToString(FLAGS_compression_type_e);
2841 fprintf(stdout, "Compression: %s\n", compression.c_str());
2842 fprintf(stdout, "Compression sampling rate: %" PRId64 "\n",
2843 FLAGS_sample_for_compression);
2844 if (options.memtable_factory != nullptr) {
2845 fprintf(stdout, "Memtablerep: %s\n",
2846 options.memtable_factory->GetId().c_str());
2847 }
2848 fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);
2849
2850 PrintWarnings(compression.c_str());
2851 fprintf(stdout, "------------------------------------------------\n");
2852 }
2853
2854 void PrintWarnings(const char* compression) {
2855 #if defined(__GNUC__) && !defined(__OPTIMIZE__)
2856 fprintf(
2857 stdout,
2858 "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n");
2859 #endif
2860 #ifndef NDEBUG
2861 fprintf(stdout,
2862 "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
2863 #endif
2864 if (FLAGS_compression_type_e != ROCKSDB_NAMESPACE::kNoCompression) {
2865 // The test string should not be too small.
2866 const int len = FLAGS_block_size;
2867 std::string input_str(len, 'y');
2868 std::string compressed;
2869 CompressionOptions opts;
2870 CompressionContext context(FLAGS_compression_type_e);
2871 CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
2872 FLAGS_compression_type_e,
2873 FLAGS_sample_for_compression);
2874 bool result = CompressSlice(info, Slice(input_str), &compressed);
2875
2876 if (!result) {
2877 fprintf(stdout, "WARNING: %s compression is not enabled\n",
2878 compression);
2879 } else if (compressed.size() >= input_str.size()) {
2880 fprintf(stdout, "WARNING: %s compression is not effective\n",
2881 compression);
2882 }
2883 }
2884 }
2885
2886 // Currently the following isn't equivalent to OS_LINUX.
2887 #if defined(__linux)
2888 static Slice TrimSpace(Slice s) {
2889 unsigned int start = 0;
2890 while (start < s.size() && isspace(s[start])) {
2891 start++;
2892 }
2893 unsigned int limit = static_cast<unsigned int>(s.size());
2894 while (limit > start && isspace(s[limit - 1])) {
2895 limit--;
2896 }
2897 return Slice(s.data() + start, limit - start);
2898 }
2899 #endif
2900
2901 void PrintEnvironment() {
2902 fprintf(stderr, "RocksDB: version %s\n",
2903 GetRocksVersionAsString(true).c_str());
2904
2905 #if defined(__linux) || defined(__APPLE__) || defined(__FreeBSD__)
2906 time_t now = time(nullptr);
2907 char buf[52];
2908 // Lint complains about ctime() usage, so replace it with ctime_r(). The
2909 // requirement is to provide a buffer which is at least 26 bytes.
2910 fprintf(stderr, "Date: %s",
2911 ctime_r(&now, buf)); // ctime_r() adds newline
2912
2913 #if defined(__linux)
2914 FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
2915 if (cpuinfo != nullptr) {
2916 char line[1000];
2917 int num_cpus = 0;
2918 std::string cpu_type;
2919 std::string cache_size;
2920 while (fgets(line, sizeof(line), cpuinfo) != nullptr) {
2921 const char* sep = strchr(line, ':');
2922 if (sep == nullptr) {
2923 continue;
2924 }
2925 Slice key = TrimSpace(Slice(line, sep - 1 - line));
2926 Slice val = TrimSpace(Slice(sep + 1));
2927 if (key == "model name") {
2928 ++num_cpus;
2929 cpu_type = val.ToString();
2930 } else if (key == "cache size") {
2931 cache_size = val.ToString();
2932 }
2933 }
2934 fclose(cpuinfo);
2935 fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str());
2936 fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
2937 }
2938 #elif defined(__APPLE__)
2939 struct host_basic_info h;
2940 size_t hlen = HOST_BASIC_INFO_COUNT;
2941 if (host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&h,
2942 (uint32_t*)&hlen) == KERN_SUCCESS) {
2943 std::string cpu_type;
2944 std::string cache_size;
2945 size_t hcache_size;
2946 hlen = sizeof(hcache_size);
2947 if (sysctlbyname("hw.cachelinesize", &hcache_size, &hlen, NULL, 0) == 0) {
2948 cache_size = std::to_string(hcache_size);
2949 }
2950 switch (h.cpu_type) {
2951 case CPU_TYPE_X86_64:
2952 cpu_type = "x86_64";
2953 break;
2954 case CPU_TYPE_ARM64:
2955 cpu_type = "arm64";
2956 break;
2957 default:
2958 break;
2959 }
2960 fprintf(stderr, "CPU: %d * %s\n", h.max_cpus, cpu_type.c_str());
2961 fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
2962 }
2963 #elif defined(__FreeBSD__)
2964 int ncpus;
2965 size_t len = sizeof(ncpus);
2966 int mib[2] = {CTL_HW, HW_NCPU};
2967 if (sysctl(mib, 2, &ncpus, &len, nullptr, 0) == 0) {
2968 char cpu_type[16];
2969 len = sizeof(cpu_type) - 1;
2970 mib[1] = HW_MACHINE;
2971 if (sysctl(mib, 2, cpu_type, &len, nullptr, 0) == 0) cpu_type[len] = 0;
2972
2973 fprintf(stderr, "CPU: %d * %s\n", ncpus, cpu_type);
2974 // no programmatic way to get the cache line size except on PPC
2975 }
2976 #endif
2977 #endif
2978 }
2979
2980 static bool KeyExpired(const TimestampEmulator* timestamp_emulator,
2981 const Slice& key) {
2982 const char* pos = key.data();
2983 pos += 8;
2984 uint64_t timestamp = 0;
2985 if (port::kLittleEndian) {
2986 int bytes_to_fill = 8;
2987 for (int i = 0; i < bytes_to_fill; ++i) {
2988 timestamp |= (static_cast<uint64_t>(static_cast<unsigned char>(pos[i]))
2989 << ((bytes_to_fill - i - 1) << 3));
2990 }
2991 } else {
2992 memcpy(&timestamp, pos, sizeof(timestamp));
2993 }
2994 return timestamp_emulator->Get() - timestamp > FLAGS_time_range;
2995 }
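// Note on KeyExpired() above: TimeSeries keys are assumed to carry a 64-bit
// big-endian timestamp starting at byte offset 8 (after the key id). On
// little-endian hosts the bytes are reassembled manually; a key counts as
// expired once it is more than --time_range ticks older than the emulator's
// current timestamp.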
2996
2997 class ExpiredTimeFilter : public CompactionFilter {
2998 public:
2999 explicit ExpiredTimeFilter(
3000 const std::shared_ptr<TimestampEmulator>& timestamp_emulator)
3001 : timestamp_emulator_(timestamp_emulator) {}
3002 bool Filter(int /*level*/, const Slice& key,
3003 const Slice& /*existing_value*/, std::string* /*new_value*/,
3004 bool* /*value_changed*/) const override {
3005 return KeyExpired(timestamp_emulator_.get(), key);
3006 }
3007 const char* Name() const override { return "ExpiredTimeFilter"; }
3008
3009 private:
3010 std::shared_ptr<TimestampEmulator> timestamp_emulator_;
3011 };
3012
3013 class KeepFilter : public CompactionFilter {
3014 public:
3015 bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
3016 std::string* /*new_value*/,
3017 bool* /*value_changed*/) const override {
3018 return false;
3019 }
3020
3021 const char* Name() const override { return "KeepFilter"; }
3022 };
3023
3024 static std::shared_ptr<MemoryAllocator> GetCacheAllocator() {
3025 std::shared_ptr<MemoryAllocator> allocator;
3026
3027 if (FLAGS_use_cache_jemalloc_no_dump_allocator) {
3028 JemallocAllocatorOptions jemalloc_options;
3029 if (!NewJemallocNodumpAllocator(jemalloc_options, &allocator).ok()) {
3030 fprintf(stderr, "JemallocNodumpAllocator not supported.\n");
3031 exit(1);
3032 }
3033 } else if (FLAGS_use_cache_memkind_kmem_allocator) {
3034 #ifdef MEMKIND
3035 allocator = std::make_shared<MemkindKmemAllocator>();
3036 #else
3037 fprintf(stderr, "Memkind library is not linked with the binary.\n");
3038 exit(1);
3039 #endif
3040 }
3041
3042 return allocator;
3043 }
3044
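// Builds the block cache selected by --cache_type. A non-positive capacity
// disables the cache entirely. "hyper_clock_cache" uses --block_size as the
// estimated per-entry charge; "lru_cache" optionally attaches a secondary
// cache created from --secondary_cache_uri or a compressed secondary cache
// configured by the related flags.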
3045 static std::shared_ptr<Cache> NewCache(int64_t capacity) {
3046 if (capacity <= 0) {
3047 return nullptr;
3048 }
3049 if (FLAGS_cache_type == "clock_cache") {
3050 fprintf(stderr, "Old clock cache implementation has been removed.\n");
3051 exit(1);
3052 } else if (FLAGS_cache_type == "hyper_clock_cache") {
3053 return HyperClockCacheOptions(static_cast<size_t>(capacity),
3054 FLAGS_block_size /*estimated_entry_charge*/,
3055 FLAGS_cache_numshardbits)
3056 .MakeSharedCache();
3057 } else if (FLAGS_cache_type == "lru_cache") {
3058 LRUCacheOptions opts(
3059 static_cast<size_t>(capacity), FLAGS_cache_numshardbits,
3060 false /*strict_capacity_limit*/, FLAGS_cache_high_pri_pool_ratio,
3061 GetCacheAllocator(), kDefaultToAdaptiveMutex,
3062 kDefaultCacheMetadataChargePolicy, FLAGS_cache_low_pri_pool_ratio);
3063
3064 #ifndef ROCKSDB_LITE
3065 if (!FLAGS_secondary_cache_uri.empty()) {
3066 Status s = SecondaryCache::CreateFromString(
3067 ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache);
3068 if (secondary_cache == nullptr) {
3069 fprintf(
3070 stderr,
3071 "No secondary cache registered matching string: %s status=%s\n",
3072 FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str());
3073 exit(1);
3074 }
3075 opts.secondary_cache = secondary_cache;
3076 }
3077 #endif // ROCKSDB_LITE
3078
3079 if (FLAGS_use_compressed_secondary_cache) {
3080 CompressedSecondaryCacheOptions secondary_cache_opts;
3081 secondary_cache_opts.capacity = FLAGS_compressed_secondary_cache_size;
3082 secondary_cache_opts.num_shard_bits =
3083 FLAGS_compressed_secondary_cache_numshardbits;
3084 secondary_cache_opts.high_pri_pool_ratio =
3085 FLAGS_compressed_secondary_cache_high_pri_pool_ratio;
3086 secondary_cache_opts.low_pri_pool_ratio =
3087 FLAGS_compressed_secondary_cache_low_pri_pool_ratio;
3088 secondary_cache_opts.compression_type =
3089 FLAGS_compressed_secondary_cache_compression_type_e;
3090 secondary_cache_opts.compress_format_version =
3091 FLAGS_compressed_secondary_cache_compress_format_version;
3092 opts.secondary_cache =
3093 NewCompressedSecondaryCache(secondary_cache_opts);
3094 }
3095
3096 return NewLRUCache(opts);
3097 } else {
3098 fprintf(stderr, "Cache type not supported.");
3099 exit(1);
3100 }
3101 }
3102
3103 public:
3104 Benchmark()
3105 : cache_(NewCache(FLAGS_cache_size)),
3106 compressed_cache_(NewCache(FLAGS_compressed_cache_size)),
3107 prefix_extractor_(FLAGS_prefix_size != 0
3108 ? NewFixedPrefixTransform(FLAGS_prefix_size)
3109 : nullptr),
3110 num_(FLAGS_num),
3111 key_size_(FLAGS_key_size),
3112 user_timestamp_size_(FLAGS_user_timestamp_size),
3113 prefix_size_(FLAGS_prefix_size),
3114 total_thread_count_(0),
3115 keys_per_prefix_(FLAGS_keys_per_prefix),
3116 entries_per_batch_(1),
3117 reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
3118 read_random_exp_range_(0.0),
3119 writes_(FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes),
3120 readwrites_(
3121 (FLAGS_writes < 0 && FLAGS_reads < 0)
3122 ? FLAGS_num
3123 : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)),
3124 merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys),
3125 report_file_operations_(FLAGS_report_file_operations),
3126 #ifndef ROCKSDB_LITE
3127 use_blob_db_(FLAGS_use_blob_db), // Stacked BlobDB
3128 #else
3129 use_blob_db_(false), // Stacked BlobDB
3130 #endif // !ROCKSDB_LITE
3131 read_operands_(false) {
3132 // Wrap the block cache in a SimCache when --simcache_size is non-negative.
3133 if (FLAGS_simcache_size >= 0) {
3134 if (FLAGS_cache_numshardbits >= 1) {
3135 cache_ =
3136 NewSimCache(cache_, FLAGS_simcache_size, FLAGS_cache_numshardbits);
3137 } else {
3138 cache_ = NewSimCache(cache_, FLAGS_simcache_size, 0);
3139 }
3140 }
3141
3142 if (report_file_operations_) {
3143 FLAGS_env = new CompositeEnvWrapper(
3144 FLAGS_env,
3145 std::make_shared<CountedFileSystem>(FLAGS_env->GetFileSystem()));
3146 }
3147
3148 if (FLAGS_prefix_size > FLAGS_key_size) {
3149 fprintf(stderr, "prefix size is larger than key size");
3150 exit(1);
3151 }
3152
3153 std::vector<std::string> files;
3154 FLAGS_env->GetChildren(FLAGS_db, &files);
3155 for (size_t i = 0; i < files.size(); i++) {
3156 if (Slice(files[i]).starts_with("heap-")) {
3157 FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
3158 }
3159 }
3160 if (!FLAGS_use_existing_db) {
3161 Options options;
3162 options.env = FLAGS_env;
3163 if (!FLAGS_wal_dir.empty()) {
3164 options.wal_dir = FLAGS_wal_dir;
3165 }
3166 #ifndef ROCKSDB_LITE
3167 if (use_blob_db_) {
3168 // Stacked BlobDB
3169 blob_db::DestroyBlobDB(FLAGS_db, options, blob_db::BlobDBOptions());
3170 }
3171 #endif // !ROCKSDB_LITE
3172 DestroyDB(FLAGS_db, options);
3173 if (!FLAGS_wal_dir.empty()) {
3174 FLAGS_env->DeleteDir(FLAGS_wal_dir);
3175 }
3176
3177 if (FLAGS_num_multi_db > 1) {
3178 FLAGS_env->CreateDir(FLAGS_db);
3179 if (!FLAGS_wal_dir.empty()) {
3180 FLAGS_env->CreateDir(FLAGS_wal_dir);
3181 }
3182 }
3183 }
3184
3185 listener_.reset(new ErrorHandlerListener());
3186 if (user_timestamp_size_ > 0) {
3187 mock_app_clock_.reset(new TimestampEmulator());
3188 }
3189 }
3190
3191 void DeleteDBs() {
3192 db_.DeleteDBs();
3193 for (const DBWithColumnFamilies& dbwcf : multi_dbs_) {
3194 delete dbwcf.db;
3195 }
3196 }
3197
3198 ~Benchmark() {
3199 DeleteDBs();
3200 if (cache_.get() != nullptr) {
3201 // Clear cache reference first
3202 open_options_.write_buffer_manager.reset();
3203 // this will leak, but we're shutting down so nobody cares
3204 cache_->DisownData();
3205 }
3206 }
3207
3208 Slice AllocateKey(std::unique_ptr<const char[]>* key_guard) {
3209 char* data = new char[key_size_];
3210 const char* const_data = data;
3211 key_guard->reset(const_data);
3212 return Slice(key_guard->get(), key_size_);
3213 }
3214
3215 // Generate key according to the given specification and random number.
3216 // The resulting key will have the following format:
3217 // - If keys_per_prefix_ is positive, extra trailing bytes are either cut
3218 // off or padded with '0'.
3219 // The prefix value is derived from the key value.
3220 // ----------------------------
3221 // | prefix 00000 | key 00000 |
3222 // ----------------------------
3223 //
3224 // - If keys_per_prefix_ is 0, the key is simply a binary representation of
3225 // random number followed by trailing '0's
3226 // ----------------------------
3227 // | key 00000 |
3228 // ----------------------------
3229 void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) {
3230 if (!keys_.empty()) {
3231 assert(FLAGS_use_existing_keys);
3232 assert(keys_.size() == static_cast<size_t>(num_keys));
3233 assert(v < static_cast<uint64_t>(num_keys));
3234 *key = keys_[v];
3235 return;
3236 }
3237 char* start = const_cast<char*>(key->data());
3238 char* pos = start;
3239 if (keys_per_prefix_ > 0) {
3240 int64_t num_prefix = num_keys / keys_per_prefix_;
3241 int64_t prefix = v % num_prefix;
3242 int bytes_to_fill = std::min(prefix_size_, 8);
3243 if (port::kLittleEndian) {
3244 for (int i = 0; i < bytes_to_fill; ++i) {
3245 pos[i] = (prefix >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
3246 }
3247 } else {
3248 memcpy(pos, static_cast<void*>(&prefix), bytes_to_fill);
3249 }
3250 if (prefix_size_ > 8) {
3251 // fill the rest with 0s
3252 memset(pos + 8, '0', prefix_size_ - 8);
3253 }
3254 pos += prefix_size_;
3255 }
3256
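// Write the integer most-significant-byte first so the generated keys sort
// lexicographically in the same order as the integers; on big-endian hosts
// a plain memcpy already yields that byte order.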
3257 int bytes_to_fill = std::min(key_size_ - static_cast<int>(pos - start), 8);
3258 if (port::kLittleEndian) {
3259 for (int i = 0; i < bytes_to_fill; ++i) {
3260 pos[i] = (v >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
3261 }
3262 } else {
3263 memcpy(pos, static_cast<void*>(&v), bytes_to_fill);
3264 }
3265 pos += bytes_to_fill;
3266 if (key_size_ > pos - start) {
3267 memset(pos, '0', key_size_ - (pos - start));
3268 }
3269 }
3270
3271 void GenerateKeyFromIntForSeek(uint64_t v, int64_t num_keys, Slice* key) {
3272 GenerateKeyFromInt(v, num_keys, key);
3273 if (FLAGS_seek_missing_prefix) {
3274 assert(prefix_size_ > 8);
3275 char* key_ptr = const_cast<char*>(key->data());
3276 // This relies on GenerateKeyFromInt filling the padding with '0's.
3277 // Putting a '1' there creates a non-existent prefix.
3278 key_ptr[8] = '1';
3279 }
3280 }
3281
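// Appends a trailing path separator to base_name if needed and then the
// numeric id, e.g. GetPathForMultiple("/tmp/dbbench", 2) -> "/tmp/dbbench/2"
// ('\' instead of '/' on Windows). Used to derive per-DB paths when
// --num_multi_db is set.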
3282 std::string GetPathForMultiple(std::string base_name, size_t id) {
3283 if (!base_name.empty()) {
3284 #ifndef OS_WIN
3285 if (base_name.back() != '/') {
3286 base_name += '/';
3287 }
3288 #else
3289 if (base_name.back() != '\\') {
3290 base_name += '\\';
3291 }
3292 #endif
3293 }
3294 return base_name + std::to_string(id);
3295 }
3296
3297 void VerifyDBFromDB(std::string& truth_db_name) {
3298 DBWithColumnFamilies truth_db;
3299 auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db);
3300 if (!s.ok()) {
3301 fprintf(stderr, "open error: %s\n", s.ToString().c_str());
3302 exit(1);
3303 }
3304 ReadOptions ro;
3305 ro.total_order_seek = true;
3306 std::unique_ptr<Iterator> truth_iter(truth_db.db->NewIterator(ro));
3307 std::unique_ptr<Iterator> db_iter(db_.db->NewIterator(ro));
3308 // Verify that all the key/values in truth_db are retrievable in db with
3309 // ::Get
3310 fprintf(stderr, "Verifying db >= truth_db with ::Get...\n");
3311 for (truth_iter->SeekToFirst(); truth_iter->Valid(); truth_iter->Next()) {
3312 std::string value;
3313 s = db_.db->Get(ro, truth_iter->key(), &value);
3314 assert(s.ok());
3315 // TODO(myabandeh): provide debugging hints
3316 assert(Slice(value) == truth_iter->value());
3317 }
3318 // Verify that the db iterator does not give any extra key/value
3319 fprintf(stderr, "Verifying db == truth_db...\n");
3320 for (db_iter->SeekToFirst(), truth_iter->SeekToFirst(); db_iter->Valid();
3321 db_iter->Next(), truth_iter->Next()) {
3322 assert(truth_iter->Valid());
3323 assert(truth_iter->value() == db_iter->value());
3324 }
3325 // No more keys should be left unchecked in truth_db
3326 assert(!truth_iter->Valid());
3327 fprintf(stderr, "...Verified\n");
3328 }
3329
3330 void ErrorExit() {
3331 DeleteDBs();
3332 exit(1);
3333 }
3334
3335 void Run() {
3336 if (!SanityCheck()) {
3337 ErrorExit();
3338 }
3339 Open(&open_options_);
3340 PrintHeader(open_options_);
3341 std::stringstream benchmark_stream(FLAGS_benchmarks);
3342 std::string name;
3343 std::unique_ptr<ExpiredTimeFilter> filter;
3344 while (std::getline(benchmark_stream, name, ',')) {
3345 // Sanitize parameters
3346 num_ = FLAGS_num;
3347 reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
3348 writes_ = (FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes);
3349 deletes_ = (FLAGS_deletes < 0 ? FLAGS_num : FLAGS_deletes);
3350 value_size = FLAGS_value_size;
3351 key_size_ = FLAGS_key_size;
3352 entries_per_batch_ = FLAGS_batch_size;
3353 writes_before_delete_range_ = FLAGS_writes_before_delete_range;
3354 writes_per_range_tombstone_ = FLAGS_writes_per_range_tombstone;
3355 range_tombstone_width_ = FLAGS_range_tombstone_width;
3356 max_num_range_tombstones_ = FLAGS_max_num_range_tombstones;
3357 write_options_ = WriteOptions();
3358 read_random_exp_range_ = FLAGS_read_random_exp_range;
3359 if (FLAGS_sync) {
3360 write_options_.sync = true;
3361 }
3362 write_options_.disableWAL = FLAGS_disable_wal;
3363 write_options_.rate_limiter_priority =
3364 FLAGS_rate_limit_auto_wal_flush ? Env::IO_USER : Env::IO_TOTAL;
3365 read_options_ = ReadOptions(FLAGS_verify_checksum, true);
3366 read_options_.total_order_seek = FLAGS_total_order_seek;
3367 read_options_.prefix_same_as_start = FLAGS_prefix_same_as_start;
3368 read_options_.rate_limiter_priority =
3369 FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
3370 read_options_.tailing = FLAGS_use_tailing_iterator;
3371 read_options_.readahead_size = FLAGS_readahead_size;
3372 read_options_.adaptive_readahead = FLAGS_adaptive_readahead;
3373 read_options_.async_io = FLAGS_async_io;
3374 read_options_.optimize_multiget_for_io = FLAGS_optimize_multiget_for_io;
3375
3376 void (Benchmark::*method)(ThreadState*) = nullptr;
3377 void (Benchmark::*post_process_method)() = nullptr;
3378
3379 bool fresh_db = false;
3380 int num_threads = FLAGS_threads;
3381
3382 int num_repeat = 1;
3383 int num_warmup = 0;
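// A benchmark name may carry a bracketed suffix of the form "[X<n>-W<m>]":
// X<n> repeats the measured run n times and W<m> performs m warm-up runs
// first, e.g. "readrandom[X5-W2]" warms up twice, then measures five runs.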
3384 if (!name.empty() && *name.rbegin() == ']') {
3385 auto it = name.find('[');
3386 if (it == std::string::npos) {
3387 fprintf(stderr, "unknown benchmark arguments '%s'\n", name.c_str());
3388 ErrorExit();
3389 }
3390 std::string args = name.substr(it + 1);
3391 args.resize(args.size() - 1);
3392 name.resize(it);
3393
3394 std::string bench_arg;
3395 std::stringstream args_stream(args);
3396 while (std::getline(args_stream, bench_arg, '-')) {
3397 if (bench_arg.empty()) {
3398 continue;
3399 }
3400 if (bench_arg[0] == 'X') {
3401 // Repeat the benchmark n times
3402 std::string num_str = bench_arg.substr(1);
3403 num_repeat = std::stoi(num_str);
3404 } else if (bench_arg[0] == 'W') {
3405 // Warm up the benchmark n times
3406 std::string num_str = bench_arg.substr(1);
3407 num_warmup = std::stoi(num_str);
3408 }
3409 }
3410 }
3411
3412 // Both fillseqdeterministic and filluniquerandomdeterministic fill
3413 // all levels except the max level with UNIQUE_RANDOM, and fill the
3414 // max level as fillseq and filluniquerandom do, respectively.
3415 if (name == "fillseqdeterministic" ||
3416 name == "filluniquerandomdeterministic") {
3417 if (!FLAGS_disable_auto_compactions) {
3418 fprintf(stderr,
3419 "Please disable_auto_compactions in FillDeterministic "
3420 "benchmark\n");
3421 ErrorExit();
3422 }
3423 if (num_threads > 1) {
3424 fprintf(stderr,
3425 "filldeterministic multithreaded not supported"
3426 ", use 1 thread\n");
3427 num_threads = 1;
3428 }
3429 fresh_db = true;
3430 if (name == "fillseqdeterministic") {
3431 method = &Benchmark::WriteSeqDeterministic;
3432 } else {
3433 method = &Benchmark::WriteUniqueRandomDeterministic;
3434 }
3435 } else if (name == "fillseq") {
3436 fresh_db = true;
3437 method = &Benchmark::WriteSeq;
3438 } else if (name == "fillbatch") {
3439 fresh_db = true;
3440 entries_per_batch_ = 1000;
3441 method = &Benchmark::WriteSeq;
3442 } else if (name == "fillrandom") {
3443 fresh_db = true;
3444 method = &Benchmark::WriteRandom;
3445 } else if (name == "filluniquerandom" ||
3446 name == "fillanddeleteuniquerandom") {
3447 fresh_db = true;
3448 if (num_threads > 1) {
3449 fprintf(stderr,
3450 "filluniquerandom and fillanddeleteuniquerandom "
3451 "multithreaded not supported, use 1 thread");
3452 num_threads = 1;
3453 }
3454 method = &Benchmark::WriteUniqueRandom;
3455 } else if (name == "overwrite") {
3456 method = &Benchmark::WriteRandom;
3457 } else if (name == "fillsync") {
3458 fresh_db = true;
3459 num_ /= 1000;
3460 write_options_.sync = true;
3461 method = &Benchmark::WriteRandom;
3462 } else if (name == "fill100K") {
3463 fresh_db = true;
3464 num_ /= 1000;
3465 value_size = 100 * 1000;
3466 method = &Benchmark::WriteRandom;
3467 } else if (name == "readseq") {
3468 method = &Benchmark::ReadSequential;
3469 } else if (name == "readtorowcache") {
3470 if (!FLAGS_use_existing_keys || !FLAGS_row_cache_size) {
3471 fprintf(stderr,
3472 "Please set use_existing_keys to true and specify a "
3473 "row cache size in readtorowcache benchmark\n");
3474 ErrorExit();
3475 }
3476 method = &Benchmark::ReadToRowCache;
3477 } else if (name == "readtocache") {
3478 method = &Benchmark::ReadSequential;
3479 num_threads = 1;
3480 reads_ = num_;
3481 } else if (name == "readreverse") {
3482 method = &Benchmark::ReadReverse;
3483 } else if (name == "readrandom") {
3484 if (FLAGS_multiread_stride) {
3485 fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
3486 entries_per_batch_);
3487 }
3488 method = &Benchmark::ReadRandom;
3489 } else if (name == "readrandomfast") {
3490 method = &Benchmark::ReadRandomFast;
3491 } else if (name == "multireadrandom") {
3492 fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
3493 entries_per_batch_);
3494 method = &Benchmark::MultiReadRandom;
3495 } else if (name == "multireadwhilewriting") {
3496 fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
3497 entries_per_batch_);
3498 num_threads++;
3499 method = &Benchmark::MultiReadWhileWriting;
3500 } else if (name == "approximatesizerandom") {
3501 fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
3502 entries_per_batch_);
3503 method = &Benchmark::ApproximateSizeRandom;
3504 } else if (name == "mixgraph") {
3505 method = &Benchmark::MixGraph;
3506 } else if (name == "readmissing") {
3507 ++key_size_;
3508 method = &Benchmark::ReadRandom;
3509 } else if (name == "newiterator") {
3510 method = &Benchmark::IteratorCreation;
3511 } else if (name == "newiteratorwhilewriting") {
3512 num_threads++; // Add extra thread for writing
3513 method = &Benchmark::IteratorCreationWhileWriting;
3514 } else if (name == "seekrandom") {
3515 method = &Benchmark::SeekRandom;
3516 } else if (name == "seekrandomwhilewriting") {
3517 num_threads++; // Add extra thread for writing
3518 method = &Benchmark::SeekRandomWhileWriting;
3519 } else if (name == "seekrandomwhilemerging") {
3520 num_threads++; // Add extra thread for merging
3521 method = &Benchmark::SeekRandomWhileMerging;
3522 } else if (name == "readrandomsmall") {
3523 reads_ /= 1000;
3524 method = &Benchmark::ReadRandom;
3525 } else if (name == "deleteseq") {
3526 method = &Benchmark::DeleteSeq;
3527 } else if (name == "deleterandom") {
3528 method = &Benchmark::DeleteRandom;
3529 } else if (name == "readwhilewriting") {
3530 num_threads++; // Add extra thread for writing
3531 method = &Benchmark::ReadWhileWriting;
3532 } else if (name == "readwhilemerging") {
3533 num_threads++; // Add extra thread for writing
3534 method = &Benchmark::ReadWhileMerging;
3535 } else if (name == "readwhilescanning") {
3536 num_threads++; // Add extra thread for scanning
3537 method = &Benchmark::ReadWhileScanning;
3538 } else if (name == "readrandomwriterandom") {
3539 method = &Benchmark::ReadRandomWriteRandom;
3540 } else if (name == "readrandommergerandom") {
3541 if (FLAGS_merge_operator.empty()) {
3542 fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
3543 name.c_str());
3544 ErrorExit();
3545 }
3546 method = &Benchmark::ReadRandomMergeRandom;
3547 } else if (name == "updaterandom") {
3548 method = &Benchmark::UpdateRandom;
3549 } else if (name == "xorupdaterandom") {
3550 method = &Benchmark::XORUpdateRandom;
3551 } else if (name == "appendrandom") {
3552 method = &Benchmark::AppendRandom;
3553 } else if (name == "mergerandom") {
3554 if (FLAGS_merge_operator.empty()) {
3555 fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
3556 name.c_str());
3557 exit(1);
3558 }
3559 method = &Benchmark::MergeRandom;
3560 } else if (name == "randomwithverify") {
3561 method = &Benchmark::RandomWithVerify;
3562 } else if (name == "fillseekseq") {
3563 method = &Benchmark::WriteSeqSeekSeq;
3564 } else if (name == "compact") {
3565 method = &Benchmark::Compact;
3566 } else if (name == "compactall") {
3567 CompactAll();
3568 #ifndef ROCKSDB_LITE
3569 } else if (name == "compact0") {
3570 CompactLevel(0);
3571 } else if (name == "compact1") {
3572 CompactLevel(1);
3573 } else if (name == "waitforcompaction") {
3574 WaitForCompaction();
3575 #endif
3576 } else if (name == "flush") {
3577 Flush();
3578 } else if (name == "crc32c") {
3579 method = &Benchmark::Crc32c;
3580 } else if (name == "xxhash") {
3581 method = &Benchmark::xxHash;
3582 } else if (name == "xxhash64") {
3583 method = &Benchmark::xxHash64;
3584 } else if (name == "xxh3") {
3585 method = &Benchmark::xxh3;
3586 } else if (name == "acquireload") {
3587 method = &Benchmark::AcquireLoad;
3588 } else if (name == "compress") {
3589 method = &Benchmark::Compress;
3590 } else if (name == "uncompress") {
3591 method = &Benchmark::Uncompress;
3592 #ifndef ROCKSDB_LITE
3593 } else if (name == "randomtransaction") {
3594 method = &Benchmark::RandomTransaction;
3595 post_process_method = &Benchmark::RandomTransactionVerify;
3596 #endif // ROCKSDB_LITE
3597 } else if (name == "randomreplacekeys") {
3598 fresh_db = true;
3599 method = &Benchmark::RandomReplaceKeys;
3600 } else if (name == "timeseries") {
3601 timestamp_emulator_.reset(new TimestampEmulator());
3602 if (FLAGS_expire_style == "compaction_filter") {
3603 filter.reset(new ExpiredTimeFilter(timestamp_emulator_));
3604 fprintf(stdout, "Compaction filter is used to remove expired data");
3605 open_options_.compaction_filter = filter.get();
3606 }
3607 fresh_db = true;
3608 method = &Benchmark::TimeSeries;
3609 } else if (name == "block_cache_entry_stats") {
3610 // DB::Properties::kBlockCacheEntryStats
3611 PrintStats("rocksdb.block-cache-entry-stats");
3612 } else if (name == "stats") {
3613 PrintStats("rocksdb.stats");
3614 } else if (name == "resetstats") {
3615 ResetStats();
3616 } else if (name == "verify") {
3617 VerifyDBFromDB(FLAGS_truth_db);
3618 } else if (name == "levelstats") {
3619 PrintStats("rocksdb.levelstats");
3620 } else if (name == "memstats") {
3621 std::vector<std::string> keys{"rocksdb.num-immutable-mem-table",
3622 "rocksdb.cur-size-active-mem-table",
3623 "rocksdb.cur-size-all-mem-tables",
3624 "rocksdb.size-all-mem-tables",
3625 "rocksdb.num-entries-active-mem-table",
3626 "rocksdb.num-entries-imm-mem-tables"};
3627 PrintStats(keys);
3628 } else if (name == "sstables") {
3629 PrintStats("rocksdb.sstables");
3630 } else if (name == "stats_history") {
3631 PrintStatsHistory();
3632 #ifndef ROCKSDB_LITE
3633 } else if (name == "replay") {
3634 if (num_threads > 1) {
3635 fprintf(stderr, "Multi-threaded replay is not yet supported\n");
3636 ErrorExit();
3637 }
3638 if (FLAGS_trace_file == "") {
3639 fprintf(stderr, "Please set --trace_file to be replayed from\n");
3640 ErrorExit();
3641 }
3642 method = &Benchmark::Replay;
3643 #endif // ROCKSDB_LITE
3644 } else if (name == "getmergeoperands") {
3645 method = &Benchmark::GetMergeOperands;
3646 #ifndef ROCKSDB_LITE
3647 } else if (name == "verifychecksum") {
3648 method = &Benchmark::VerifyChecksum;
3649 } else if (name == "verifyfilechecksums") {
3650 method = &Benchmark::VerifyFileChecksums;
3651 #endif // ROCKSDB_LITE
3652 } else if (name == "readrandomoperands") {
3653 read_operands_ = true;
3654 method = &Benchmark::ReadRandom;
3655 #ifndef ROCKSDB_LITE
3656 } else if (name == "backup") {
3657 method = &Benchmark::Backup;
3658 } else if (name == "restore") {
3659 method = &Benchmark::Restore;
3660 #endif
3661 } else if (!name.empty()) { // No error message for empty name
3662 fprintf(stderr, "unknown benchmark '%s'\n", name.c_str());
3663 ErrorExit();
3664 }
3665
3666 if (fresh_db) {
3667 if (FLAGS_use_existing_db) {
3668 fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
3669 name.c_str());
3670 method = nullptr;
3671 } else {
3672 if (db_.db != nullptr) {
3673 db_.DeleteDBs();
3674 DestroyDB(FLAGS_db, open_options_);
3675 }
3676 Options options = open_options_;
3677 for (size_t i = 0; i < multi_dbs_.size(); i++) {
3678 delete multi_dbs_[i].db;
3679 if (!open_options_.wal_dir.empty()) {
3680 options.wal_dir = GetPathForMultiple(open_options_.wal_dir, i);
3681 }
3682 DestroyDB(GetPathForMultiple(FLAGS_db, i), options);
3683 }
3684 multi_dbs_.clear();
3685 }
3686 Open(&open_options_); // use open_options for the last accessed
3687 }
3688
3689 if (method != nullptr) {
3690 fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
3691
3692 #ifndef ROCKSDB_LITE
3693 if (name == "backup") {
3694 std::cout << "Backup path: [" << FLAGS_backup_dir << "]" << std::endl;
3695 } else if (name == "restore") {
3696 std::cout << "Backup path: [" << FLAGS_backup_dir << "]" << std::endl;
3697 std::cout << "Restore path: [" << FLAGS_restore_dir << "]"
3698 << std::endl;
3699 }
3700 // A trace_file option can be provided for both trace and replay
3701 // operations, but db_bench does not currently support tracing and
3702 // replaying at the same time. So, start tracing only when this is not
3703 // a replay.
3704 if (FLAGS_trace_file != "" && name != "replay") {
3705 std::unique_ptr<TraceWriter> trace_writer;
3706 Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
3707 FLAGS_trace_file, &trace_writer);
3708 if (!s.ok()) {
3709 fprintf(stderr, "Encountered an error starting a trace, %s\n",
3710 s.ToString().c_str());
3711 ErrorExit();
3712 }
3713 s = db_.db->StartTrace(trace_options_, std::move(trace_writer));
3714 if (!s.ok()) {
3715 fprintf(stderr, "Encountered an error starting a trace, %s\n",
3716 s.ToString().c_str());
3717 ErrorExit();
3718 }
3719 fprintf(stdout, "Tracing the workload to: [%s]\n",
3720 FLAGS_trace_file.c_str());
3721 }
3722 // Start block cache tracing.
3723 if (!FLAGS_block_cache_trace_file.empty()) {
3724 // Sanity checks.
3725 if (FLAGS_block_cache_trace_sampling_frequency <= 0) {
3726 fprintf(stderr,
3727 "Block cache trace sampling frequency must be higher than "
3728 "0.\n");
3729 ErrorExit();
3730 }
3731 if (FLAGS_block_cache_trace_max_trace_file_size_in_bytes <= 0) {
3732 fprintf(stderr,
3733 "The maximum file size for block cache tracing must be "
3734 "higher than 0.\n");
3735 ErrorExit();
3736 }
3737 block_cache_trace_options_.max_trace_file_size =
3738 FLAGS_block_cache_trace_max_trace_file_size_in_bytes;
3739 block_cache_trace_options_.sampling_frequency =
3740 FLAGS_block_cache_trace_sampling_frequency;
3741 std::unique_ptr<TraceWriter> block_cache_trace_writer;
3742 Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
3743 FLAGS_block_cache_trace_file,
3744 &block_cache_trace_writer);
3745 if (!s.ok()) {
3746 fprintf(stderr,
3747 "Encountered an error when creating trace writer, %s\n",
3748 s.ToString().c_str());
3749 ErrorExit();
3750 }
3751 s = db_.db->StartBlockCacheTrace(block_cache_trace_options_,
3752 std::move(block_cache_trace_writer));
3753 if (!s.ok()) {
3754 fprintf(
3755 stderr,
3756 "Encountered an error when starting block cache tracing, %s\n",
3757 s.ToString().c_str());
3758 ErrorExit();
3759 }
3760 fprintf(stdout, "Tracing block cache accesses to: [%s]\n",
3761 FLAGS_block_cache_trace_file.c_str());
3762 }
3763 #endif // ROCKSDB_LITE
3764
3765 if (num_warmup > 0) {
3766 printf("Warming up benchmark by running %d times\n", num_warmup);
3767 }
3768
3769 for (int i = 0; i < num_warmup; i++) {
3770 RunBenchmark(num_threads, name, method);
3771 }
3772
3773 if (num_repeat > 1) {
3774 printf("Running benchmark for %d times\n", num_repeat);
3775 }
3776
3777 CombinedStats combined_stats;
3778 for (int i = 0; i < num_repeat; i++) {
3779 Stats stats = RunBenchmark(num_threads, name, method);
3780 combined_stats.AddStats(stats);
3781 if (FLAGS_confidence_interval_only) {
3782 combined_stats.ReportWithConfidenceIntervals(name);
3783 } else {
3784 combined_stats.Report(name);
3785 }
3786 }
3787 if (num_repeat > 1) {
3788 combined_stats.ReportFinal(name);
3789 }
3790 }
3791 if (post_process_method != nullptr) {
3792 (this->*post_process_method)();
3793 }
3794 }
3795
3796 if (secondary_update_thread_) {
3797 secondary_update_stopped_.store(1, std::memory_order_relaxed);
3798 secondary_update_thread_->join();
3799 secondary_update_thread_.reset();
3800 }
3801
3802 #ifndef ROCKSDB_LITE
3803 if (name != "replay" && FLAGS_trace_file != "") {
3804 Status s = db_.db->EndTrace();
3805 if (!s.ok()) {
3806 fprintf(stderr, "Encountered an error ending the trace, %s\n",
3807 s.ToString().c_str());
3808 }
3809 }
3810 if (!FLAGS_block_cache_trace_file.empty()) {
3811 Status s = db_.db->EndBlockCacheTrace();
3812 if (!s.ok()) {
3813 fprintf(stderr,
3814 "Encountered an error ending the block cache tracing, %s\n",
3815 s.ToString().c_str());
3816 }
3817 }
3818 #endif // ROCKSDB_LITE
3819
3820 if (FLAGS_statistics) {
3821 fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
3822 }
3823 if (FLAGS_simcache_size >= 0) {
3824 fprintf(
3825 stdout, "SIMULATOR CACHE STATISTICS:\n%s\n",
3826 static_cast_with_check<SimCache>(cache_.get())->ToString().c_str());
3827 }
3828
3829 #ifndef ROCKSDB_LITE
3830 if (FLAGS_use_secondary_db) {
3831 fprintf(stdout, "Secondary instance updated %" PRIu64 " times.\n",
3832 secondary_db_updates_);
3833 }
3834 #endif // ROCKSDB_LITE
3835 }
3836
3837 private:
3838 std::shared_ptr<TimestampEmulator> timestamp_emulator_;
3839 std::unique_ptr<port::Thread> secondary_update_thread_;
3840 std::atomic<int> secondary_update_stopped_{0};
3841 #ifndef ROCKSDB_LITE
3842 uint64_t secondary_db_updates_ = 0;
3843 #endif // ROCKSDB_LITE
3844 struct ThreadArg {
3845 Benchmark* bm;
3846 SharedState* shared;
3847 ThreadState* thread;
3848 void (Benchmark::*method)(ThreadState*);
3849 };
3850
3851 static void ThreadBody(void* v) {
3852 ThreadArg* arg = reinterpret_cast<ThreadArg*>(v);
3853 SharedState* shared = arg->shared;
3854 ThreadState* thread = arg->thread;
3855 {
3856 MutexLock l(&shared->mu);
3857 shared->num_initialized++;
3858 if (shared->num_initialized >= shared->total) {
3859 shared->cv.SignalAll();
3860 }
3861 while (!shared->start) {
3862 shared->cv.Wait();
3863 }
3864 }
3865
3866 SetPerfLevel(static_cast<PerfLevel>(shared->perf_level));
3867 perf_context.EnablePerLevelPerfContext();
3868 thread->stats.Start(thread->tid);
3869 (arg->bm->*(arg->method))(thread);
3870 if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
3871 thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
3872 get_perf_context()->ToString());
3873 }
3874 thread->stats.Stop();
3875
3876 {
3877 MutexLock l(&shared->mu);
3878 shared->num_done++;
3879 if (shared->num_done >= shared->total) {
3880 shared->cv.SignalAll();
3881 }
3882 }
3883 }
3884
3885 Stats RunBenchmark(int n, Slice name,
3886 void (Benchmark::*method)(ThreadState*)) {
3887 SharedState shared;
3888 shared.total = n;
3889 shared.num_initialized = 0;
3890 shared.num_done = 0;
3891 shared.start = false;
3892 if (FLAGS_benchmark_write_rate_limit > 0) {
3893 shared.write_rate_limiter.reset(
3894 NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
3895 }
3896 if (FLAGS_benchmark_read_rate_limit > 0) {
3897 shared.read_rate_limiter.reset(NewGenericRateLimiter(
3898 FLAGS_benchmark_read_rate_limit, 100000 /* refill_period_us */,
3899 10 /* fairness */, RateLimiter::Mode::kReadsOnly));
3900 }
3901
3902 std::unique_ptr<ReporterAgent> reporter_agent;
3903 if (FLAGS_report_interval_seconds > 0) {
3904 reporter_agent.reset(new ReporterAgent(FLAGS_env, FLAGS_report_file,
3905 FLAGS_report_interval_seconds));
3906 }
3907
3908 ThreadArg* arg = new ThreadArg[n];
3909
3910 for (int i = 0; i < n; i++) {
3911 #ifdef NUMA
3912 if (FLAGS_enable_numa) {
3913 // Perform a local allocation of memory to threads in the NUMA node.
3914 int n_nodes = numa_num_task_nodes(); // Number of nodes in NUMA.
3915 numa_exit_on_error = 1;
3916 int numa_node = i % n_nodes;
3917 bitmask* nodes = numa_allocate_nodemask();
3918 numa_bitmask_clearall(nodes);
3919 numa_bitmask_setbit(nodes, numa_node);
3920 // The numa_bind() call binds the process to the node, and these
3921 // properties are passed on to the thread created by the
3922 // StartThread method called later in the loop.
3923 numa_bind(nodes);
3924 numa_set_strict(1);
3925 numa_free_nodemask(nodes);
3926 }
3927 #endif
3928 arg[i].bm = this;
3929 arg[i].method = method;
3930 arg[i].shared = &shared;
3931 total_thread_count_++;
3932 arg[i].thread = new ThreadState(i, total_thread_count_);
3933 arg[i].thread->stats.SetReporterAgent(reporter_agent.get());
3934 arg[i].thread->shared = &shared;
3935 FLAGS_env->StartThread(ThreadBody, &arg[i]);
3936 }
3937
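// Wait until every worker thread has checked in, then release them all at
// once and block until they have finished.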
3938 shared.mu.Lock();
3939 while (shared.num_initialized < n) {
3940 shared.cv.Wait();
3941 }
3942
3943 shared.start = true;
3944 shared.cv.SignalAll();
3945 while (shared.num_done < n) {
3946 shared.cv.Wait();
3947 }
3948 shared.mu.Unlock();
3949
3950 // Stats for some threads can be excluded.
3951 Stats merge_stats;
3952 for (int i = 0; i < n; i++) {
3953 merge_stats.Merge(arg[i].thread->stats);
3954 }
3955 merge_stats.Report(name);
3956
3957 for (int i = 0; i < n; i++) {
3958 delete arg[i].thread;
3959 }
3960 delete[] arg;
3961
3962 return merge_stats;
3963 }
3964
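// Drives the checksum/hash benchmarks: applies fn to a --block_size buffer
// of 'x' bytes repeatedly until roughly 5 GB have been processed, counting
// one op per call and accumulating the result into val.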
3965 template <OperationType kOpType, typename FnType, typename... Args>
3966 static inline void ChecksumBenchmark(FnType fn, ThreadState* thread,
3967 Args... args) {
3968 const int size = FLAGS_block_size; // use --block_size option for db_bench
3969 std::string labels = "(" + std::to_string(FLAGS_block_size) + " per op)";
3970 const char* label = labels.c_str();
3971
3972 std::string data(size, 'x');
3973 uint64_t bytes = 0;
3974 uint32_t val = 0;
3975 while (bytes < 5000U * uint64_t{1048576}) { // ~5GB
3976 val += static_cast<uint32_t>(fn(data.data(), size, args...));
3977 thread->stats.FinishedOps(nullptr, nullptr, 1, kOpType);
3978 bytes += size;
3979 }
3980 // Print the result so the computation is not optimized away as dead code
3981 fprintf(stderr, "... val=0x%x\r", static_cast<unsigned int>(val));
3982
3983 thread->stats.AddBytes(bytes);
3984 thread->stats.AddMessage(label);
3985 }
3986
3987 void Crc32c(ThreadState* thread) {
3988 ChecksumBenchmark<kCrc>(crc32c::Value, thread);
3989 }
3990
3991 void xxHash(ThreadState* thread) {
3992 ChecksumBenchmark<kHash>(XXH32, thread, /*seed*/ 0);
3993 }
3994
3995 void xxHash64(ThreadState* thread) {
3996 ChecksumBenchmark<kHash>(XXH64, thread, /*seed*/ 0);
3997 }
3998
3999 void xxh3(ThreadState* thread) {
4000 ChecksumBenchmark<kHash>(XXH3_64bits, thread);
4001 }
4002
4003 void AcquireLoad(ThreadState* thread) {
4004 int dummy;
4005 std::atomic<void*> ap(&dummy);
4006 int count = 0;
4007 void* ptr = nullptr;
4008 thread->stats.AddMessage("(each op is 1000 loads)");
4009 while (count < 100000) {
4010 for (int i = 0; i < 1000; i++) {
4011 ptr = ap.load(std::memory_order_acquire);
4012 }
4013 count++;
4014 thread->stats.FinishedOps(nullptr, nullptr, 1, kOthers);
4015 }
4016 if (ptr == nullptr) exit(1); // Disable unused variable warning.
4017 }
4018
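// Compresses a single --block_size input generated by RandomGenerator over
// and over until about 1 GB of input has been consumed, then reports the
// compressed-to-uncompressed size ratio as a percentage.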
4019 void Compress(ThreadState* thread) {
4020 RandomGenerator gen;
4021 Slice input = gen.Generate(FLAGS_block_size);
4022 int64_t bytes = 0;
4023 int64_t produced = 0;
4024 bool ok = true;
4025 std::string compressed;
4026 CompressionOptions opts;
4027 CompressionContext context(FLAGS_compression_type_e);
4028 CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
4029 FLAGS_compression_type_e,
4030 FLAGS_sample_for_compression);
4031 // Compress 1G
4032 while (ok && bytes < int64_t(1) << 30) {
4033 compressed.clear();
4034 ok = CompressSlice(info, input, &compressed);
4035 produced += compressed.size();
4036 bytes += input.size();
4037 thread->stats.FinishedOps(nullptr, nullptr, 1, kCompress);
4038 }
4039
4040 if (!ok) {
4041 thread->stats.AddMessage("(compression failure)");
4042 } else {
4043 char buf[340];
4044 snprintf(buf, sizeof(buf), "(output: %.1f%%)",
4045 (produced * 100.0) / bytes);
4046 thread->stats.AddMessage(buf);
4047 thread->stats.AddBytes(bytes);
4048 }
4049 }
4050
4051 void Uncompress(ThreadState* thread) {
4052 RandomGenerator gen;
4053 Slice input = gen.Generate(FLAGS_block_size);
4054 std::string compressed;
4055
4056 CompressionContext compression_ctx(FLAGS_compression_type_e);
4057 CompressionOptions compression_opts;
4058 CompressionInfo compression_info(
4059 compression_opts, compression_ctx, CompressionDict::GetEmptyDict(),
4060 FLAGS_compression_type_e, FLAGS_sample_for_compression);
4061 UncompressionContext uncompression_ctx(FLAGS_compression_type_e);
4062 UncompressionInfo uncompression_info(uncompression_ctx,
4063 UncompressionDict::GetEmptyDict(),
4064 FLAGS_compression_type_e);
4065
4066 bool ok = CompressSlice(compression_info, input, &compressed);
4067 int64_t bytes = 0;
4068 size_t uncompressed_size = 0;
4069 while (ok && bytes < 1024 * 1048576) {
4070 constexpr uint32_t compress_format_version = 2;
4071
4072 CacheAllocationPtr uncompressed = UncompressData(
4073 uncompression_info, compressed.data(), compressed.size(),
4074 &uncompressed_size, compress_format_version);
4075
4076 ok = uncompressed.get() != nullptr;
4077 bytes += input.size();
4078 thread->stats.FinishedOps(nullptr, nullptr, 1, kUncompress);
4079 }
4080
4081 if (!ok) {
4082 thread->stats.AddMessage("(compression failure)");
4083 } else {
4084 thread->stats.AddBytes(bytes);
4085 }
4086 }
4087
4088 // Returns true if the options are initialized from the specified
4089 // options file.
4090 bool InitializeOptionsFromFile(Options* opts) {
4091 #ifndef ROCKSDB_LITE
4092 printf("Initializing RocksDB Options from the specified file\n");
4093 DBOptions db_opts;
4094 std::vector<ColumnFamilyDescriptor> cf_descs;
4095 if (FLAGS_options_file != "") {
4096 auto s = LoadOptionsFromFile(FLAGS_options_file, FLAGS_env, &db_opts,
4097 &cf_descs);
4098 db_opts.env = FLAGS_env;
4099 if (s.ok()) {
4100 *opts = Options(db_opts, cf_descs[0].options);
4101 return true;
4102 }
4103 fprintf(stderr, "Unable to load options file %s --- %s\n",
4104 FLAGS_options_file.c_str(), s.ToString().c_str());
4105 exit(1);
4106 }
4107 #else
4108 (void)opts;
4109 #endif
4110 return false;
4111 }
4112
4113 void InitializeOptionsFromFlags(Options* opts) {
4114 printf("Initializing RocksDB Options from command-line flags\n");
4115 Options& options = *opts;
4116 ConfigOptions config_options(options);
4117 config_options.ignore_unsupported_options = false;
4118
4119 assert(db_.db == nullptr);
4120
4121 options.env = FLAGS_env;
4122 options.wal_dir = FLAGS_wal_dir;
4123 options.dump_malloc_stats = FLAGS_dump_malloc_stats;
4124 options.stats_dump_period_sec =
4125 static_cast<unsigned int>(FLAGS_stats_dump_period_sec);
4126 options.stats_persist_period_sec =
4127 static_cast<unsigned int>(FLAGS_stats_persist_period_sec);
4128 options.persist_stats_to_disk = FLAGS_persist_stats_to_disk;
4129 options.stats_history_buffer_size =
4130 static_cast<size_t>(FLAGS_stats_history_buffer_size);
4131 options.avoid_flush_during_recovery = FLAGS_avoid_flush_during_recovery;
4132
4133 options.compression_opts.level = FLAGS_compression_level;
4134 options.compression_opts.max_dict_bytes = FLAGS_compression_max_dict_bytes;
4135 options.compression_opts.zstd_max_train_bytes =
4136 FLAGS_compression_zstd_max_train_bytes;
4137 options.compression_opts.parallel_threads =
4138 FLAGS_compression_parallel_threads;
4139 options.compression_opts.max_dict_buffer_bytes =
4140 FLAGS_compression_max_dict_buffer_bytes;
4141 options.compression_opts.use_zstd_dict_trainer =
4142 FLAGS_compression_use_zstd_dict_trainer;
4143
4144 options.max_open_files = FLAGS_open_files;
4145 if (FLAGS_cost_write_buffer_to_cache || FLAGS_db_write_buffer_size != 0) {
4146 options.write_buffer_manager.reset(
4147 new WriteBufferManager(FLAGS_db_write_buffer_size, cache_));
4148 }
4149 options.arena_block_size = FLAGS_arena_block_size;
4150 options.write_buffer_size = FLAGS_write_buffer_size;
4151 options.max_write_buffer_number = FLAGS_max_write_buffer_number;
4152 options.min_write_buffer_number_to_merge =
4153 FLAGS_min_write_buffer_number_to_merge;
4154 options.max_write_buffer_number_to_maintain =
4155 FLAGS_max_write_buffer_number_to_maintain;
4156 options.max_write_buffer_size_to_maintain =
4157 FLAGS_max_write_buffer_size_to_maintain;
4158 options.max_background_jobs = FLAGS_max_background_jobs;
4159 options.max_background_compactions = FLAGS_max_background_compactions;
4160 options.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
4161 options.max_background_flushes = FLAGS_max_background_flushes;
4162 options.compaction_style = FLAGS_compaction_style_e;
4163 options.compaction_pri = FLAGS_compaction_pri_e;
4164 options.allow_mmap_reads = FLAGS_mmap_read;
4165 options.allow_mmap_writes = FLAGS_mmap_write;
4166 options.use_direct_reads = FLAGS_use_direct_reads;
4167 options.use_direct_io_for_flush_and_compaction =
4168 FLAGS_use_direct_io_for_flush_and_compaction;
4169 options.manual_wal_flush = FLAGS_manual_wal_flush;
4170 options.wal_compression = FLAGS_wal_compression_e;
4171 #ifndef ROCKSDB_LITE
4172 options.ttl = FLAGS_fifo_compaction_ttl;
4173 options.compaction_options_fifo = CompactionOptionsFIFO(
4174 FLAGS_fifo_compaction_max_table_files_size_mb * 1024 * 1024,
4175 FLAGS_fifo_compaction_allow_compaction);
4176 options.compaction_options_fifo.age_for_warm = FLAGS_fifo_age_for_warm;
4177 #endif // ROCKSDB_LITE
4178 options.prefix_extractor = prefix_extractor_;
4179 if (FLAGS_use_uint64_comparator) {
4180 options.comparator = test::Uint64Comparator();
4181 if (FLAGS_key_size != 8) {
4182 fprintf(stderr, "Using Uint64 comparator but key size is not 8.\n");
4183 exit(1);
4184 }
4185 }
4186 if (FLAGS_use_stderr_info_logger) {
4187 options.info_log.reset(new StderrLogger());
4188 }
4189 options.memtable_huge_page_size = FLAGS_memtable_use_huge_page ? 2048 : 0;
4190 options.memtable_prefix_bloom_size_ratio = FLAGS_memtable_bloom_size_ratio;
4191 options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering;
4192 if (FLAGS_memtable_insert_with_hint_prefix_size > 0) {
4193 options.memtable_insert_with_hint_prefix_extractor.reset(
4194 NewCappedPrefixTransform(
4195 FLAGS_memtable_insert_with_hint_prefix_size));
4196 }
4197 options.bloom_locality = FLAGS_bloom_locality;
4198 options.max_file_opening_threads = FLAGS_file_opening_threads;
4199 options.compaction_readahead_size = FLAGS_compaction_readahead_size;
4200 options.log_readahead_size = FLAGS_log_readahead_size;
4201 options.random_access_max_buffer_size = FLAGS_random_access_max_buffer_size;
4202 options.writable_file_max_buffer_size = FLAGS_writable_file_max_buffer_size;
4203 options.use_fsync = FLAGS_use_fsync;
4204 options.num_levels = FLAGS_num_levels;
4205 options.target_file_size_base = FLAGS_target_file_size_base;
4206 options.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
4207 options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
4208 options.level_compaction_dynamic_level_bytes =
4209 FLAGS_level_compaction_dynamic_level_bytes;
4210 options.max_bytes_for_level_multiplier =
4211 FLAGS_max_bytes_for_level_multiplier;
4212 Status s =
4213 CreateMemTableRepFactory(config_options, &options.memtable_factory);
4214 if (!s.ok()) {
4215 fprintf(stderr, "Could not create memtable factory: %s\n",
4216 s.ToString().c_str());
4217 exit(1);
4218 } else if ((FLAGS_prefix_size == 0) &&
4219 (options.memtable_factory->IsInstanceOf("prefix_hash") ||
4220 options.memtable_factory->IsInstanceOf("hash_linkedlist"))) {
4221 fprintf(stderr,
4222 "prefix_size should be non-zero if PrefixHash or "
4223 "HashLinkedList memtablerep is used\n");
4224 exit(1);
4225 }
4226 if (FLAGS_use_plain_table) {
4227 #ifndef ROCKSDB_LITE
4228 if (!options.memtable_factory->IsInstanceOf("prefix_hash") &&
4229 !options.memtable_factory->IsInstanceOf("hash_linkedlist")) {
4230 fprintf(stderr, "Warning: plain table is used with %s\n",
4231 options.memtable_factory->Name());
4232 }
4233
4234 int bloom_bits_per_key = FLAGS_bloom_bits;
4235 if (bloom_bits_per_key < 0) {
4236 bloom_bits_per_key = PlainTableOptions().bloom_bits_per_key;
4237 }
4238
4239 PlainTableOptions plain_table_options;
4240 plain_table_options.user_key_len = FLAGS_key_size;
4241 plain_table_options.bloom_bits_per_key = bloom_bits_per_key;
4242 plain_table_options.hash_table_ratio = 0.75;
4243 options.table_factory = std::shared_ptr<TableFactory>(
4244 NewPlainTableFactory(plain_table_options));
4245 #else
4246 fprintf(stderr, "Plain table is not supported in lite mode\n");
4247 exit(1);
4248 #endif // ROCKSDB_LITE
4249 } else if (FLAGS_use_cuckoo_table) {
4250 #ifndef ROCKSDB_LITE
4251 if (FLAGS_cuckoo_hash_ratio > 1 || FLAGS_cuckoo_hash_ratio < 0) {
4252 fprintf(stderr, "Invalid cuckoo_hash_ratio\n");
4253 exit(1);
4254 }
4255
4256 if (!FLAGS_mmap_read) {
4257 fprintf(stderr, "cuckoo table format requires mmap read to operate\n");
4258 exit(1);
4259 }
4260
4261 ROCKSDB_NAMESPACE::CuckooTableOptions table_options;
4262 table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio;
4263 table_options.identity_as_first_hash = FLAGS_identity_as_first_hash;
4264 options.table_factory =
4265 std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
4266 #else
4267 fprintf(stderr, "Cuckoo table is not supported in lite mode\n");
4268 exit(1);
4269 #endif // ROCKSDB_LITE
4270 } else {
4271 BlockBasedTableOptions block_based_options;
4272 block_based_options.checksum =
4273 static_cast<ChecksumType>(FLAGS_checksum_type);
4274 if (FLAGS_use_hash_search) {
4275 if (FLAGS_prefix_size == 0) {
4276 fprintf(stderr,
4277 "prefix_size not assigned when enable use_hash_search \n");
4278 exit(1);
4279 }
4280 block_based_options.index_type = BlockBasedTableOptions::kHashSearch;
4281 } else {
4282 block_based_options.index_type = BlockBasedTableOptions::kBinarySearch;
4283 }
4284 if (FLAGS_partition_index_and_filters || FLAGS_partition_index) {
4285 if (FLAGS_index_with_first_key) {
4286 fprintf(stderr,
4287 "--index_with_first_key is not compatible with"
4288 " partition index.");
4289 }
4290 if (FLAGS_use_hash_search) {
4291 fprintf(stderr,
4292 "use_hash_search is incompatible with "
4293 "partition index and is ignored");
4294 }
4295 block_based_options.index_type =
4296 BlockBasedTableOptions::kTwoLevelIndexSearch;
4297 block_based_options.metadata_block_size = FLAGS_metadata_block_size;
4298 if (FLAGS_partition_index_and_filters) {
4299 block_based_options.partition_filters = true;
4300 }
4301 } else if (FLAGS_index_with_first_key) {
4302 block_based_options.index_type =
4303 BlockBasedTableOptions::kBinarySearchWithFirstKey;
4304 }
4305 BlockBasedTableOptions::IndexShorteningMode index_shortening =
4306 block_based_options.index_shortening;
4307 switch (FLAGS_index_shortening_mode) {
4308 case 0:
4309 index_shortening =
4310 BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
4311 break;
4312 case 1:
4313 index_shortening =
4314 BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators;
4315 break;
4316 case 2:
4317 index_shortening = BlockBasedTableOptions::IndexShorteningMode::
4318 kShortenSeparatorsAndSuccessor;
4319 break;
4320 default:
4321 fprintf(stderr, "Unknown key shortening mode\n");
4322 }
4323 block_based_options.optimize_filters_for_memory =
4324 FLAGS_optimize_filters_for_memory;
4325 block_based_options.index_shortening = index_shortening;
4326 if (cache_ == nullptr) {
4327 block_based_options.no_block_cache = true;
4328 }
4329 block_based_options.cache_index_and_filter_blocks =
4330 FLAGS_cache_index_and_filter_blocks;
4331 block_based_options.pin_l0_filter_and_index_blocks_in_cache =
4332 FLAGS_pin_l0_filter_and_index_blocks_in_cache;
4333 block_based_options.pin_top_level_index_and_filter =
4334 FLAGS_pin_top_level_index_and_filter;
4335 if (FLAGS_cache_high_pri_pool_ratio > 1e-6) { // > 0.0 + eps
4336 block_based_options.cache_index_and_filter_blocks_with_high_priority =
4337 true;
4338 }
4339 if (FLAGS_cache_high_pri_pool_ratio + FLAGS_cache_low_pri_pool_ratio >
4340 1.0) {
4341 fprintf(stderr,
4342 "Sum of high_pri_pool_ratio and low_pri_pool_ratio "
4343 "cannot exceed 1.0.\n");
4344 }
4345 block_based_options.block_cache = cache_;
4346 block_based_options.cache_usage_options.options_overrides.insert(
4347 {CacheEntryRole::kCompressionDictionaryBuildingBuffer,
4348 {/*.charged = */ FLAGS_charge_compression_dictionary_building_buffer
4349 ? CacheEntryRoleOptions::Decision::kEnabled
4350 : CacheEntryRoleOptions::Decision::kDisabled}});
4351 block_based_options.cache_usage_options.options_overrides.insert(
4352 {CacheEntryRole::kFilterConstruction,
4353 {/*.charged = */ FLAGS_charge_filter_construction
4354 ? CacheEntryRoleOptions::Decision::kEnabled
4355 : CacheEntryRoleOptions::Decision::kDisabled}});
4356 block_based_options.cache_usage_options.options_overrides.insert(
4357 {CacheEntryRole::kBlockBasedTableReader,
4358 {/*.charged = */ FLAGS_charge_table_reader
4359 ? CacheEntryRoleOptions::Decision::kEnabled
4360 : CacheEntryRoleOptions::Decision::kDisabled}});
4361 block_based_options.cache_usage_options.options_overrides.insert(
4362 {CacheEntryRole::kFileMetadata,
4363 {/*.charged = */ FLAGS_charge_file_metadata
4364 ? CacheEntryRoleOptions::Decision::kEnabled
4365 : CacheEntryRoleOptions::Decision::kDisabled}});
4366 block_based_options.cache_usage_options.options_overrides.insert(
4367 {CacheEntryRole::kBlobCache,
4368 {/*.charged = */ FLAGS_charge_blob_cache
4369 ? CacheEntryRoleOptions::Decision::kEnabled
4370 : CacheEntryRoleOptions::Decision::kDisabled}});
4371 block_based_options.block_cache_compressed = compressed_cache_;
4372 block_based_options.block_size = FLAGS_block_size;
4373 block_based_options.block_restart_interval = FLAGS_block_restart_interval;
4374 block_based_options.index_block_restart_interval =
4375 FLAGS_index_block_restart_interval;
4376 block_based_options.format_version =
4377 static_cast<uint32_t>(FLAGS_format_version);
4378 block_based_options.read_amp_bytes_per_bit = FLAGS_read_amp_bytes_per_bit;
4379 block_based_options.enable_index_compression =
4380 FLAGS_enable_index_compression;
4381 block_based_options.block_align = FLAGS_block_align;
4382 block_based_options.whole_key_filtering = FLAGS_whole_key_filtering;
4383 block_based_options.max_auto_readahead_size =
4384 FLAGS_max_auto_readahead_size;
4385 block_based_options.initial_auto_readahead_size =
4386 FLAGS_initial_auto_readahead_size;
4387 block_based_options.num_file_reads_for_auto_readahead =
4388 FLAGS_num_file_reads_for_auto_readahead;
4389 BlockBasedTableOptions::PrepopulateBlockCache prepopulate_block_cache =
4390 block_based_options.prepopulate_block_cache;
4391 switch (FLAGS_prepopulate_block_cache) {
4392 case 0:
4393 prepopulate_block_cache =
4394 BlockBasedTableOptions::PrepopulateBlockCache::kDisable;
4395 break;
4396 case 1:
4397 prepopulate_block_cache =
4398 BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
4399 break;
4400 default:
4401 fprintf(stderr, "Unknown prepopulate block cache mode\n");
4402 }
4403 block_based_options.prepopulate_block_cache = prepopulate_block_cache;
4404 if (FLAGS_use_data_block_hash_index) {
4405 block_based_options.data_block_index_type =
4406 ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinaryAndHash;
4407 } else {
4408 block_based_options.data_block_index_type =
4409 ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinarySearch;
4410 }
4411 block_based_options.data_block_hash_table_util_ratio =
4412 FLAGS_data_block_hash_table_util_ratio;
4413 if (FLAGS_read_cache_path != "") {
4414 #ifndef ROCKSDB_LITE
4415 Status rc_status;
4416
4417 // The read cache needs to be provided with a Logger; we will put all
4418 // read cache logs in the read cache path in a file named rc_LOG
4419 rc_status = FLAGS_env->CreateDirIfMissing(FLAGS_read_cache_path);
4420 std::shared_ptr<Logger> read_cache_logger;
4421 if (rc_status.ok()) {
4422 rc_status = FLAGS_env->NewLogger(FLAGS_read_cache_path + "/rc_LOG",
4423 &read_cache_logger);
4424 }
4425
4426 if (rc_status.ok()) {
4427 PersistentCacheConfig rc_cfg(FLAGS_env, FLAGS_read_cache_path,
4428 FLAGS_read_cache_size,
4429 read_cache_logger);
4430
4431 rc_cfg.enable_direct_reads = FLAGS_read_cache_direct_read;
4432 rc_cfg.enable_direct_writes = FLAGS_read_cache_direct_write;
4433 rc_cfg.writer_qdepth = 4;
4434 rc_cfg.writer_dispatch_size = 4 * 1024;
4435
4436 auto pcache = std::make_shared<BlockCacheTier>(rc_cfg);
4437 block_based_options.persistent_cache = pcache;
4438 rc_status = pcache->Open();
4439 }
4440
4441 if (!rc_status.ok()) {
4442 fprintf(stderr, "Error initializing read cache, %s\n",
4443 rc_status.ToString().c_str());
4444 exit(1);
4445 }
4446 #else
4447 fprintf(stderr, "Read cache is not supported in LITE\n");
4448 exit(1);
4449
4450 #endif
4451 }
4452
4453 if (FLAGS_use_blob_cache) {
4454 if (FLAGS_use_shared_block_and_blob_cache) {
4455 options.blob_cache = cache_;
4456 } else {
4457 if (FLAGS_blob_cache_size > 0) {
4458 LRUCacheOptions co;
4459 co.capacity = FLAGS_blob_cache_size;
4460 co.num_shard_bits = FLAGS_blob_cache_numshardbits;
4461 co.memory_allocator = GetCacheAllocator();
4462
4463 options.blob_cache = NewLRUCache(co);
4464 } else {
4465 fprintf(
4466 stderr,
4467 "Unable to create a standalone blob cache if blob_cache_size "
4468 "<= 0.\n");
4469 exit(1);
4470 }
4471 }
4472 switch (FLAGS_prepopulate_blob_cache) {
4473 case 0:
4474 options.prepopulate_blob_cache = PrepopulateBlobCache::kDisable;
4475 break;
4476 case 1:
4477 options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
4478 break;
4479 default:
4480 fprintf(stderr, "Unknown prepopulate blob cache mode\n");
4481 exit(1);
4482 }
4483
4484 fprintf(stdout,
4485 "Integrated BlobDB: blob cache enabled"
4486 ", block and blob caches shared: %d",
4487 FLAGS_use_shared_block_and_blob_cache);
4488 if (!FLAGS_use_shared_block_and_blob_cache) {
4489 fprintf(stdout,
4490 ", blob cache size %" PRIu64
4491 ", blob cache num shard bits: %d",
4492 FLAGS_blob_cache_size, FLAGS_blob_cache_numshardbits);
4493 }
4494 fprintf(stdout, ", blob cache prepopulated: %d\n",
4495 FLAGS_prepopulate_blob_cache);
4496 } else {
4497 fprintf(stdout, "Integrated BlobDB: blob cache disabled\n");
4498 }
4499
4500 options.table_factory.reset(
4501 NewBlockBasedTableFactory(block_based_options));
4502 }
4503 if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) {
4504 if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() !=
4505 static_cast<unsigned int>(FLAGS_num_levels)) {
4506 fprintf(stderr, "Insufficient number of fanouts specified %d\n",
4507 static_cast<int>(
4508 FLAGS_max_bytes_for_level_multiplier_additional_v.size()));
4509 exit(1);
4510 }
4511 options.max_bytes_for_level_multiplier_additional =
4512 FLAGS_max_bytes_for_level_multiplier_additional_v;
4513 }
4514 options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
4515 options.level0_file_num_compaction_trigger =
4516 FLAGS_level0_file_num_compaction_trigger;
4517 options.level0_slowdown_writes_trigger =
4518 FLAGS_level0_slowdown_writes_trigger;
4519 options.compression = FLAGS_compression_type_e;
4520 if (FLAGS_simulate_hybrid_fs_file != "") {
4521 options.bottommost_temperature = Temperature::kWarm;
4522 }
4523 options.preclude_last_level_data_seconds =
4524 FLAGS_preclude_last_level_data_seconds;
4525 options.preserve_internal_time_seconds =
4526 FLAGS_preserve_internal_time_seconds;
4527 options.sample_for_compression = FLAGS_sample_for_compression;
4528 options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
4529 options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
4530 options.max_total_wal_size = FLAGS_max_total_wal_size;
4531
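// When --min_level_to_compress is non-negative, levels below it are left
// uncompressed and all remaining levels use the configured compression
// type.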
4532 if (FLAGS_min_level_to_compress >= 0) {
4533 assert(FLAGS_min_level_to_compress <= FLAGS_num_levels);
4534 options.compression_per_level.resize(FLAGS_num_levels);
4535 for (int i = 0; i < FLAGS_min_level_to_compress; i++) {
4536 options.compression_per_level[i] = kNoCompression;
4537 }
4538 for (int i = FLAGS_min_level_to_compress; i < FLAGS_num_levels; i++) {
4539 options.compression_per_level[i] = FLAGS_compression_type_e;
4540 }
4541 }
4542 options.soft_pending_compaction_bytes_limit =
4543 FLAGS_soft_pending_compaction_bytes_limit;
4544 options.hard_pending_compaction_bytes_limit =
4545 FLAGS_hard_pending_compaction_bytes_limit;
4546 options.delayed_write_rate = FLAGS_delayed_write_rate;
4547 options.allow_concurrent_memtable_write =
4548 FLAGS_allow_concurrent_memtable_write;
4549 options.experimental_mempurge_threshold =
4550 FLAGS_experimental_mempurge_threshold;
4551 options.inplace_update_support = FLAGS_inplace_update_support;
4552 options.inplace_update_num_locks = FLAGS_inplace_update_num_locks;
4553 options.enable_write_thread_adaptive_yield =
4554 FLAGS_enable_write_thread_adaptive_yield;
4555 options.enable_pipelined_write = FLAGS_enable_pipelined_write;
4556 options.unordered_write = FLAGS_unordered_write;
4557 options.write_thread_max_yield_usec = FLAGS_write_thread_max_yield_usec;
4558 options.write_thread_slow_yield_usec = FLAGS_write_thread_slow_yield_usec;
4559 options.table_cache_numshardbits = FLAGS_table_cache_numshardbits;
4560 options.max_compaction_bytes = FLAGS_max_compaction_bytes;
4561 options.disable_auto_compactions = FLAGS_disable_auto_compactions;
4562 options.optimize_filters_for_hits = FLAGS_optimize_filters_for_hits;
4563 options.paranoid_checks = FLAGS_paranoid_checks;
4564 options.force_consistency_checks = FLAGS_force_consistency_checks;
4565 options.check_flush_compaction_key_order =
4566 FLAGS_check_flush_compaction_key_order;
4567 options.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds;
4568 options.ttl = FLAGS_ttl_seconds;
4569 // fill storage options
4570 options.advise_random_on_open = FLAGS_advise_random_on_open;
4571 options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e;
4572 options.use_adaptive_mutex = FLAGS_use_adaptive_mutex;
4573 options.bytes_per_sync = FLAGS_bytes_per_sync;
4574 options.wal_bytes_per_sync = FLAGS_wal_bytes_per_sync;
4575
4576 // merge operator options
4577 if (!FLAGS_merge_operator.empty()) {
4578 s = MergeOperator::CreateFromString(config_options, FLAGS_merge_operator,
4579 &options.merge_operator);
4580 if (!s.ok()) {
4581 fprintf(stderr, "invalid merge operator[%s]: %s\n",
4582 FLAGS_merge_operator.c_str(), s.ToString().c_str());
4583 exit(1);
4584 }
4585 }
4586 options.max_successive_merges = FLAGS_max_successive_merges;
4587 options.report_bg_io_stats = FLAGS_report_bg_io_stats;
4588
4589 // set universal style compaction configurations, if applicable
4590 if (FLAGS_universal_size_ratio != 0) {
4591 options.compaction_options_universal.size_ratio =
4592 FLAGS_universal_size_ratio;
4593 }
4594 if (FLAGS_universal_min_merge_width != 0) {
4595 options.compaction_options_universal.min_merge_width =
4596 FLAGS_universal_min_merge_width;
4597 }
4598 if (FLAGS_universal_max_merge_width != 0) {
4599 options.compaction_options_universal.max_merge_width =
4600 FLAGS_universal_max_merge_width;
4601 }
4602 if (FLAGS_universal_max_size_amplification_percent != 0) {
4603 options.compaction_options_universal.max_size_amplification_percent =
4604 FLAGS_universal_max_size_amplification_percent;
4605 }
4606 if (FLAGS_universal_compression_size_percent != -1) {
4607 options.compaction_options_universal.compression_size_percent =
4608 FLAGS_universal_compression_size_percent;
4609 }
4610 options.compaction_options_universal.allow_trivial_move =
4611 FLAGS_universal_allow_trivial_move;
4612 options.compaction_options_universal.incremental =
4613 FLAGS_universal_incremental;
4614 if (FLAGS_thread_status_per_interval > 0) {
4615 options.enable_thread_tracking = true;
4616 }
4617
4618 if (FLAGS_user_timestamp_size > 0) {
4619 if (FLAGS_user_timestamp_size != 8) {
4620 fprintf(stderr, "Only 64 bits timestamps are supported.\n");
4621 exit(1);
4622 }
4623 options.comparator = test::BytewiseComparatorWithU64TsWrapper();
4624 }
4625
4626 options.allow_data_in_errors = FLAGS_allow_data_in_errors;
4627 options.track_and_verify_wals_in_manifest =
4628 FLAGS_track_and_verify_wals_in_manifest;
4629
4630 // Integrated BlobDB
4631 options.enable_blob_files = FLAGS_enable_blob_files;
4632 options.min_blob_size = FLAGS_min_blob_size;
4633 options.blob_file_size = FLAGS_blob_file_size;
4634 options.blob_compression_type =
4635 StringToCompressionType(FLAGS_blob_compression_type.c_str());
4636 options.enable_blob_garbage_collection =
4637 FLAGS_enable_blob_garbage_collection;
4638 options.blob_garbage_collection_age_cutoff =
4639 FLAGS_blob_garbage_collection_age_cutoff;
4640 options.blob_garbage_collection_force_threshold =
4641 FLAGS_blob_garbage_collection_force_threshold;
4642 options.blob_compaction_readahead_size =
4643 FLAGS_blob_compaction_readahead_size;
4644 options.blob_file_starting_level = FLAGS_blob_file_starting_level;
4645
4646 #ifndef ROCKSDB_LITE
4647 if (FLAGS_readonly && FLAGS_transaction_db) {
4648 fprintf(stderr, "Cannot use readonly flag with transaction_db\n");
4649 exit(1);
4650 }
4651 if (FLAGS_use_secondary_db &&
4652 (FLAGS_transaction_db || FLAGS_optimistic_transaction_db)) {
4653 fprintf(stderr, "Cannot use use_secondary_db flag with transaction_db\n");
4654 exit(1);
4655 }
4656 #endif // ROCKSDB_LITE
4657 options.memtable_protection_bytes_per_key =
4658 FLAGS_memtable_protection_bytes_per_key;
4659 }
4660
4661 void InitializeOptionsGeneral(Options* opts) {
4662 // Be careful about what is set here to avoid accidentally overwriting
4663 // settings already configured by OPTIONS file. Only configure settings that
4664 // are needed for the benchmark to run, settings for shared objects that
4665 // were not configured already, settings that require dynamically invoking
4666 // APIs, and settings for the benchmark itself.
4667 Options& options = *opts;
4668
4669 // Always set these since they are harmless when not needed and prevent
4670 // a guaranteed failure when they are needed.
4671 options.create_missing_column_families = true;
4672 options.create_if_missing = true;
4673
4674 if (options.statistics == nullptr) {
4675 options.statistics = dbstats;
4676 }
4677
4678 auto table_options =
4679 options.table_factory->GetOptions<BlockBasedTableOptions>();
4680 if (table_options != nullptr) {
4681 if (FLAGS_cache_size > 0) {
4682 // This violates this function's rules on when to set options. But we
4683 // have to do it because the case of unconfigured block cache in OPTIONS
4684 // file is indistinguishable (it is sanitized to 8MB by this point, not
4685 // nullptr), and our regression tests assume this will be the shared
4686 // block cache, even with OPTIONS file provided.
4687 table_options->block_cache = cache_;
4688 }
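// Filter selection convention: --bloom_bits < 0 leaves the
// BlockBasedTableOptions default policy untouched, 0 explicitly clears any
// filter, and a positive value builds a Bloom filter (or a Ribbon filter
// when --use_ribbon_filter is set) with that many bits per key.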
4689 if (table_options->filter_policy == nullptr) {
4690 if (FLAGS_bloom_bits < 0) {
4691 table_options->filter_policy = BlockBasedTableOptions().filter_policy;
4692 } else if (FLAGS_bloom_bits == 0) {
4693 table_options->filter_policy.reset();
4694 } else {
4695 table_options->filter_policy.reset(
4696 FLAGS_use_ribbon_filter ? NewRibbonFilterPolicy(FLAGS_bloom_bits)
4697 : NewBloomFilterPolicy(FLAGS_bloom_bits));
4698 }
4699 }
4700 }
4701
4702 if (options.row_cache == nullptr) {
4703 if (FLAGS_row_cache_size) {
4704 if (FLAGS_cache_numshardbits >= 1) {
4705 options.row_cache =
4706 NewLRUCache(FLAGS_row_cache_size, FLAGS_cache_numshardbits);
4707 } else {
4708 options.row_cache = NewLRUCache(FLAGS_row_cache_size);
4709 }
4710 }
4711 }
4712
4713 if (options.env == Env::Default()) {
4714 options.env = FLAGS_env;
4715 }
4716 if (FLAGS_enable_io_prio) {
4717 options.env->LowerThreadPoolIOPriority(Env::LOW);
4718 options.env->LowerThreadPoolIOPriority(Env::HIGH);
4719 }
4720 if (FLAGS_enable_cpu_prio) {
4721 options.env->LowerThreadPoolCPUPriority(Env::LOW);
4722 options.env->LowerThreadPoolCPUPriority(Env::HIGH);
4723 }
4724
4725 if (FLAGS_sine_write_rate) {
4726 FLAGS_benchmark_write_rate_limit = static_cast<uint64_t>(SineRate(0));
4727 }
4728
4729 if (options.rate_limiter == nullptr) {
4730 if (FLAGS_rate_limiter_bytes_per_sec > 0) {
4731 options.rate_limiter.reset(NewGenericRateLimiter(
4732 FLAGS_rate_limiter_bytes_per_sec,
4733 FLAGS_rate_limiter_refill_period_us, 10 /* fairness */,
4734 // TODO: replace this with a more general FLAG for deciding
4735 // RateLimiter::Mode as now we also rate-limit foreground reads e.g,
4736 // Get()/MultiGet()
4737 FLAGS_rate_limit_bg_reads ? RateLimiter::Mode::kReadsOnly
4738 : RateLimiter::Mode::kWritesOnly,
4739 FLAGS_rate_limiter_auto_tuned));
4740 }
4741 }
4742
4743 options.listeners.emplace_back(listener_);
4744
4745 if (options.file_checksum_gen_factory == nullptr) {
4746 if (FLAGS_file_checksum) {
4747 options.file_checksum_gen_factory.reset(
4748 new FileChecksumGenCrc32cFactory());
4749 }
4750 }
4751
4752 if (FLAGS_num_multi_db <= 1) {
4753 OpenDb(options, FLAGS_db, &db_);
4754 } else {
4755 multi_dbs_.clear();
4756 multi_dbs_.resize(FLAGS_num_multi_db);
4757 auto wal_dir = options.wal_dir;
4758 for (int i = 0; i < FLAGS_num_multi_db; i++) {
4759 if (!wal_dir.empty()) {
4760 options.wal_dir = GetPathForMultiple(wal_dir, i);
4761 }
4762 OpenDb(options, GetPathForMultiple(FLAGS_db, i), &multi_dbs_[i]);
4763 }
4764 options.wal_dir = wal_dir;
4765 }
4766
4767 // KeepFilter is a noop filter; it can be used to test the compaction filter path
4768 if (options.compaction_filter == nullptr) {
4769 if (FLAGS_use_keep_filter) {
4770 options.compaction_filter = new KeepFilter();
4771 fprintf(stdout, "A noop compaction filter is used\n");
4772 }
4773 }
4774
4775 if (FLAGS_use_existing_keys) {
4776 // Only work on single database
4777 assert(db_.db != nullptr);
4778 ReadOptions read_opts; // before read_options_ initialized
4779 read_opts.total_order_seek = true;
4780 Iterator* iter = db_.db->NewIterator(read_opts);
4781 for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
4782 keys_.emplace_back(iter->key().ToString());
4783 }
4784 delete iter;
4785 FLAGS_num = keys_.size();
4786 }
4787 }
4788
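// Options precedence (summary of the calls below): an OPTIONS file, when
// supplied, seeds the Options; flags are consulted only when no file is
// given; InitializeOptionsGeneral() then layers benchmark-specific settings
// on top in either case.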
4789 void Open(Options* opts) {
4790 if (!InitializeOptionsFromFile(opts)) {
4791 InitializeOptionsFromFlags(opts);
4792 }
4793
4794 InitializeOptionsGeneral(opts);
4795 }
4796
4797 void OpenDb(Options options, const std::string& db_name,
4798 DBWithColumnFamilies* db) {
4799 uint64_t open_start = FLAGS_report_open_timing ? FLAGS_env->NowNanos() : 0;
4800 Status s;
4801 // Open with column families if necessary.
4802 if (FLAGS_num_column_families > 1) {
4803 size_t num_hot = FLAGS_num_column_families;
4804 if (FLAGS_num_hot_column_families > 0 &&
4805 FLAGS_num_hot_column_families < FLAGS_num_column_families) {
4806 num_hot = FLAGS_num_hot_column_families;
4807 } else {
4808 FLAGS_num_hot_column_families = FLAGS_num_column_families;
4809 }
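// Note: only num_hot column families are opened here; the remaining ones
// are created later on demand as DoWrite() advances through stages (see the
// CreateNewCf() calls there).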
4810 std::vector<ColumnFamilyDescriptor> column_families;
4811 for (size_t i = 0; i < num_hot; i++) {
4812 column_families.push_back(ColumnFamilyDescriptor(
4813 ColumnFamilyName(i), ColumnFamilyOptions(options)));
4814 }
4815 std::vector<int> cfh_idx_to_prob;
4816 if (!FLAGS_column_family_distribution.empty()) {
4817 std::stringstream cf_prob_stream(FLAGS_column_family_distribution);
4818 std::string cf_prob;
4819 int sum = 0;
4820 while (std::getline(cf_prob_stream, cf_prob, ',')) {
4821 cfh_idx_to_prob.push_back(std::stoi(cf_prob));
4822 sum += cfh_idx_to_prob.back();
4823 }
4824 if (sum != 100) {
4825 fprintf(stderr, "column_family_distribution items must sum to 100\n");
4826 exit(1);
4827 }
4828 if (cfh_idx_to_prob.size() != num_hot) {
4829 fprintf(stderr,
4830 "got %" ROCKSDB_PRIszt
4831 " column_family_distribution items; expected "
4832 "%" ROCKSDB_PRIszt "\n",
4833 cfh_idx_to_prob.size(), num_hot);
4834 exit(1);
4835 }
4836 }
4837 #ifndef ROCKSDB_LITE
4838 if (FLAGS_readonly) {
4839 s = DB::OpenForReadOnly(options, db_name, column_families, &db->cfh,
4840 &db->db);
4841 } else if (FLAGS_optimistic_transaction_db) {
4842 s = OptimisticTransactionDB::Open(options, db_name, column_families,
4843 &db->cfh, &db->opt_txn_db);
4844 if (s.ok()) {
4845 db->db = db->opt_txn_db->GetBaseDB();
4846 }
4847 } else if (FLAGS_transaction_db) {
4848 TransactionDB* ptr;
4849 TransactionDBOptions txn_db_options;
4850 if (options.unordered_write) {
4851 options.two_write_queues = true;
4852 txn_db_options.skip_concurrency_control = true;
4853 txn_db_options.write_policy = WRITE_PREPARED;
4854 }
4855 s = TransactionDB::Open(options, txn_db_options, db_name,
4856 column_families, &db->cfh, &ptr);
4857 if (s.ok()) {
4858 db->db = ptr;
4859 }
4860 } else {
4861 s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
4862 }
4863 #else
4864 s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
4865 #endif // ROCKSDB_LITE
4866 db->cfh.resize(FLAGS_num_column_families);
4867 db->num_created = num_hot;
4868 db->num_hot = num_hot;
4869 db->cfh_idx_to_prob = std::move(cfh_idx_to_prob);
4870 #ifndef ROCKSDB_LITE
4871 } else if (FLAGS_readonly) {
4872 s = DB::OpenForReadOnly(options, db_name, &db->db);
4873 } else if (FLAGS_optimistic_transaction_db) {
4874 s = OptimisticTransactionDB::Open(options, db_name, &db->opt_txn_db);
4875 if (s.ok()) {
4876 db->db = db->opt_txn_db->GetBaseDB();
4877 }
4878 } else if (FLAGS_transaction_db) {
4879 TransactionDB* ptr = nullptr;
4880 TransactionDBOptions txn_db_options;
4881 if (options.unordered_write) {
4882 options.two_write_queues = true;
4883 txn_db_options.skip_concurrency_control = true;
4884 txn_db_options.write_policy = WRITE_PREPARED;
4885 }
4886 s = CreateLoggerFromOptions(db_name, options, &options.info_log);
4887 if (s.ok()) {
4888 s = TransactionDB::Open(options, txn_db_options, db_name, &ptr);
4889 }
4890 if (s.ok()) {
4891 db->db = ptr;
4892 }
4893 } else if (FLAGS_use_blob_db) {
4894 // Stacked BlobDB
4895 blob_db::BlobDBOptions blob_db_options;
4896 blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
4897 blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff;
4898 blob_db_options.is_fifo = FLAGS_blob_db_is_fifo;
4899 blob_db_options.max_db_size = FLAGS_blob_db_max_db_size;
4900 blob_db_options.ttl_range_secs = FLAGS_blob_db_ttl_range_secs;
4901 blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size;
4902 blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
4903 blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
4904 blob_db_options.compression = FLAGS_blob_db_compression_type_e;
4905 blob_db::BlobDB* ptr = nullptr;
4906 s = blob_db::BlobDB::Open(options, blob_db_options, db_name, &ptr);
4907 if (s.ok()) {
4908 db->db = ptr;
4909 }
4910 } else if (FLAGS_use_secondary_db) {
4911 if (FLAGS_secondary_path.empty()) {
4912 std::string default_secondary_path;
4913 FLAGS_env->GetTestDirectory(&default_secondary_path);
4914 default_secondary_path += "/dbbench_secondary";
4915 FLAGS_secondary_path = default_secondary_path;
4916 }
4917 s = DB::OpenAsSecondary(options, db_name, FLAGS_secondary_path, &db->db);
4918 if (s.ok() && FLAGS_secondary_update_interval > 0) {
4919 secondary_update_thread_.reset(new port::Thread(
4920 [this](int interval, DBWithColumnFamilies* _db) {
4921 while (0 == secondary_update_stopped_.load(
4922 std::memory_order_relaxed)) {
4923 Status secondary_update_status =
4924 _db->db->TryCatchUpWithPrimary();
4925 if (!secondary_update_status.ok()) {
4926 fprintf(stderr, "Failed to catch up with primary: %s\n",
4927 secondary_update_status.ToString().c_str());
4928 break;
4929 }
4930 ++secondary_db_updates_;
4931 FLAGS_env->SleepForMicroseconds(interval * 1000000);
4932 }
4933 },
4934 FLAGS_secondary_update_interval, db));
4935 }
4936 #endif // ROCKSDB_LITE
4937 } else {
4938 s = DB::Open(options, db_name, &db->db);
4939 }
4940 if (FLAGS_report_open_timing) {
4941 std::cout << "OpenDb: "
4942 << (FLAGS_env->NowNanos() - open_start) / 1000000.0
4943 << " milliseconds\n";
4944 }
4945 if (!s.ok()) {
4946 fprintf(stderr, "open error: %s\n", s.ToString().c_str());
4947 exit(1);
4948 }
4949 }
4950
4951 enum WriteMode { RANDOM, SEQUENTIAL, UNIQUE_RANDOM };
4952
4953 void WriteSeqDeterministic(ThreadState* thread) {
4954 DoDeterministicCompact(thread, open_options_.compaction_style, SEQUENTIAL);
4955 }
4956
4957 void WriteUniqueRandomDeterministic(ThreadState* thread) {
4958 DoDeterministicCompact(thread, open_options_.compaction_style,
4959 UNIQUE_RANDOM);
4960 }
4961
4962 void WriteSeq(ThreadState* thread) { DoWrite(thread, SEQUENTIAL); }
4963
4964 void WriteRandom(ThreadState* thread) { DoWrite(thread, RANDOM); }
4965
4966 void WriteUniqueRandom(ThreadState* thread) {
4967 DoWrite(thread, UNIQUE_RANDOM);
4968 }
4969
4970 class KeyGenerator {
4971 public:
4972 KeyGenerator(Random64* rand, WriteMode mode, uint64_t num,
4973 uint64_t /*num_per_set*/ = 64 * 1024)
4974 : rand_(rand), mode_(mode), num_(num), next_(0) {
4975 if (mode_ == UNIQUE_RANDOM) {
4976 // NOTE: if memory consumption of this approach becomes a concern,
4977 // we can either break it into pieces and only random-shuffle a section
4978 // each time, or use a bitmap implementation
4979 // (https://reviews.facebook.net/differential/diff/54627/)
4980 values_.resize(num_);
4981 for (uint64_t i = 0; i < num_; ++i) {
4982 values_[i] = i;
4983 }
4984 RandomShuffle(values_.begin(), values_.end(),
4985 static_cast<uint32_t>(seed_base));
4986 }
4987 }
4988
4989 uint64_t Next() {
4990 switch (mode_) {
4991 case SEQUENTIAL:
4992 return next_++;
4993 case RANDOM:
4994 return rand_->Next() % num_;
4995 case UNIQUE_RANDOM:
4996 assert(next_ < num_);
4997 return values_[next_++];
4998 }
4999 assert(false);
5000 return std::numeric_limits<uint64_t>::max();
5001 }
5002
5003 // Only available for UNIQUE_RANDOM mode.
5004 uint64_t Fetch(uint64_t index) {
5005 assert(mode_ == UNIQUE_RANDOM);
5006 assert(index < values_.size());
5007 return values_[index];
5008 }
5009
5010 private:
5011 Random64* rand_;
5012 WriteMode mode_;
5013 const uint64_t num_;
5014 uint64_t next_;
5015 std::vector<uint64_t> values_;
5016 };
5017
5018 DB* SelectDB(ThreadState* thread) { return SelectDBWithCfh(thread)->db; }
5019
5020 DBWithColumnFamilies* SelectDBWithCfh(ThreadState* thread) {
5021 return SelectDBWithCfh(thread->rand.Next());
5022 }
5023
5024 DBWithColumnFamilies* SelectDBWithCfh(uint64_t rand_int) {
5025 if (db_.db != nullptr) {
5026 return &db_;
5027 } else {
5028 return &multi_dbs_[rand_int % multi_dbs_.size()];
5029 }
5030 }
5031
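// Explanatory note: SineRate(x) = sine_a * sin(sine_b * x + sine_c) + sine_d.
// When --sine_write_rate is set, DoWrite() feeds it the elapsed time in
// seconds, so the target write rate oscillates around sine_d with amplitude
// sine_a.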
5032 double SineRate(double x) {
5033 return FLAGS_sine_a * sin((FLAGS_sine_b * x) + FLAGS_sine_c) + FLAGS_sine_d;
5034 }
5035
5036 void DoWrite(ThreadState* thread, WriteMode write_mode) {
5037 const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0;
5038 const int64_t num_ops = writes_ == 0 ? num_ : writes_;
5039
5040 size_t num_key_gens = 1;
5041 if (db_.db == nullptr) {
5042 num_key_gens = multi_dbs_.size();
5043 }
5044 std::vector<std::unique_ptr<KeyGenerator>> key_gens(num_key_gens);
5045 int64_t max_ops = num_ops * num_key_gens;
5046 int64_t ops_per_stage = max_ops;
5047 if (FLAGS_num_column_families > 1 && FLAGS_num_hot_column_families > 0) {
5048 ops_per_stage = (max_ops - 1) / (FLAGS_num_column_families /
5049 FLAGS_num_hot_column_families) +
5050 1;
5051 }
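// With hot column families, the run is split into
// num_column_families / num_hot_column_families stages of ops_per_stage
// writes each; Duration::GetStage() below advances the stage so a fresh set
// of column families is created for it.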
5052
5053 Duration duration(test_duration, max_ops, ops_per_stage);
5054 const uint64_t num_per_key_gen = num_ + max_num_range_tombstones_;
5055 for (size_t i = 0; i < num_key_gens; i++) {
5056 key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode,
5057 num_per_key_gen, ops_per_stage));
5058 }
5059
5060 if (num_ != FLAGS_num) {
5061 char msg[100];
5062 snprintf(msg, sizeof(msg), "(%" PRIu64 " ops)", num_);
5063 thread->stats.AddMessage(msg);
5064 }
5065
5066 RandomGenerator gen;
5067 WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
5068 FLAGS_write_batch_protection_bytes_per_key,
5069 user_timestamp_size_);
5070 Status s;
5071 int64_t bytes = 0;
5072
5073 std::unique_ptr<const char[]> key_guard;
5074 Slice key = AllocateKey(&key_guard);
5075 std::unique_ptr<const char[]> begin_key_guard;
5076 Slice begin_key = AllocateKey(&begin_key_guard);
5077 std::unique_ptr<const char[]> end_key_guard;
5078 Slice end_key = AllocateKey(&end_key_guard);
5079 double p = 0.0;
5080 uint64_t num_overwrites = 0, num_unique_keys = 0, num_selective_deletes = 0;
5081 // If the user set the overwrite_probability flag,
5082 // clamp its value to [0.0, 1.0].
5083 if (FLAGS_overwrite_probability > 0.0) {
5084 p = FLAGS_overwrite_probability > 1.0 ? 1.0 : FLAGS_overwrite_probability;
5085 // If overwrite set by user, and UNIQUE_RANDOM mode on,
5086 // the overwrite_window_size must be > 0.
5087 if (write_mode == UNIQUE_RANDOM && FLAGS_overwrite_window_size == 0) {
5088 fprintf(stderr,
5089 "Overwrite_window_size must be strictly greater than 0.\n");
5090 ErrorExit();
5091 }
5092 }
5093
5094 // Default_random_engine provides slightly
5095 // improved throughput over mt19937.
5096 std::default_random_engine overwrite_gen{
5097 static_cast<unsigned int>(seed_base)};
5098 std::bernoulli_distribution overwrite_decider(p);
5099
5100 // Inserted key window is filled with the last N
5101 // keys previously inserted into the DB (with
5102 // N=FLAGS_overwrite_window_size).
5103 // We use a deque struct because:
5104 // - random access is O(1)
5105 // - insertion/removal at beginning/end is also O(1).
5106 std::deque<int64_t> inserted_key_window;
5107 Random64 reservoir_id_gen(seed_base);
5108
5109 // --- Variables used in disposable/persistent keys simulation:
5110 // The following variables are used when
5111 // disposable_entries_batch_size is >0. We simulate a workload
5112 // where the following sequence is repeated multiple times:
5113 // "A set of keys S1 is inserted ('disposable entries'), then after
5114 // some delay another set of keys S2 is inserted ('persistent entries')
5115 // and the first set of keys S1 is deleted. S2 artificially represents
5116 // the insertion of hypothetical results from some undefined computation
5117 // done on the first set of keys S1. The next sequence can start as soon
5118 // as the last disposable entry in the set S1 of this sequence is
5119 // inserted, if the delay is non-negligible"
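// Rough illustration (hypothetical flag values, not from the source): with
// --disposable_entries_batch_size=4 and --persistent_entries_batch_size=2,
// the benchmark inserts disposable keys d0..d3 and enqueues a delayed job;
// once the delay expires it inserts persistent keys p0..p1 and then deletes
// d0..d3, while later disposable batches may already be in flight.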
5120 bool skip_for_loop = false, is_disposable_entry = true;
5121 std::vector<uint64_t> disposable_entries_index(num_key_gens, 0);
5122 std::vector<uint64_t> persistent_ent_and_del_index(num_key_gens, 0);
5123 const uint64_t kNumDispAndPersEntries =
5124 FLAGS_disposable_entries_batch_size +
5125 FLAGS_persistent_entries_batch_size;
5126 if (kNumDispAndPersEntries > 0) {
5127 if ((write_mode != UNIQUE_RANDOM) || (writes_per_range_tombstone_ > 0) ||
5128 (p > 0.0)) {
5129 fprintf(
5130 stderr,
5131 "Disposable/persistent deletes are not compatible with overwrites "
5132 "and DeleteRanges; and are only supported in filluniquerandom.\n");
5133 ErrorExit();
5134 }
5135 if (FLAGS_disposable_entries_value_size < 0 ||
5136 FLAGS_persistent_entries_value_size < 0) {
5137 fprintf(
5138 stderr,
5139 "disposable_entries_value_size and persistent_entries_value_size"
5140 "have to be positive.\n");
5141 ErrorExit();
5142 }
5143 }
5144 Random rnd_disposable_entry(static_cast<uint32_t>(seed_base));
5145 std::string random_value;
5146 // One queue per key generator, storing the scheduled timestamp of each
5147 // disposable-entry delete along with the starting index of the keys to delete.
5148 std::vector<std::queue<std::pair<uint64_t, uint64_t>>> disposable_entries_q(
5149 num_key_gens);
5150 // --- End of variables used in disposable/persistent keys simulation.
5151
5152 std::vector<std::unique_ptr<const char[]>> expanded_key_guards;
5153 std::vector<Slice> expanded_keys;
5154 if (FLAGS_expand_range_tombstones) {
5155 expanded_key_guards.resize(range_tombstone_width_);
5156 for (auto& expanded_key_guard : expanded_key_guards) {
5157 expanded_keys.emplace_back(AllocateKey(&expanded_key_guard));
5158 }
5159 }
5160
5161 std::unique_ptr<char[]> ts_guard;
5162 if (user_timestamp_size_ > 0) {
5163 ts_guard.reset(new char[user_timestamp_size_]);
5164 }
5165
5166 int64_t stage = 0;
5167 int64_t num_written = 0;
5168 int64_t next_seq_db_at = num_ops;
5169 size_t id = 0;
5170 int64_t num_range_deletions = 0;
5171
5172 while ((num_per_key_gen != 0) && !duration.Done(entries_per_batch_)) {
5173 if (duration.GetStage() != stage) {
5174 stage = duration.GetStage();
5175 if (db_.db != nullptr) {
5176 db_.CreateNewCf(open_options_, stage);
5177 } else {
5178 for (auto& db : multi_dbs_) {
5179 db.CreateNewCf(open_options_, stage);
5180 }
5181 }
5182 }
5183
5184 if (write_mode != SEQUENTIAL) {
5185 id = thread->rand.Next() % num_key_gens;
5186 } else {
5187 // When doing a sequential load with multiple databases, load them in
5188 // order rather than all at the same time to avoid:
5189 // 1) long delays between flushing memtables
5190 // 2) flushing memtables for all of them at the same point in time
5191 // 3) not putting the same number of keys in each database
5192 if (num_written >= next_seq_db_at) {
5193 next_seq_db_at += num_ops;
5194 id++;
5195 if (id >= num_key_gens) {
5196 fprintf(stderr, "Logic error. Filled all databases\n");
5197 ErrorExit();
5198 }
5199 }
5200 }
5201 DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(id);
5202
5203 batch.Clear();
5204 int64_t batch_bytes = 0;
5205
5206 for (int64_t j = 0; j < entries_per_batch_; j++) {
5207 int64_t rand_num = 0;
5208 if ((write_mode == UNIQUE_RANDOM) && (p > 0.0)) {
5209 if ((inserted_key_window.size() > 0) &&
5210 overwrite_decider(overwrite_gen)) {
5211 num_overwrites++;
5212 rand_num = inserted_key_window[reservoir_id_gen.Next() %
5213 inserted_key_window.size()];
5214 } else {
5215 num_unique_keys++;
5216 rand_num = key_gens[id]->Next();
5217 if (inserted_key_window.size() < FLAGS_overwrite_window_size) {
5218 inserted_key_window.push_back(rand_num);
5219 } else {
5220 inserted_key_window.pop_front();
5221 inserted_key_window.push_back(rand_num);
5222 }
5223 }
5224 } else if (kNumDispAndPersEntries > 0) {
5225 // Check if queue is non-empty and if we need to insert
5226 // 'persistent' KV entries (KV entries that are never deleted)
5227 // and delete disposable entries previously inserted.
5228 if (!disposable_entries_q[id].empty() &&
5229 (disposable_entries_q[id].front().first <
5230 FLAGS_env->NowMicros())) {
5231 // If we need to perform a "merge op" pattern,
5232 // we first write all the persistent KV entries not targeted
5233 // by deletes, and then we write the disposable entries deletes.
5234 if (persistent_ent_and_del_index[id] <
5235 FLAGS_persistent_entries_batch_size) {
5236 // Generate key to insert.
5237 rand_num =
5238 key_gens[id]->Fetch(disposable_entries_q[id].front().second +
5239 FLAGS_disposable_entries_batch_size +
5240 persistent_ent_and_del_index[id]);
5241 persistent_ent_and_del_index[id]++;
5242 is_disposable_entry = false;
5243 skip_for_loop = false;
5244 } else if (persistent_ent_and_del_index[id] <
5245 kNumDispAndPersEntries) {
5246 // Find key of the entry to delete.
5247 rand_num =
5248 key_gens[id]->Fetch(disposable_entries_q[id].front().second +
5249 (persistent_ent_and_del_index[id] -
5250 FLAGS_persistent_entries_batch_size));
5251 persistent_ent_and_del_index[id]++;
5252 GenerateKeyFromInt(rand_num, FLAGS_num, &key);
5253 // For the delete operation, everything happens here and we
5254 // skip the rest of the for-loop, which is designed for
5255 // inserts.
5256 if (FLAGS_num_column_families <= 1) {
5257 batch.Delete(key);
5258 } else {
5259 // We use same rand_num as seed for key and column family so
5260 // that we can deterministically find the cfh corresponding to a
5261 // particular key while reading the key.
5262 batch.Delete(db_with_cfh->GetCfh(rand_num), key);
5263 }
5264 // A delete only includes Key+Timestamp (no value).
5265 batch_bytes += key_size_ + user_timestamp_size_;
5266 bytes += key_size_ + user_timestamp_size_;
5267 num_selective_deletes++;
5268 // Skip the rest of the for-loop (j = 0; j < entries_per_batch_; j++).
5269 skip_for_loop = true;
5270 } else {
5271 assert(false); // should never reach this point.
5272 }
5273 // If disposable_entries_q needs to be updated (ie: when a selective
5274 // insert+delete was successfully completed, pop the job out of the
5275 // queue).
5276 if (!disposable_entries_q[id].empty() &&
5277 (disposable_entries_q[id].front().first <
5278 FLAGS_env->NowMicros()) &&
5279 persistent_ent_and_del_index[id] == kNumDispAndPersEntries) {
5280 disposable_entries_q[id].pop();
5281 persistent_ent_and_del_index[id] = 0;
5282 }
5283
5284 // If we are deleting disposable entries, skip the rest of the
5285 // for-loop since there are no key-value inserts at this point in
5286 // time.
5287 if (skip_for_loop) {
5288 continue;
5289 }
5290
5291 }
5292 // If no job is in the queue, then we keep inserting disposable KV
5293 // entries that will be deleted later by a series of deletes.
5294 else {
5295 rand_num = key_gens[id]->Fetch(disposable_entries_index[id]);
5296 disposable_entries_index[id]++;
5297 is_disposable_entry = true;
5298 if ((disposable_entries_index[id] %
5299 FLAGS_disposable_entries_batch_size) == 0) {
5300 // Skip the persistent KV entries inserts for now
5301 disposable_entries_index[id] +=
5302 FLAGS_persistent_entries_batch_size;
5303 }
5304 }
5305 } else {
5306 rand_num = key_gens[id]->Next();
5307 }
5308 GenerateKeyFromInt(rand_num, FLAGS_num, &key);
5309 Slice val;
5310 if (kNumDispAndPersEntries > 0) {
5311 random_value = rnd_disposable_entry.RandomString(
5312 is_disposable_entry ? FLAGS_disposable_entries_value_size
5313 : FLAGS_persistent_entries_value_size);
5314 val = Slice(random_value);
5315 num_unique_keys++;
5316 } else {
5317 val = gen.Generate();
5318 }
5319 if (use_blob_db_) {
5320 #ifndef ROCKSDB_LITE
5321 // Stacked BlobDB
5322 blob_db::BlobDB* blobdb =
5323 static_cast<blob_db::BlobDB*>(db_with_cfh->db);
5324 if (FLAGS_blob_db_max_ttl_range > 0) {
5325 int ttl = rand() % FLAGS_blob_db_max_ttl_range;
5326 s = blobdb->PutWithTTL(write_options_, key, val, ttl);
5327 } else {
5328 s = blobdb->Put(write_options_, key, val);
5329 }
5330 #endif // ROCKSDB_LITE
5331 } else if (FLAGS_num_column_families <= 1) {
5332 batch.Put(key, val);
5333 } else {
5334 // We use same rand_num as seed for key and column family so that we
5335 // can deterministically find the cfh corresponding to a particular
5336 // key while reading the key.
5337 batch.Put(db_with_cfh->GetCfh(rand_num), key, val);
5338 }
5339 batch_bytes += val.size() + key_size_ + user_timestamp_size_;
5340 bytes += val.size() + key_size_ + user_timestamp_size_;
5341 ++num_written;
5342
5343 // If all disposable entries have been inserted, then we need to
5344 // add in the job queue a call for 'persistent entry insertions +
5345 // disposable entry deletions'.
5346 if (kNumDispAndPersEntries > 0 && is_disposable_entry &&
5347 ((disposable_entries_index[id] % kNumDispAndPersEntries) == 0)) {
5348 // Queue contains [timestamp, starting_idx],
5349 // timestamp = current_time + delay (minimum absolute time when to
5350 // start inserting the selective deletes) starting_idx = index in the
5351 // keygen of the rand_num to generate the key of the first KV entry to
5352 // delete (= key of the first selective delete).
5353 disposable_entries_q[id].push(std::make_pair(
5354 FLAGS_env->NowMicros() +
5355 FLAGS_disposable_entries_delete_delay /* timestamp */,
5356 disposable_entries_index[id] - kNumDispAndPersEntries
5357 /*starting idx*/));
5358 }
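// Range-deletion cadence: after the first writes_before_delete_range_ puts,
// one range tombstone (or, with --expand_range_tombstones, the equivalent
// point deletes) is issued every writes_per_range_tombstone_ writes, up to
// max_num_range_tombstones_ in total.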
5359 if (writes_per_range_tombstone_ > 0 &&
5360 num_written > writes_before_delete_range_ &&
5361 (num_written - writes_before_delete_range_) /
5362 writes_per_range_tombstone_ <=
5363 max_num_range_tombstones_ &&
5364 (num_written - writes_before_delete_range_) %
5365 writes_per_range_tombstone_ ==
5366 0) {
5367 num_range_deletions++;
5368 int64_t begin_num = key_gens[id]->Next();
5369 if (FLAGS_expand_range_tombstones) {
5370 for (int64_t offset = 0; offset < range_tombstone_width_;
5371 ++offset) {
5372 GenerateKeyFromInt(begin_num + offset, FLAGS_num,
5373 &expanded_keys[offset]);
5374 if (use_blob_db_) {
5375 #ifndef ROCKSDB_LITE
5376 // Stacked BlobDB
5377 s = db_with_cfh->db->Delete(write_options_,
5378 expanded_keys[offset]);
5379 #endif // ROCKSDB_LITE
5380 } else if (FLAGS_num_column_families <= 1) {
5381 batch.Delete(expanded_keys[offset]);
5382 } else {
5383 batch.Delete(db_with_cfh->GetCfh(rand_num),
5384 expanded_keys[offset]);
5385 }
5386 }
5387 } else {
5388 GenerateKeyFromInt(begin_num, FLAGS_num, &begin_key);
5389 GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num,
5390 &end_key);
5391 if (use_blob_db_) {
5392 #ifndef ROCKSDB_LITE
5393 // Stacked BlobDB
5394 s = db_with_cfh->db->DeleteRange(
5395 write_options_, db_with_cfh->db->DefaultColumnFamily(),
5396 begin_key, end_key);
5397 #endif // ROCKSDB_LITE
5398 } else if (FLAGS_num_column_families <= 1) {
5399 batch.DeleteRange(begin_key, end_key);
5400 } else {
5401 batch.DeleteRange(db_with_cfh->GetCfh(rand_num), begin_key,
5402 end_key);
5403 }
5404 }
5405 }
5406 }
5407 if (thread->shared->write_rate_limiter.get() != nullptr) {
5408 thread->shared->write_rate_limiter->Request(
5409 batch_bytes, Env::IO_HIGH, nullptr /* stats */,
5410 RateLimiter::OpType::kWrite);
5411 // Set time at which last op finished to Now() to hide latency and
5412 // sleep from rate limiter. Also, do the check once per batch, not
5413 // once per write.
5414 thread->stats.ResetLastOpTime();
5415 }
5416 if (user_timestamp_size_ > 0) {
5417 Slice user_ts = mock_app_clock_->Allocate(ts_guard.get());
5418 s = batch.UpdateTimestamps(
5419 user_ts, [this](uint32_t) { return user_timestamp_size_; });
5420 if (!s.ok()) {
5421 fprintf(stderr, "assign timestamp to write batch: %s\n",
5422 s.ToString().c_str());
5423 ErrorExit();
5424 }
5425 }
5426 if (!use_blob_db_) {
5427 // Not stacked BlobDB
5428 s = db_with_cfh->db->Write(write_options_, &batch);
5429 }
5430 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db,
5431 entries_per_batch_, kWrite);
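// When --sine_write_rate is set, the shared write rate limiter is rebuilt
// below every sine_write_rate_interval_milliseconds with a rate taken from
// SineRate() at the current elapsed time.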
5432 if (FLAGS_sine_write_rate) {
5433 uint64_t now = FLAGS_env->NowMicros();
5434
5435 uint64_t usecs_since_last;
5436 if (now > thread->stats.GetSineInterval()) {
5437 usecs_since_last = now - thread->stats.GetSineInterval();
5438 } else {
5439 usecs_since_last = 0;
5440 }
5441
5442 if (usecs_since_last >
5443 (FLAGS_sine_write_rate_interval_milliseconds * uint64_t{1000})) {
5444 double usecs_since_start =
5445 static_cast<double>(now - thread->stats.GetStart());
5446 thread->stats.ResetSineInterval();
5447 uint64_t write_rate =
5448 static_cast<uint64_t>(SineRate(usecs_since_start / 1000000.0));
5449 thread->shared->write_rate_limiter.reset(
5450 NewGenericRateLimiter(write_rate));
5451 }
5452 }
5453 if (!s.ok()) {
5454 s = listener_->WaitForRecovery(600000000) ? Status::OK() : s;
5455 }
5456
5457 if (!s.ok()) {
5458 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
5459 ErrorExit();
5460 }
5461 }
5462 if ((write_mode == UNIQUE_RANDOM) && (p > 0.0)) {
5463 fprintf(stdout,
5464 "Number of unique keys inserted: %" PRIu64
5465 ".\nNumber of overwrites: %" PRIu64 "\n",
5466 num_unique_keys, num_overwrites);
5467 } else if (kNumDispAndPersEntries > 0) {
5468 fprintf(stdout,
5469 "Number of unique keys inserted (disposable+persistent): %" PRIu64
5470 ".\nNumber of 'disposable entry delete': %" PRIu64 "\n",
5471 num_written, num_selective_deletes);
5472 }
5473 if (num_range_deletions > 0) {
5474 std::cout << "Number of range deletions: " << num_range_deletions
5475 << std::endl;
5476 }
5477 thread->stats.AddBytes(bytes);
5478 }
5479
5480 Status DoDeterministicCompact(ThreadState* thread,
5481 CompactionStyle compaction_style,
5482 WriteMode write_mode) {
5483 #ifndef ROCKSDB_LITE
5484 ColumnFamilyMetaData meta;
5485 std::vector<DB*> db_list;
5486 if (db_.db != nullptr) {
5487 db_list.push_back(db_.db);
5488 } else {
5489 for (auto& db : multi_dbs_) {
5490 db_list.push_back(db.db);
5491 }
5492 }
5493 std::vector<Options> options_list;
5494 for (auto db : db_list) {
5495 options_list.push_back(db->GetOptions());
5496 if (compaction_style != kCompactionStyleFIFO) {
5497 db->SetOptions({{"disable_auto_compactions", "1"},
5498 {"level0_slowdown_writes_trigger", "400000000"},
5499 {"level0_stop_writes_trigger", "400000000"}});
5500 } else {
5501 db->SetOptions({{"disable_auto_compactions", "1"}});
5502 }
5503 }
5504
5505 assert(!db_list.empty());
5506 auto num_db = db_list.size();
5507 size_t num_levels = static_cast<size_t>(open_options_.num_levels);
5508 size_t output_level = open_options_.num_levels - 1;
5509 std::vector<std::vector<std::vector<SstFileMetaData>>> sorted_runs(num_db);
5510 std::vector<size_t> num_files_at_level0(num_db, 0);
5511 if (compaction_style == kCompactionStyleLevel) {
5512 if (num_levels == 0) {
5513 return Status::InvalidArgument("num_levels should be larger than 1");
5514 }
5515 bool should_stop = false;
5516 while (!should_stop) {
5517 if (sorted_runs[0].empty()) {
5518 DoWrite(thread, write_mode);
5519 } else {
5520 DoWrite(thread, UNIQUE_RANDOM);
5521 }
5522 for (size_t i = 0; i < num_db; i++) {
5523 auto db = db_list[i];
5524 db->Flush(FlushOptions());
5525 db->GetColumnFamilyMetaData(&meta);
5526 if (num_files_at_level0[i] == meta.levels[0].files.size() ||
5527 writes_ == 0) {
5528 should_stop = true;
5529 continue;
5530 }
5531 sorted_runs[i].emplace_back(
5532 meta.levels[0].files.begin(),
5533 meta.levels[0].files.end() - num_files_at_level0[i]);
5534 num_files_at_level0[i] = meta.levels[0].files.size();
5535 if (sorted_runs[i].back().size() == 1) {
5536 should_stop = true;
5537 continue;
5538 }
5539 if (sorted_runs[i].size() == output_level) {
5540 auto& L1 = sorted_runs[i].back();
5541 L1.erase(L1.begin(), L1.begin() + L1.size() / 3);
5542 should_stop = true;
5543 continue;
5544 }
5545 }
5546 writes_ /=
5547 static_cast<int64_t>(open_options_.max_bytes_for_level_multiplier);
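// Each pass above produces one new L0 sorted run; shrinking writes_ by the
// level fanout appears intended to make successive runs smaller by that
// factor, so the manual CompactFiles() calls below yield levels with
// roughly the configured size ratio.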
5548 }
5549 for (size_t i = 0; i < num_db; i++) {
5550 if (sorted_runs[i].size() < num_levels - 1) {
5551 fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
5552 num_levels);
5553 exit(1);
5554 }
5555 }
5556 for (size_t i = 0; i < num_db; i++) {
5557 auto db = db_list[i];
5558 auto compactionOptions = CompactionOptions();
5559 compactionOptions.compression = FLAGS_compression_type_e;
5560 auto options = db->GetOptions();
5561 MutableCFOptions mutable_cf_options(options);
5562 for (size_t j = 0; j < sorted_runs[i].size(); j++) {
5563 compactionOptions.output_file_size_limit = MaxFileSizeForLevel(
5564 mutable_cf_options, static_cast<int>(output_level),
5565 compaction_style);
5566 std::cout << sorted_runs[i][j].size() << std::endl;
5567 db->CompactFiles(
5568 compactionOptions,
5569 {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name},
5570 static_cast<int>(output_level - j) /*level*/);
5571 }
5572 }
5573 } else if (compaction_style == kCompactionStyleUniversal) {
5574 auto ratio = open_options_.compaction_options_universal.size_ratio;
5575 bool should_stop = false;
5576 while (!should_stop) {
5577 if (sorted_runs[0].empty()) {
5578 DoWrite(thread, write_mode);
5579 } else {
5580 DoWrite(thread, UNIQUE_RANDOM);
5581 }
5582 for (size_t i = 0; i < num_db; i++) {
5583 auto db = db_list[i];
5584 db->Flush(FlushOptions());
5585 db->GetColumnFamilyMetaData(&meta);
5586 if (num_files_at_level0[i] == meta.levels[0].files.size() ||
5587 writes_ == 0) {
5588 should_stop = true;
5589 continue;
5590 }
5591 sorted_runs[i].emplace_back(
5592 meta.levels[0].files.begin(),
5593 meta.levels[0].files.end() - num_files_at_level0[i]);
5594 num_files_at_level0[i] = meta.levels[0].files.size();
5595 if (sorted_runs[i].back().size() == 1) {
5596 should_stop = true;
5597 continue;
5598 }
5599 num_files_at_level0[i] = meta.levels[0].files.size();
5600 }
5601 writes_ = static_cast<int64_t>(writes_ * static_cast<double>(100) /
5602 (ratio + 200));
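// The intent appears to be to shrink each successive run to
// 100 / (size_ratio + 200) of the previous one (roughly half with the
// default ratio) so the flushed runs remain distinct sorted runs for the
// manual compactions below.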
5603 }
5604 for (size_t i = 0; i < num_db; i++) {
5605 if (sorted_runs[i].size() < num_levels) {
5606 fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
5607 num_levels);
5608 exit(1);
5609 }
5610 }
5611 for (size_t i = 0; i < num_db; i++) {
5612 auto db = db_list[i];
5613 auto compactionOptions = CompactionOptions();
5614 compactionOptions.compression = FLAGS_compression_type_e;
5615 auto options = db->GetOptions();
5616 MutableCFOptions mutable_cf_options(options);
5617 for (size_t j = 0; j < sorted_runs[i].size(); j++) {
5618 compactionOptions.output_file_size_limit = MaxFileSizeForLevel(
5619 mutable_cf_options, static_cast<int>(output_level),
5620 compaction_style);
5621 db->CompactFiles(
5622 compactionOptions,
5623 {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name},
5624 (output_level > j ? static_cast<int>(output_level - j)
5625 : 0) /*level*/);
5626 }
5627 }
5628 } else if (compaction_style == kCompactionStyleFIFO) {
5629 if (num_levels != 1) {
5630 return Status::InvalidArgument(
5631 "num_levels should be 1 for FIFO compaction");
5632 }
5633 if (FLAGS_num_multi_db != 0) {
5634 return Status::InvalidArgument("Doesn't support multiDB");
5635 }
5636 auto db = db_list[0];
5637 std::vector<std::string> file_names;
5638 while (true) {
5639 if (sorted_runs[0].empty()) {
5640 DoWrite(thread, write_mode);
5641 } else {
5642 DoWrite(thread, UNIQUE_RANDOM);
5643 }
5644 db->Flush(FlushOptions());
5645 db->GetColumnFamilyMetaData(&meta);
5646 auto total_size = meta.levels[0].size;
5647 if (total_size >=
5648 db->GetOptions().compaction_options_fifo.max_table_files_size) {
5649 for (auto file_meta : meta.levels[0].files) {
5650 file_names.emplace_back(file_meta.name);
5651 }
5652 break;
5653 }
5654 }
5655 // TODO(shuzhang1989): Investigate why CompactFiles not working
5656 // auto compactionOptions = CompactionOptions();
5657 // db->CompactFiles(compactionOptions, file_names, 0);
5658 auto compactionOptions = CompactRangeOptions();
5659 db->CompactRange(compactionOptions, nullptr, nullptr);
5660 } else {
5661 fprintf(stdout,
5662 "%-12s : skipped (-compaction_stype=kCompactionStyleNone)\n",
5663 "filldeterministic");
5664 return Status::InvalidArgument("None compaction is not supported");
5665 }
5666
5667 // Verify seqno and key range
5668 // Note: the seqno gets changed at the max level by an implementation
5669 // optimization, so skip the check for the max level.
5670 #ifndef NDEBUG
5671 for (size_t k = 0; k < num_db; k++) {
5672 auto db = db_list[k];
5673 db->GetColumnFamilyMetaData(&meta);
5674 // verify the number of sorted runs
5675 if (compaction_style == kCompactionStyleLevel) {
5676 assert(num_levels - 1 == sorted_runs[k].size());
5677 } else if (compaction_style == kCompactionStyleUniversal) {
5678 assert(meta.levels[0].files.size() + num_levels - 1 ==
5679 sorted_runs[k].size());
5680 } else if (compaction_style == kCompactionStyleFIFO) {
5681 // TODO(gzh): FIFO compaction
5682 db->GetColumnFamilyMetaData(&meta);
5683 auto total_size = meta.levels[0].size;
5684 assert(total_size <=
5685 db->GetOptions().compaction_options_fifo.max_table_files_size);
5686 break;
5687 }
5688
5689 // verify smallest/largest seqno and key range of each sorted run
5690 auto max_level = num_levels - 1;
5691 int level;
5692 for (size_t i = 0; i < sorted_runs[k].size(); i++) {
5693 level = static_cast<int>(max_level - i);
5694 SequenceNumber sorted_run_smallest_seqno = kMaxSequenceNumber;
5695 SequenceNumber sorted_run_largest_seqno = 0;
5696 std::string sorted_run_smallest_key, sorted_run_largest_key;
5697 bool first_key = true;
5698 for (auto fileMeta : sorted_runs[k][i]) {
5699 sorted_run_smallest_seqno =
5700 std::min(sorted_run_smallest_seqno, fileMeta.smallest_seqno);
5701 sorted_run_largest_seqno =
5702 std::max(sorted_run_largest_seqno, fileMeta.largest_seqno);
5703 if (first_key ||
5704 db->DefaultColumnFamily()->GetComparator()->Compare(
5705 fileMeta.smallestkey, sorted_run_smallest_key) < 0) {
5706 sorted_run_smallest_key = fileMeta.smallestkey;
5707 }
5708 if (first_key ||
5709 db->DefaultColumnFamily()->GetComparator()->Compare(
5710 fileMeta.largestkey, sorted_run_largest_key) > 0) {
5711 sorted_run_largest_key = fileMeta.largestkey;
5712 }
5713 first_key = false;
5714 }
5715 if (compaction_style == kCompactionStyleLevel ||
5716 (compaction_style == kCompactionStyleUniversal && level > 0)) {
5717 SequenceNumber level_smallest_seqno = kMaxSequenceNumber;
5718 SequenceNumber level_largest_seqno = 0;
5719 for (auto fileMeta : meta.levels[level].files) {
5720 level_smallest_seqno =
5721 std::min(level_smallest_seqno, fileMeta.smallest_seqno);
5722 level_largest_seqno =
5723 std::max(level_largest_seqno, fileMeta.largest_seqno);
5724 }
5725 assert(sorted_run_smallest_key ==
5726 meta.levels[level].files.front().smallestkey);
5727 assert(sorted_run_largest_key ==
5728 meta.levels[level].files.back().largestkey);
5729 if (level != static_cast<int>(max_level)) {
5730 // compaction at max_level would change sequence number
5731 assert(sorted_run_smallest_seqno == level_smallest_seqno);
5732 assert(sorted_run_largest_seqno == level_largest_seqno);
5733 }
5734 } else if (compaction_style == kCompactionStyleUniversal) {
5735 // level <= 0 means sorted runs on level 0
5736 auto level0_file =
5737 meta.levels[0].files[sorted_runs[k].size() - 1 - i];
5738 assert(sorted_run_smallest_key == level0_file.smallestkey);
5739 assert(sorted_run_largest_key == level0_file.largestkey);
5740 if (level != static_cast<int>(max_level)) {
5741 assert(sorted_run_smallest_seqno == level0_file.smallest_seqno);
5742 assert(sorted_run_largest_seqno == level0_file.largest_seqno);
5743 }
5744 }
5745 }
5746 }
5747 #endif
5748 // print the size of each sorted_run
5749 for (size_t k = 0; k < num_db; k++) {
5750 auto db = db_list[k];
5751 fprintf(stdout,
5752 "---------------------- DB %" ROCKSDB_PRIszt
5753 " LSM ---------------------\n",
5754 k);
5755 db->GetColumnFamilyMetaData(&meta);
5756 for (auto& levelMeta : meta.levels) {
5757 if (levelMeta.files.empty()) {
5758 continue;
5759 }
5760 if (levelMeta.level == 0) {
5761 for (auto& fileMeta : levelMeta.files) {
5762 fprintf(stdout, "Level[%d]: %s(size: %" PRIi64 " bytes)\n",
5763 levelMeta.level, fileMeta.name.c_str(), fileMeta.size);
5764 }
5765 } else {
5766 fprintf(stdout, "Level[%d]: %s - %s(total size: %" PRIi64 " bytes)\n",
5767 levelMeta.level, levelMeta.files.front().name.c_str(),
5768 levelMeta.files.back().name.c_str(), levelMeta.size);
5769 }
5770 }
5771 }
5772 for (size_t i = 0; i < num_db; i++) {
5773 db_list[i]->SetOptions(
5774 {{"disable_auto_compactions",
5775 std::to_string(options_list[i].disable_auto_compactions)},
5776 {"level0_slowdown_writes_trigger",
5777 std::to_string(options_list[i].level0_slowdown_writes_trigger)},
5778 {"level0_stop_writes_trigger",
5779 std::to_string(options_list[i].level0_stop_writes_trigger)}});
5780 }
5781 return Status::OK();
5782 #else
5783 (void)thread;
5784 (void)compaction_style;
5785 (void)write_mode;
5786 fprintf(stderr, "Rocksdb Lite doesn't support filldeterministic\n");
5787 return Status::NotSupported(
5788 "Rocksdb Lite doesn't support filldeterministic");
5789 #endif // ROCKSDB_LITE
5790 }
5791
5792 void ReadSequential(ThreadState* thread) {
5793 if (db_.db != nullptr) {
5794 ReadSequential(thread, db_.db);
5795 } else {
5796 for (const auto& db_with_cfh : multi_dbs_) {
5797 ReadSequential(thread, db_with_cfh.db);
5798 }
5799 }
5800 }
5801
5802 void ReadSequential(ThreadState* thread, DB* db) {
5803 ReadOptions options = read_options_;
5804 std::unique_ptr<char[]> ts_guard;
5805 Slice ts;
5806 if (user_timestamp_size_ > 0) {
5807 ts_guard.reset(new char[user_timestamp_size_]);
5808 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
5809 options.timestamp = &ts;
5810 }
5811
5812 options.adaptive_readahead = FLAGS_adaptive_readahead;
5813 options.async_io = FLAGS_async_io;
5814
5815 Iterator* iter = db->NewIterator(options);
5816 int64_t i = 0;
5817 int64_t bytes = 0;
5818 for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
5819 bytes += iter->key().size() + iter->value().size();
5820 thread->stats.FinishedOps(nullptr, db, 1, kRead);
5821 ++i;
5822
5823 if (thread->shared->read_rate_limiter.get() != nullptr &&
5824 i % 1024 == 1023) {
5825 thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
5826 nullptr /* stats */,
5827 RateLimiter::OpType::kRead);
5828 }
5829 }
5830
5831 delete iter;
5832 thread->stats.AddBytes(bytes);
5833 }
5834
5835 void ReadToRowCache(ThreadState* thread) {
5836 int64_t read = 0;
5837 int64_t found = 0;
5838 int64_t bytes = 0;
5839 int64_t key_rand = 0;
5840 std::unique_ptr<const char[]> key_guard;
5841 Slice key = AllocateKey(&key_guard);
5842 PinnableSlice pinnable_val;
5843
5844 while (key_rand < FLAGS_num) {
5845 DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
5846 // We use same key_rand as seed for key and column family so that we can
5847 // deterministically find the cfh corresponding to a particular key, as it
5848 // is done in the DoWrite method.
5849 GenerateKeyFromInt(key_rand, FLAGS_num, &key);
5850 key_rand++;
5851 read++;
5852 Status s;
5853 if (FLAGS_num_column_families > 1) {
5854 s = db_with_cfh->db->Get(read_options_, db_with_cfh->GetCfh(key_rand),
5855 key, &pinnable_val);
5856 } else {
5857 pinnable_val.Reset();
5858 s = db_with_cfh->db->Get(read_options_,
5859 db_with_cfh->db->DefaultColumnFamily(), key,
5860 &pinnable_val);
5861 }
5862
5863 if (s.ok()) {
5864 found++;
5865 bytes += key.size() + pinnable_val.size();
5866 } else if (!s.IsNotFound()) {
5867 fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
5868 abort();
5869 }
5870
5871 if (thread->shared->read_rate_limiter.get() != nullptr &&
5872 read % 256 == 255) {
5873 thread->shared->read_rate_limiter->Request(
5874 256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
5875 }
5876
5877 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
5878 }
5879
5880 char msg[100];
5881 snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
5882 read);
5883
5884 thread->stats.AddBytes(bytes);
5885 thread->stats.AddMessage(msg);
5886 }
5887
5888 void ReadReverse(ThreadState* thread) {
5889 if (db_.db != nullptr) {
5890 ReadReverse(thread, db_.db);
5891 } else {
5892 for (const auto& db_with_cfh : multi_dbs_) {
5893 ReadReverse(thread, db_with_cfh.db);
5894 }
5895 }
5896 }
5897
5898 void ReadReverse(ThreadState* thread, DB* db) {
5899 Iterator* iter = db->NewIterator(read_options_);
5900 int64_t i = 0;
5901 int64_t bytes = 0;
5902 for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
5903 bytes += iter->key().size() + iter->value().size();
5904 thread->stats.FinishedOps(nullptr, db, 1, kRead);
5905 ++i;
5906 if (thread->shared->read_rate_limiter.get() != nullptr &&
5907 i % 1024 == 1023) {
5908 thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
5909 nullptr /* stats */,
5910 RateLimiter::OpType::kRead);
5911 }
5912 }
5913 delete iter;
5914 thread->stats.AddBytes(bytes);
5915 }
5916
5917 void ReadRandomFast(ThreadState* thread) {
5918 int64_t read = 0;
5919 int64_t found = 0;
5920 int64_t nonexist = 0;
5921 ReadOptions options = read_options_;
5922 std::unique_ptr<const char[]> key_guard;
5923 Slice key = AllocateKey(&key_guard);
5924 std::string value;
5925 Slice ts;
5926 std::unique_ptr<char[]> ts_guard;
5927 if (user_timestamp_size_ > 0) {
5928 ts_guard.reset(new char[user_timestamp_size_]);
5929 }
5930 DB* db = SelectDBWithCfh(thread)->db;
5931
5932 int64_t pot = 1;
5933 while (pot < FLAGS_num) {
5934 pot <<= 1;
5935 }
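// pot is the smallest power of two >= FLAGS_num, so masking with (pot - 1)
// is a cheap substitute for modulo; keys drawn beyond FLAGS_num are expected
// misses and are counted in `nonexist` below.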
5936
5937 Duration duration(FLAGS_duration, reads_);
5938 do {
5939 for (int i = 0; i < 100; ++i) {
5940 int64_t key_rand = thread->rand.Next() & (pot - 1);
5941 GenerateKeyFromInt(key_rand, FLAGS_num, &key);
5942 ++read;
5943 std::string ts_ret;
5944 std::string* ts_ptr = nullptr;
5945 if (user_timestamp_size_ > 0) {
5946 ts = mock_app_clock_->GetTimestampForRead(thread->rand,
5947 ts_guard.get());
5948 options.timestamp = &ts;
5949 ts_ptr = &ts_ret;
5950 }
5951 auto status = db->Get(options, key, &value, ts_ptr);
5952 if (status.ok()) {
5953 ++found;
5954 } else if (!status.IsNotFound()) {
5955 fprintf(stderr, "Get returned an error: %s\n",
5956 status.ToString().c_str());
5957 abort();
5958 }
5959 if (key_rand >= FLAGS_num) {
5960 ++nonexist;
5961 }
5962 }
5963 if (thread->shared->read_rate_limiter.get() != nullptr) {
5964 thread->shared->read_rate_limiter->Request(
5965 100, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
5966 }
5967
5968 thread->stats.FinishedOps(nullptr, db, 100, kRead);
5969 } while (!duration.Done(100));
5970
5971 char msg[100];
5972 snprintf(msg, sizeof(msg),
5973 "(%" PRIu64 " of %" PRIu64
5974 " found, "
5975 "issued %" PRIu64 " non-exist keys)\n",
5976 found, read, nonexist);
5977
5978 thread->stats.AddMessage(msg);
5979 }
5980
5981 int64_t GetRandomKey(Random64* rand) {
5982 uint64_t rand_int = rand->Next();
5983 int64_t key_rand;
5984 if (read_random_exp_range_ == 0) {
5985 key_rand = rand_int % FLAGS_num;
5986 } else {
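// rand_int is mapped through exp(-u * read_random_exp_range_) with u
// uniform in [0, 1), skewing the draw heavily toward small key indices;
// the multiplication by kBigPrime below then scatters those hot indices so
// the skew does not translate into physical key locality.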
5987 const uint64_t kBigInt = static_cast<uint64_t>(1U) << 62;
5988 long double order = -static_cast<long double>(rand_int % kBigInt) /
5989 static_cast<long double>(kBigInt) *
5990 read_random_exp_range_;
5991 long double exp_ran = std::exp(order);
5992 uint64_t rand_num =
5993 static_cast<int64_t>(exp_ran * static_cast<long double>(FLAGS_num));
5994 // Map to a different number to avoid locality.
5995 const uint64_t kBigPrime = 0x5bd1e995;
5996 // Multiplication overflow wraps modulo 2^64, which has little impact on results.
5997 key_rand = static_cast<int64_t>((rand_num * kBigPrime) % FLAGS_num);
5998 }
5999 return key_rand;
6000 }
6001
6002 void ReadRandom(ThreadState* thread) {
6003 int64_t read = 0;
6004 int64_t found = 0;
6005 int64_t bytes = 0;
6006 int num_keys = 0;
6007 int64_t key_rand = 0;
6008 ReadOptions options = read_options_;
6009 std::unique_ptr<const char[]> key_guard;
6010 Slice key = AllocateKey(&key_guard);
6011 PinnableSlice pinnable_val;
6012 std::vector<PinnableSlice> pinnable_vals;
6013 if (read_operands_) {
6014 // Start off with a small-ish value that'll be increased later if
6015 // `GetMergeOperands()` tells us it is not large enough.
6016 pinnable_vals.resize(8);
6017 }
6018 std::unique_ptr<char[]> ts_guard;
6019 Slice ts;
6020 if (user_timestamp_size_ > 0) {
6021 ts_guard.reset(new char[user_timestamp_size_]);
6022 }
6023
6024 Duration duration(FLAGS_duration, reads_);
6025 while (!duration.Done(1)) {
6026 DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
6027 // We use same key_rand as seed for key and column family so that we can
6028 // deterministically find the cfh corresponding to a particular key, as it
6029 // is done in the DoWrite method.
6030 if (entries_per_batch_ > 1 && FLAGS_multiread_stride) {
6031 if (++num_keys == entries_per_batch_) {
6032 num_keys = 0;
6033 key_rand = GetRandomKey(&thread->rand);
6034 if ((key_rand + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
6035 FLAGS_num) {
6036 key_rand = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
6037 }
6038 } else {
6039 key_rand += FLAGS_multiread_stride;
6040 }
6041 } else {
6042 key_rand = GetRandomKey(&thread->rand);
6043 }
6044 GenerateKeyFromInt(key_rand, FLAGS_num, &key);
6045 read++;
6046 std::string ts_ret;
6047 std::string* ts_ptr = nullptr;
6048 if (user_timestamp_size_ > 0) {
6049 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
6050 options.timestamp = &ts;
6051 ts_ptr = &ts_ret;
6052 }
6053 Status s;
6054 pinnable_val.Reset();
6055 for (size_t i = 0; i < pinnable_vals.size(); ++i) {
6056 pinnable_vals[i].Reset();
6057 }
6058 ColumnFamilyHandle* cfh;
6059 if (FLAGS_num_column_families > 1) {
6060 cfh = db_with_cfh->GetCfh(key_rand);
6061 } else {
6062 cfh = db_with_cfh->db->DefaultColumnFamily();
6063 }
6064 if (read_operands_) {
6065 GetMergeOperandsOptions get_merge_operands_options;
6066 get_merge_operands_options.expected_max_number_of_operands =
6067 static_cast<int>(pinnable_vals.size());
6068 int number_of_operands;
6069 s = db_with_cfh->db->GetMergeOperands(
6070 options, cfh, key, pinnable_vals.data(),
6071 &get_merge_operands_options, &number_of_operands);
6072 if (s.IsIncomplete()) {
6073 // Should only happen a few times when we encounter a key that had
6074 // more merge operands than any key seen so far. A production use case
6075 // would typically retry in such an event to get all the operands, so
6076 // do that here.
6077 pinnable_vals.resize(number_of_operands);
6078 get_merge_operands_options.expected_max_number_of_operands =
6079 static_cast<int>(pinnable_vals.size());
6080 s = db_with_cfh->db->GetMergeOperands(
6081 options, cfh, key, pinnable_vals.data(),
6082 &get_merge_operands_options, &number_of_operands);
6083 }
6084 } else {
6085 s = db_with_cfh->db->Get(options, cfh, key, &pinnable_val, ts_ptr);
6086 }
6087
6088 if (s.ok()) {
6089 found++;
6090 bytes += key.size() + pinnable_val.size() + user_timestamp_size_;
6091 for (size_t i = 0; i < pinnable_vals.size(); ++i) {
6092 bytes += pinnable_vals[i].size();
6093 pinnable_vals[i].Reset();
6094 }
6095 } else if (!s.IsNotFound()) {
6096 fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
6097 abort();
6098 }
6099
6100 if (thread->shared->read_rate_limiter.get() != nullptr &&
6101 read % 256 == 255) {
6102 thread->shared->read_rate_limiter->Request(
6103 256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
6104 }
6105
6106 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
6107 }
6108
6109 char msg[100];
6110 snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
6111 read);
6112
6113 thread->stats.AddBytes(bytes);
6114 thread->stats.AddMessage(msg);
6115 }
6116
6117 // Calls MultiGet over a list of keys from a random distribution.
6118 // Reports the total number of keys found.
6119 void MultiReadRandom(ThreadState* thread) {
6120 int64_t read = 0;
6121 int64_t bytes = 0;
6122 int64_t num_multireads = 0;
6123 int64_t found = 0;
6124 ReadOptions options = read_options_;
6125 std::vector<Slice> keys;
6126 std::vector<std::unique_ptr<const char[]>> key_guards;
6127 std::vector<std::string> values(entries_per_batch_);
6128 PinnableSlice* pin_values = new PinnableSlice[entries_per_batch_];
6129 std::unique_ptr<PinnableSlice[]> pin_values_guard(pin_values);
6130 std::vector<Status> stat_list(entries_per_batch_);
6131 while (static_cast<int64_t>(keys.size()) < entries_per_batch_) {
6132 key_guards.push_back(std::unique_ptr<const char[]>());
6133 keys.push_back(AllocateKey(&key_guards.back()));
6134 }
6135
6136 std::unique_ptr<char[]> ts_guard;
6137 if (user_timestamp_size_ > 0) {
6138 ts_guard.reset(new char[user_timestamp_size_]);
6139 }
6140
6141 Duration duration(FLAGS_duration, reads_);
6142 while (!duration.Done(entries_per_batch_)) {
6143 DB* db = SelectDB(thread);
6144 if (FLAGS_multiread_stride) {
6145 int64_t key = GetRandomKey(&thread->rand);
6146 if ((key + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
6147 static_cast<int64_t>(FLAGS_num)) {
6148 key = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
6149 }
6150 for (int64_t i = 0; i < entries_per_batch_; ++i) {
6151 GenerateKeyFromInt(key, FLAGS_num, &keys[i]);
6152 key += FLAGS_multiread_stride;
6153 }
6154 } else {
6155 for (int64_t i = 0; i < entries_per_batch_; ++i) {
6156 GenerateKeyFromInt(GetRandomKey(&thread->rand), FLAGS_num, &keys[i]);
6157 }
6158 }
6159 Slice ts;
6160 if (user_timestamp_size_ > 0) {
6161 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
6162 options.timestamp = &ts;
6163 }
6164 if (!FLAGS_multiread_batched) {
6165 std::vector<Status> statuses = db->MultiGet(options, keys, &values);
6166 assert(static_cast<int64_t>(statuses.size()) == entries_per_batch_);
6167
6168 read += entries_per_batch_;
6169 num_multireads++;
6170 for (int64_t i = 0; i < entries_per_batch_; ++i) {
6171 if (statuses[i].ok()) {
6172 bytes += keys[i].size() + values[i].size() + user_timestamp_size_;
6173 ++found;
6174 } else if (!statuses[i].IsNotFound()) {
6175 fprintf(stderr, "MultiGet returned an error: %s\n",
6176 statuses[i].ToString().c_str());
6177 abort();
6178 }
6179 }
6180 } else {
6181 db->MultiGet(options, db->DefaultColumnFamily(), keys.size(),
6182 keys.data(), pin_values, stat_list.data());
6183
6184 read += entries_per_batch_;
6185 num_multireads++;
6186 for (int64_t i = 0; i < entries_per_batch_; ++i) {
6187 if (stat_list[i].ok()) {
6188 bytes +=
6189 keys[i].size() + pin_values[i].size() + user_timestamp_size_;
6190 ++found;
6191 } else if (!stat_list[i].IsNotFound()) {
6192 fprintf(stderr, "MultiGet returned an error: %s\n",
6193 stat_list[i].ToString().c_str());
6194 abort();
6195 }
6196 stat_list[i] = Status::OK();
6197 pin_values[i].Reset();
6198 }
6199 }
6200 if (thread->shared->read_rate_limiter.get() != nullptr &&
6201 num_multireads % 256 == 255) {
6202 thread->shared->read_rate_limiter->Request(
6203 256 * entries_per_batch_, Env::IO_HIGH, nullptr /* stats */,
6204 RateLimiter::OpType::kRead);
6205 }
6206 thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kRead);
6207 }
6208
6209 char msg[100];
6210 snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found,
6211 read);
6212 thread->stats.AddBytes(bytes);
6213 thread->stats.AddMessage(msg);
6214 }
6215 }
6216 // Calls ApproximateSize over random key ranges.
6217 void ApproximateSizeRandom(ThreadState* thread) {
6218 int64_t size_sum = 0;
6219 int64_t num_sizes = 0;
6220 const size_t batch_size = entries_per_batch_;
6221 std::vector<Range> ranges;
6222 std::vector<Slice> lkeys;
6223 std::vector<std::unique_ptr<const char[]>> lkey_guards;
6224 std::vector<Slice> rkeys;
6225 std::vector<std::unique_ptr<const char[]>> rkey_guards;
6226 std::vector<uint64_t> sizes;
6227 while (ranges.size() < batch_size) {
6228 // Ugly without C++17 return from emplace_back
6229 lkey_guards.emplace_back();
6230 rkey_guards.emplace_back();
6231 lkeys.emplace_back(AllocateKey(&lkey_guards.back()));
6232 rkeys.emplace_back(AllocateKey(&rkey_guards.back()));
6233 ranges.emplace_back(lkeys.back(), rkeys.back());
6234 sizes.push_back(0);
6235 }
6236 Duration duration(FLAGS_duration, reads_);
6237 while (!duration.Done(1)) {
6238 DB* db = SelectDB(thread);
6239 for (size_t i = 0; i < batch_size; ++i) {
6240 int64_t lkey = GetRandomKey(&thread->rand);
6241 int64_t rkey = GetRandomKey(&thread->rand);
6242 if (lkey > rkey) {
6243 std::swap(lkey, rkey);
6244 }
6245 GenerateKeyFromInt(lkey, FLAGS_num, &lkeys[i]);
6246 GenerateKeyFromInt(rkey, FLAGS_num, &rkeys[i]);
6247 }
6248 db->GetApproximateSizes(&ranges[0], static_cast<int>(entries_per_batch_),
6249 &sizes[0]);
6250 num_sizes += entries_per_batch_;
6251 for (int64_t size : sizes) {
6252 size_sum += size;
6253 }
6254 thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kOthers);
6255 }
6256
6257 char msg[100];
6258 snprintf(msg, sizeof(msg), "(Avg approx size=%g)",
6259 static_cast<double>(size_sum) / static_cast<double>(num_sizes));
6260 thread->stats.AddMessage(msg);
6261 }
6262
6263 // The inverse CDF of the Pareto distribution
6264 int64_t ParetoCdfInversion(double u, double theta, double k, double sigma) {
6265 double ret;
6266 if (k == 0.0) {
6267 ret = theta - sigma * std::log(u);
6268 } else {
6269 ret = theta + sigma * (std::pow(u, -1 * k) - 1) / k;
6270 }
6271 return static_cast<int64_t>(ceil(ret));
6272 }
6273 // The inverse CDF of the power distribution (y = a * x^b)
6274 int64_t PowerCdfInversion(double u, double a, double b) {
6275 double ret;
6276 ret = std::pow((u / a), (1 / b));
6277 return static_cast<int64_t>(ceil(ret));
6278 }
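// Worked example (illustrative parameters only): with theta = 0, k = 1 and
// sigma = 100, ParetoCdfInversion(0.5, 0, 1, 100) = 100 * (1/0.5 - 1) = 100,
// while a smaller u (e.g. 0.1) lands in the heavy tail: 100 * (10 - 1) = 900.
// PowerCdfInversion simply inverts y = a * x^b, e.g. with a = 1, b = 2 and
// u = 0.25 it returns ceil(sqrt(0.25)) = 1.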
6279
6280 // Add noise to the QPS
6281 double AddNoise(double origin, double noise_ratio) {
6282 if (noise_ratio < 0.0 || noise_ratio > 1.0) {
6283 return origin;
6284 }
6285 int band_int = static_cast<int>(FLAGS_sine_a);
6286 double delta = (rand() % band_int - band_int / 2) * noise_ratio;
6287 if (origin + delta < 0) {
6288 return origin;
6289 } else {
6290 return (origin + delta);
6291 }
6292 }
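// For example (illustrative numbers): with FLAGS_sine_a = 1000 and
// noise_ratio = 0.5, delta is drawn from roughly [-250, +250), so the
// returned QPS wobbles around `origin` by up to a quarter of FLAGS_sine_a.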
6293
6294 // Decides which query type a random number maps to, given the ratios of
6295 // the types: 0 Get, 1 Put, 2 Seek, 3 SeekForPrev, 4 Delete, 5 SingleDelete, 6 Merge
6296 class QueryDecider {
6297 public:
6298 std::vector<int> type_;
6299 std::vector<double> ratio_;
6300 int range_;
6301
6302 QueryDecider() {}
6303 ~QueryDecider() {}
6304
6305 Status Initiate(std::vector<double> ratio_input) {
6306 int range_max = 1000;
6307 double sum = 0.0;
6308 for (auto& ratio : ratio_input) {
6309 sum += ratio;
6310 }
6311 range_ = 0;
6312 for (auto& ratio : ratio_input) {
6313 range_ += static_cast<int>(ceil(range_max * (ratio / sum)));
6314 type_.push_back(range_);
6315 ratio_.push_back(ratio / sum);
6316 }
6317 return Status::OK();
6318 }
6319
6320 int GetType(int64_t rand_num) {
6321 if (rand_num < 0) {
6322 rand_num = rand_num * (-1);
6323 }
6324 assert(range_ != 0);
6325 int pos = static_cast<int>(rand_num % range_);
6326 for (int i = 0; i < static_cast<int>(type_.size()); i++) {
6327 if (pos < type_[i]) {
6328 return i;
6329 }
6330 }
6331 return 0;
6332 }
6333 };
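// A small usage sketch (the ratios and rand_num are hypothetical): with
// Get/Put/Seek ratios of 0.7/0.2/0.1, Initiate() builds cumulative boundaries
// {700, 900, 1000} over range_ = 1000, so GetType() maps a random number to
// 0 (Get) with probability 0.7, 1 (Put) with 0.2, and 2 (Seek) with 0.1:
//
//   QueryDecider query;
//   query.Initiate({0.7, 0.2, 0.1});
//   int type = query.GetType(rand_num);  // 0 => Get, 1 => Put, 2 => Seek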
6334
6335 // KeyrangeUnit describes one key-range. A vector of KeyrangeUnits is used
6336 // to map a random value to one key-range based on its hotness.
6337 struct KeyrangeUnit {
6338 int64_t keyrange_start;
6339 int64_t keyrange_access;
6340 int64_t keyrange_keys;
6341 };
6342
6343 // From our observations, the prefix hotness (key-range hotness) follows
6344 // a two-term exponential distribution: f(x) = a*exp(b*x) + c*exp(d*x).
6345 // However, we cannot directly use the inverse function to pick a
6346 // key-range from a random distribution. To achieve it, we create a list of
6347 // KeyrangeUnits; each KeyrangeUnit occupies a range of integers whose size
6348 // is decided by the hotness of its key-range. When a random value is
6349 // generated from a uniform distribution, we map it into the KeyrangeUnit
6350 // vector and one KeyrangeUnit is selected. The probability of a KeyrangeUnit
6351 // being selected equals the hotness of that KeyrangeUnit. After that, the
6352 // key can be allocated uniformly within the key-range of this KeyrangeUnit,
6353 // or we can use the power distribution (y = a*x^b) to generate the offset of
6354 // the key in the selected key-range. In this way, we generate the key ID
6355 // based on both the prefix hotness and the key hotness distribution.
6356 class GenerateTwoTermExpKeys {
6357 public:
6358 // Avoid uninitialized warning-as-error in some compilers
6359 int64_t keyrange_rand_max_ = 0;
6360 int64_t keyrange_size_ = 0;
6361 int64_t keyrange_num_ = 0;
6362 std::vector<KeyrangeUnit> keyrange_set_;
6363
6364 // Initiate the KeyrangeUnit vector and calculate the size of each
6365 // KeyrangeUnit.
6366 Status InitiateExpDistribution(int64_t total_keys, double prefix_a,
6367 double prefix_b, double prefix_c,
6368 double prefix_d) {
6369 int64_t amplify = 0;
6370 int64_t keyrange_start = 0;
6371 if (FLAGS_keyrange_num <= 0) {
6372 keyrange_num_ = 1;
6373 } else {
6374 keyrange_num_ = FLAGS_keyrange_num;
6375 }
6376 keyrange_size_ = total_keys / keyrange_num_;
6377
6378 // Calculate the key-range shares size based on the input parameters
6379 for (int64_t pfx = keyrange_num_; pfx >= 1; pfx--) {
6380 // Step 1. Calculate the probability that this key range will be
6381 // accessed in a query. It is based on the two-term exponential
6382 // distribution.
6383 double keyrange_p = prefix_a * std::exp(prefix_b * pfx) +
6384 prefix_c * std::exp(prefix_d * pfx);
6385 if (keyrange_p < std::pow(10.0, -16.0)) {
6386 keyrange_p = 0.0;
6387 }
6388 // Step 2. Calculate the amplification factor
6389 // In order to allocate a query to a key-range based on the random
6390 // number generated for this query, we need to scale the probability
6391 // of each key-range from [0, 1] to [0, amplify]. Amplify is calculated
6392 // as 1 / (smallest non-zero key-range probability). In this way, every
6393 // key-range is assigned an integer share that is >= 0.
6394 if (amplify == 0 && keyrange_p > 0) {
6395 amplify = static_cast<int64_t>(std::floor(1 / keyrange_p)) + 1;
6396 }
6397
6398 // Step 3. For each key-range, we calculate its position in the
6399 // [0, amplify] range, including the start, the size (keyrange_access)
6400 KeyrangeUnit p_unit;
6401 p_unit.keyrange_start = keyrange_start;
6402 if (0.0 >= keyrange_p) {
6403 p_unit.keyrange_access = 0;
6404 } else {
6405 p_unit.keyrange_access =
6406 static_cast<int64_t>(std::floor(amplify * keyrange_p));
6407 }
6408 p_unit.keyrange_keys = keyrange_size_;
6409 keyrange_set_.push_back(p_unit);
6410 keyrange_start += p_unit.keyrange_access;
6411 }
6412 keyrange_rand_max_ = keyrange_start;
6413
6414 // Step 4. Shuffle the key-ranges randomly
6415 // Since the access probability is calculated from small to large,
6416 // if we do not re-allocate them, hot key-ranges always end up at the end
6417 // and cold key-ranges at the beginning of the key space. Therefore, the
6418 // key-ranges are shuffled, and the random seed is decided only by the
6419 // key-range hotness distribution. With the same distribution parameters,
6420 // the shuffle results are the same.
6421 Random64 rand_loca(keyrange_rand_max_);
6422 for (int64_t i = 0; i < FLAGS_keyrange_num; i++) {
6423 int64_t pos = rand_loca.Next() % FLAGS_keyrange_num;
6424 assert(i >= 0 && i < static_cast<int64_t>(keyrange_set_.size()) &&
6425 pos >= 0 && pos < static_cast<int64_t>(keyrange_set_.size()));
6426 std::swap(keyrange_set_[i], keyrange_set_[pos]);
6427 }
6428
6429 // Step 5. Recalculate the prefix start position after shuffling
6430 int64_t offset = 0;
6431 for (auto& p_unit : keyrange_set_) {
6432 p_unit.keyrange_start = offset;
6433 offset += p_unit.keyrange_access;
6434 }
6435
6436 return Status::OK();
6437 }
6438
6439 // Generate the Key ID according to the input ini_rand and key distribution
6440 int64_t DistGetKeyID(int64_t ini_rand, double key_dist_a,
6441 double key_dist_b) {
6442 int64_t keyrange_rand = ini_rand % keyrange_rand_max_;
6443
6444 // Calculate and select one key-range that contains the new key
6445 int64_t start = 0, end = static_cast<int64_t>(keyrange_set_.size());
6446 while (start + 1 < end) {
6447 int64_t mid = start + (end - start) / 2;
6448 assert(mid >= 0 && mid < static_cast<int64_t>(keyrange_set_.size()));
6449 if (keyrange_rand < keyrange_set_[mid].keyrange_start) {
6450 end = mid;
6451 } else {
6452 start = mid;
6453 }
6454 }
6455 int64_t keyrange_id = start;
6456
6457 // Select one key in the key-range and compose the keyID
6458 int64_t key_offset = 0, key_seed;
6459 if (key_dist_a == 0.0 || key_dist_b == 0.0) {
6460 key_offset = ini_rand % keyrange_size_;
6461 } else {
6462 double u =
6463 static_cast<double>(ini_rand % keyrange_size_) / keyrange_size_;
6464 key_seed = static_cast<int64_t>(
6465 ceil(std::pow((u / key_dist_a), (1 / key_dist_b))));
6466 Random64 rand_key(key_seed);
6467 key_offset = rand_key.Next() % keyrange_size_;
6468 }
6469 return keyrange_size_ * keyrange_id + key_offset;
6470 }
6471 };
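// A rough usage sketch (the distribution parameters below are placeholders,
// not recommended values); MixGraph below drives this class the same way:
//
//   GenerateTwoTermExpKeys gen_exp;
//   gen_exp.InitiateExpDistribution(/*total_keys=*/FLAGS_num,
//                                   /*prefix_a=*/1.0, /*prefix_b=*/-1.0,
//                                   /*prefix_c=*/1.0, /*prefix_d=*/-1.0);
//   // Map a uniform random value to a key ID that honors both the
//   // key-range (prefix) hotness and the within-range key hotness.
//   int64_t key_id = gen_exp.DistGetKeyID(ini_rand, FLAGS_key_dist_a,
//                                         FLAGS_key_dist_b);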
6472
6473 // A social-graph-style workload mixing Get, Put, and Iterator queries.
6474 // The value size and iterator scan length follow a Pareto distribution.
6475 // The overall key access follows a power distribution. If the user models
6476 // the workload based on different key-ranges (or different prefixes), a
6477 // two-term exponential distribution can be used to fit the workload. The
6478 // user needs to decide the ratio between Get, Put, and Iterator queries
6479 // before starting the benchmark.
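// A minimal illustrative invocation, assuming the benchmark is registered as
// "mixgraph" in the standard db_bench driver (all flag values below are
// placeholders, not recommendations):
//
//   ./db_bench --benchmarks=mixgraph --num=100000000 \
//     --mix_get_ratio=0.8 --mix_put_ratio=0.15 --mix_seek_ratio=0.05 \
//     --key_dist_a=0.002 --key_dist_b=0.3 \
//     --value_theta=0 --value_k=1.0 --value_sigma=100 \
//     --iter_theta=0 --iter_k=1.0 --iter_sigma=10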
6480 void MixGraph(ThreadState* thread) {
6481 int64_t gets = 0;
6482 int64_t puts = 0;
6483 int64_t get_found = 0;
6484 int64_t seek = 0;
6485 int64_t seek_found = 0;
6486 int64_t bytes = 0;
6487 double total_scan_length = 0;
6488 double total_val_size = 0;
6489 const int64_t default_value_max = 1 * 1024 * 1024;
6490 int64_t value_max = default_value_max;
6491 int64_t scan_len_max = FLAGS_mix_max_scan_len;
6492 double write_rate = 1000000.0;
6493 double read_rate = 1000000.0;
6494 bool use_prefix_modeling = false;
6495 bool use_random_modeling = false;
6496 GenerateTwoTermExpKeys gen_exp;
6497 std::vector<double> ratio{FLAGS_mix_get_ratio, FLAGS_mix_put_ratio,
6498 FLAGS_mix_seek_ratio};
6499 char value_buffer[default_value_max];
6500 QueryDecider query;
6501 RandomGenerator gen;
6502 Status s;
6503 if (value_max > FLAGS_mix_max_value_size) {
6504 value_max = FLAGS_mix_max_value_size;
6505 }
6506
6507 std::unique_ptr<const char[]> key_guard;
6508 Slice key = AllocateKey(&key_guard);
6509 PinnableSlice pinnable_val;
6510 query.Initiate(ratio);
6511
6512 // Initialize the rate limiters that cap the mixed-workload QPS
6513 if (FLAGS_sine_mix_rate) {
6514 thread->shared->read_rate_limiter.reset(
6515 NewGenericRateLimiter(static_cast<int64_t>(read_rate)));
6516 thread->shared->write_rate_limiter.reset(
6517 NewGenericRateLimiter(static_cast<int64_t>(write_rate)));
6518 }
6519
6520 // Decide if user wants to use prefix based key generation
6521 if (FLAGS_keyrange_dist_a != 0.0 || FLAGS_keyrange_dist_b != 0.0 ||
6522 FLAGS_keyrange_dist_c != 0.0 || FLAGS_keyrange_dist_d != 0.0) {
6523 use_prefix_modeling = true;
6524 gen_exp.InitiateExpDistribution(
6525 FLAGS_num, FLAGS_keyrange_dist_a, FLAGS_keyrange_dist_b,
6526 FLAGS_keyrange_dist_c, FLAGS_keyrange_dist_d);
6527 }
6528 if (FLAGS_key_dist_a == 0 || FLAGS_key_dist_b == 0) {
6529 use_random_modeling = true;
6530 }
6531
6532 Duration duration(FLAGS_duration, reads_);
6533 while (!duration.Done(1)) {
6534 DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
6535 int64_t ini_rand, rand_v, key_rand, key_seed;
6536 ini_rand = GetRandomKey(&thread->rand);
6537 rand_v = ini_rand % FLAGS_num;
6538 double u = static_cast<double>(rand_v) / FLAGS_num;
6539
6540 // Generate the keyID based on the key hotness and prefix hotness
6541 if (use_random_modeling) {
6542 key_rand = ini_rand;
6543 } else if (use_prefix_modeling) {
6544 key_rand =
6545 gen_exp.DistGetKeyID(ini_rand, FLAGS_key_dist_a, FLAGS_key_dist_b);
6546 } else {
6547 key_seed = PowerCdfInversion(u, FLAGS_key_dist_a, FLAGS_key_dist_b);
6548 Random64 rand(key_seed);
6549 key_rand = static_cast<int64_t>(rand.Next()) % FLAGS_num;
6550 }
6551 GenerateKeyFromInt(key_rand, FLAGS_num, &key);
6552 int query_type = query.GetType(rand_v);
6553
6554 // Adjust the QPS according to the sine-based rate schedule
6555 uint64_t now = FLAGS_env->NowMicros();
6556 uint64_t usecs_since_last;
6557 if (now > thread->stats.GetSineInterval()) {
6558 usecs_since_last = now - thread->stats.GetSineInterval();
6559 } else {
6560 usecs_since_last = 0;
6561 }
6562
6563 if (FLAGS_sine_mix_rate &&
6564 usecs_since_last >
6565 (FLAGS_sine_mix_rate_interval_milliseconds * uint64_t{1000})) {
6566 double usecs_since_start =
6567 static_cast<double>(now - thread->stats.GetStart());
6568 thread->stats.ResetSineInterval();
6569 double mix_rate_with_noise = AddNoise(
6570 SineRate(usecs_since_start / 1000000.0), FLAGS_sine_mix_rate_noise);
6571 read_rate = mix_rate_with_noise * (query.ratio_[0] + query.ratio_[2]);
6572 write_rate = mix_rate_with_noise * query.ratio_[1];
6573
6574 if (read_rate > 0) {
6575 thread->shared->read_rate_limiter->SetBytesPerSecond(
6576 static_cast<int64_t>(read_rate));
6577 }
6578 if (write_rate > 0) {
6579 thread->shared->write_rate_limiter->SetBytesPerSecond(
6580 static_cast<int64_t>(write_rate));
6581 }
6582 }
6583 // Start the query
6584 if (query_type == 0) {
6585 // the Get query
6586 gets++;
6587 if (FLAGS_num_column_families > 1) {
6588 s = db_with_cfh->db->Get(read_options_, db_with_cfh->GetCfh(key_rand),
6589 key, &pinnable_val);
6590 } else {
6591 pinnable_val.Reset();
6592 s = db_with_cfh->db->Get(read_options_,
6593 db_with_cfh->db->DefaultColumnFamily(), key,
6594 &pinnable_val);
6595 }
6596
6597 if (s.ok()) {
6598 get_found++;
6599 bytes += key.size() + pinnable_val.size();
6600 } else if (!s.IsNotFound()) {
6601 fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
6602 abort();
6603 }
6604
6605 if (thread->shared->read_rate_limiter && (gets + seek) % 100 == 0) {
6606 thread->shared->read_rate_limiter->Request(100, Env::IO_HIGH,
6607 nullptr /*stats*/);
6608 }
6609 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
6610 } else if (query_type == 1) {
6611 // the Put query
6612 puts++;
6613 int64_t val_size = ParetoCdfInversion(u, FLAGS_value_theta,
6614 FLAGS_value_k, FLAGS_value_sigma);
6615 if (val_size < 10) {
6616 val_size = 10;
6617 } else if (val_size > value_max) {
6618 val_size = val_size % value_max;
6619 }
6620 total_val_size += val_size;
6621
6622 s = db_with_cfh->db->Put(
6623 write_options_, key,
6624 gen.Generate(static_cast<unsigned int>(val_size)));
6625 if (!s.ok()) {
6626 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
6627 ErrorExit();
6628 }
6629
6630 if (thread->shared->write_rate_limiter && puts % 100 == 0) {
6631 thread->shared->write_rate_limiter->Request(100, Env::IO_HIGH,
6632 nullptr /*stats*/);
6633 }
6634 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kWrite);
6635 } else if (query_type == 2) {
6636 // Seek query
6637 if (db_with_cfh->db != nullptr) {
6638 Iterator* single_iter = nullptr;
6639 single_iter = db_with_cfh->db->NewIterator(read_options_);
6640 if (single_iter != nullptr) {
6641 single_iter->Seek(key);
6642 seek++;
6643 if (single_iter->Valid() && single_iter->key().compare(key) == 0) {
6644 seek_found++;
6645 }
6646 int64_t scan_length =
6647 ParetoCdfInversion(u, FLAGS_iter_theta, FLAGS_iter_k,
6648 FLAGS_iter_sigma) %
6649 scan_len_max;
6650 for (int64_t j = 0; j < scan_length && single_iter->Valid(); j++) {
6651 Slice value = single_iter->value();
6652 memcpy(value_buffer, value.data(),
6653 std::min(value.size(), sizeof(value_buffer)));
6654 bytes += single_iter->key().size() + single_iter->value().size();
6655 single_iter->Next();
6656 assert(single_iter->status().ok());
6657 total_scan_length++;
6658 }
6659 }
6660 delete single_iter;
6661 }
6662 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kSeek);
6663 }
6664 }
6665 char msg[256];
6666 snprintf(msg, sizeof(msg),
6667 "( Gets:%" PRIu64 " Puts:%" PRIu64 " Seek:%" PRIu64
6668 ", reads %" PRIu64 " in %" PRIu64
6669 " found, "
6670 "avg size: %.1f value, %.1f scan)\n",
6671 gets, puts, seek, get_found + seek_found, gets + seek,
6672 total_val_size / puts, total_scan_length / seek);
6673
6674 thread->stats.AddBytes(bytes);
6675 thread->stats.AddMessage(msg);
6676 }
6677
6678 void IteratorCreation(ThreadState* thread) {
6679 Duration duration(FLAGS_duration, reads_);
6680 ReadOptions options = read_options_;
6681 std::unique_ptr<char[]> ts_guard;
6682 if (user_timestamp_size_ > 0) {
6683 ts_guard.reset(new char[user_timestamp_size_]);
6684 }
6685 while (!duration.Done(1)) {
6686 DB* db = SelectDB(thread);
6687 Slice ts;
6688 if (user_timestamp_size_ > 0) {
6689 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
6690 options.timestamp = &ts;
6691 }
6692 Iterator* iter = db->NewIterator(options);
6693 delete iter;
6694 thread->stats.FinishedOps(nullptr, db, 1, kOthers);
6695 }
6696 }
6697
6698 void IteratorCreationWhileWriting(ThreadState* thread) {
6699 if (thread->tid > 0) {
6700 IteratorCreation(thread);
6701 } else {
6702 BGWriter(thread, kWrite);
6703 }
6704 }
6705
6706 void SeekRandom(ThreadState* thread) {
6707 int64_t read = 0;
6708 int64_t found = 0;
6709 int64_t bytes = 0;
6710 ReadOptions options = read_options_;
6711 std::unique_ptr<char[]> ts_guard;
6712 Slice ts;
6713 if (user_timestamp_size_ > 0) {
6714 ts_guard.reset(new char[user_timestamp_size_]);
6715 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
6716 options.timestamp = &ts;
6717 }
6718
6719 std::vector<Iterator*> tailing_iters;
6720 if (FLAGS_use_tailing_iterator) {
6721 if (db_.db != nullptr) {
6722 tailing_iters.push_back(db_.db->NewIterator(options));
6723 } else {
6724 for (const auto& db_with_cfh : multi_dbs_) {
6725 tailing_iters.push_back(db_with_cfh.db->NewIterator(options));
6726 }
6727 }
6728 }
6729 options.auto_prefix_mode = FLAGS_auto_prefix_mode;
6730
6731 std::unique_ptr<const char[]> key_guard;
6732 Slice key = AllocateKey(&key_guard);
6733
6734 std::unique_ptr<const char[]> upper_bound_key_guard;
6735 Slice upper_bound = AllocateKey(&upper_bound_key_guard);
6736 std::unique_ptr<const char[]> lower_bound_key_guard;
6737 Slice lower_bound = AllocateKey(&lower_bound_key_guard);
6738
6739 Duration duration(FLAGS_duration, reads_);
6740 char value_buffer[256];
6741 while (!duration.Done(1)) {
6742 int64_t seek_pos = thread->rand.Next() % FLAGS_num;
6743 GenerateKeyFromIntForSeek(static_cast<uint64_t>(seek_pos), FLAGS_num,
6744 &key);
6745 if (FLAGS_max_scan_distance != 0) {
6746 if (FLAGS_reverse_iterator) {
6747 GenerateKeyFromInt(
6748 static_cast<uint64_t>(std::max(
6749 static_cast<int64_t>(0), seek_pos - FLAGS_max_scan_distance)),
6750 FLAGS_num, &lower_bound);
6751 options.iterate_lower_bound = &lower_bound;
6752 } else {
6753 auto min_num =
6754 std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance);
6755 GenerateKeyFromInt(static_cast<uint64_t>(min_num), FLAGS_num,
6756 &upper_bound);
6757 options.iterate_upper_bound = &upper_bound;
6758 }
6759 } else if (FLAGS_auto_prefix_mode && prefix_extractor_ &&
6760 !FLAGS_reverse_iterator) {
6761 // Set upper bound to next prefix
6762 auto mutable_upper_bound = const_cast<char*>(upper_bound.data());
6763 std::memcpy(mutable_upper_bound, key.data(), prefix_size_);
6764 mutable_upper_bound[prefix_size_ - 1]++;
6765 upper_bound = Slice(upper_bound.data(), prefix_size_);
6766 options.iterate_upper_bound = &upper_bound;
6767 }
6768
6769 // Pick an Iterator to use
6770 uint64_t db_idx_to_use =
6771 (db_.db == nullptr)
6772 ? (uint64_t{thread->rand.Next()} % multi_dbs_.size())
6773 : 0;
6774 std::unique_ptr<Iterator> single_iter;
6775 Iterator* iter_to_use;
6776 if (FLAGS_use_tailing_iterator) {
6777 iter_to_use = tailing_iters[db_idx_to_use];
6778 } else {
6779 if (db_.db != nullptr) {
6780 single_iter.reset(db_.db->NewIterator(options));
6781 } else {
6782 single_iter.reset(multi_dbs_[db_idx_to_use].db->NewIterator(options));
6783 }
6784 iter_to_use = single_iter.get();
6785 }
6786
6787 iter_to_use->Seek(key);
6788 read++;
6789 if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) {
6790 found++;
6791 }
6792
6793 for (int j = 0; j < FLAGS_seek_nexts && iter_to_use->Valid(); ++j) {
6794 // Copy out the iterator's value to make sure we read it.
6795 Slice value = iter_to_use->value();
6796 memcpy(value_buffer, value.data(),
6797 std::min(value.size(), sizeof(value_buffer)));
6798 bytes += iter_to_use->key().size() + iter_to_use->value().size();
6799
6800 if (!FLAGS_reverse_iterator) {
6801 iter_to_use->Next();
6802 } else {
6803 iter_to_use->Prev();
6804 }
6805 assert(iter_to_use->status().ok());
6806 }
6807
6808 if (thread->shared->read_rate_limiter.get() != nullptr &&
6809 read % 256 == 255) {
6810 thread->shared->read_rate_limiter->Request(
6811 256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
6812 }
6813
6814 thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
6815 }
6816 for (auto iter : tailing_iters) {
6817 delete iter;
6818 }
6819
6820 char msg[100];
6821 snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
6822 read);
6823 thread->stats.AddBytes(bytes);
6824 thread->stats.AddMessage(msg);
6825 }
6826
6827 void SeekRandomWhileWriting(ThreadState* thread) {
6828 if (thread->tid > 0) {
6829 SeekRandom(thread);
6830 } else {
6831 BGWriter(thread, kWrite);
6832 }
6833 }
6834
6835 void SeekRandomWhileMerging(ThreadState* thread) {
6836 if (thread->tid > 0) {
6837 SeekRandom(thread);
6838 } else {
6839 BGWriter(thread, kMerge);
6840 }
6841 }
6842
6843 void DoDelete(ThreadState* thread, bool seq) {
6844 WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
6845 FLAGS_write_batch_protection_bytes_per_key,
6846 user_timestamp_size_);
6847 Duration duration(seq ? 0 : FLAGS_duration, deletes_);
6848 int64_t i = 0;
6849 std::unique_ptr<const char[]> key_guard;
6850 Slice key = AllocateKey(&key_guard);
6851 std::unique_ptr<char[]> ts_guard;
6852 Slice ts;
6853 if (user_timestamp_size_ > 0) {
6854 ts_guard.reset(new char[user_timestamp_size_]);
6855 }
6856
6857 while (!duration.Done(entries_per_batch_)) {
6858 DB* db = SelectDB(thread);
6859 batch.Clear();
6860 for (int64_t j = 0; j < entries_per_batch_; ++j) {
6861 const int64_t k = seq ? i + j : (thread->rand.Next() % FLAGS_num);
6862 GenerateKeyFromInt(k, FLAGS_num, &key);
6863 batch.Delete(key);
6864 }
6865 Status s;
6866 if (user_timestamp_size_ > 0) {
6867 ts = mock_app_clock_->Allocate(ts_guard.get());
6868 s = batch.UpdateTimestamps(
6869 ts, [this](uint32_t) { return user_timestamp_size_; });
6870 if (!s.ok()) {
6871 fprintf(stderr, "assign timestamp: %s\n", s.ToString().c_str());
6872 ErrorExit();
6873 }
6874 }
6875 s = db->Write(write_options_, &batch);
6876 thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kDelete);
6877 if (!s.ok()) {
6878 fprintf(stderr, "del error: %s\n", s.ToString().c_str());
6879 exit(1);
6880 }
6881 i += entries_per_batch_;
6882 }
6883 }
6884
6885 void DeleteSeq(ThreadState* thread) { DoDelete(thread, true); }
6886
6887 void DeleteRandom(ThreadState* thread) { DoDelete(thread, false); }
6888
6889 void ReadWhileWriting(ThreadState* thread) {
6890 if (thread->tid > 0) {
6891 ReadRandom(thread);
6892 } else {
6893 BGWriter(thread, kWrite);
6894 }
6895 }
6896
6897 void MultiReadWhileWriting(ThreadState* thread) {
6898 if (thread->tid > 0) {
6899 MultiReadRandom(thread);
6900 } else {
6901 BGWriter(thread, kWrite);
6902 }
6903 }
6904
6905 void ReadWhileMerging(ThreadState* thread) {
6906 if (thread->tid > 0) {
6907 ReadRandom(thread);
6908 } else {
6909 BGWriter(thread, kMerge);
6910 }
6911 }
6912
6913 void BGWriter(ThreadState* thread, enum OperationType write_merge) {
6914 // Special thread that keeps writing until other threads are done.
6915 RandomGenerator gen;
6916 int64_t bytes = 0;
6917
6918 std::unique_ptr<RateLimiter> write_rate_limiter;
6919 if (FLAGS_benchmark_write_rate_limit > 0) {
6920 write_rate_limiter.reset(
6921 NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
6922 }
6923
6924 // Don't merge stats from this thread with the readers.
6925 thread->stats.SetExcludeFromMerge();
6926
6927 std::unique_ptr<const char[]> key_guard;
6928 Slice key = AllocateKey(&key_guard);
6929 std::unique_ptr<char[]> ts_guard;
6930 std::unique_ptr<const char[]> begin_key_guard;
6931 Slice begin_key = AllocateKey(&begin_key_guard);
6932 std::unique_ptr<const char[]> end_key_guard;
6933 Slice end_key = AllocateKey(&end_key_guard);
6934 uint64_t num_range_deletions = 0;
6935 std::vector<std::unique_ptr<const char[]>> expanded_key_guards;
6936 std::vector<Slice> expanded_keys;
6937 if (FLAGS_expand_range_tombstones) {
6938 expanded_key_guards.resize(range_tombstone_width_);
6939 for (auto& expanded_key_guard : expanded_key_guards) {
6940 expanded_keys.emplace_back(AllocateKey(&expanded_key_guard));
6941 }
6942 }
6943 if (user_timestamp_size_ > 0) {
6944 ts_guard.reset(new char[user_timestamp_size_]);
6945 }
6946 uint32_t written = 0;
6947 bool hint_printed = false;
6948
6949 while (true) {
6950 DB* db = SelectDB(thread);
6951 {
6952 MutexLock l(&thread->shared->mu);
6953 if (FLAGS_finish_after_writes && written == writes_) {
6954 fprintf(stderr, "Exiting the writer after %u writes...\n", written);
6955 break;
6956 }
6957 if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
6958 // Other threads have finished
6959 if (FLAGS_finish_after_writes) {
6960 // Wait for the writes to be finished
6961 if (!hint_printed) {
6962 fprintf(stderr, "Reads are finished. Have %d more writes to do\n",
6963 static_cast<int>(writes_) - written);
6964 hint_printed = true;
6965 }
6966 } else {
6967 // Finish the write immediately
6968 break;
6969 }
6970 }
6971 }
6972
6973 GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
6974 Status s;
6975
6976 Slice val = gen.Generate();
6977 Slice ts;
6978 if (user_timestamp_size_ > 0) {
6979 ts = mock_app_clock_->Allocate(ts_guard.get());
6980 }
6981 if (write_merge == kWrite) {
6982 if (user_timestamp_size_ == 0) {
6983 s = db->Put(write_options_, key, val);
6984 } else {
6985 s = db->Put(write_options_, key, ts, val);
6986 }
6987 } else {
6988 s = db->Merge(write_options_, key, val);
6989 }
6990 // Restore write_options_
6991 written++;
6992
6993 if (!s.ok()) {
6994 fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
6995 exit(1);
6996 }
6997 bytes += key.size() + val.size() + user_timestamp_size_;
6998 thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
6999
7000 if (FLAGS_benchmark_write_rate_limit > 0) {
7001 write_rate_limiter->Request(key.size() + val.size(), Env::IO_HIGH,
7002 nullptr /* stats */,
7003 RateLimiter::OpType::kWrite);
7004 }
7005
7006 if (writes_per_range_tombstone_ > 0 &&
7007 written > writes_before_delete_range_ &&
7008 (written - writes_before_delete_range_) /
7009 writes_per_range_tombstone_ <=
7010 max_num_range_tombstones_ &&
7011 (written - writes_before_delete_range_) %
7012 writes_per_range_tombstone_ ==
7013 0) {
7014 num_range_deletions++;
7015 int64_t begin_num = thread->rand.Next() % FLAGS_num;
7016 if (FLAGS_expand_range_tombstones) {
7017 for (int64_t offset = 0; offset < range_tombstone_width_; ++offset) {
7018 GenerateKeyFromInt(begin_num + offset, FLAGS_num,
7019 &expanded_keys[offset]);
7020 if (!db->Delete(write_options_, expanded_keys[offset]).ok()) {
7021 fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
7022 exit(1);
7023 }
7024 }
7025 } else {
7026 GenerateKeyFromInt(begin_num, FLAGS_num, &begin_key);
7027 GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num,
7028 &end_key);
7029 if (!db->DeleteRange(write_options_, db->DefaultColumnFamily(),
7030 begin_key, end_key)
7031 .ok()) {
7032 fprintf(stderr, "deleterange error: %s\n", s.ToString().c_str());
7033 exit(1);
7034 }
7035 }
7036 thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
7037 // TODO: DeleteRange is not included in the calculation of bytes / rate
7038 // limiter requests
7039 }
7040 }
7041 if (num_range_deletions > 0) {
7042 std::cout << "Number of range deletions: " << num_range_deletions
7043 << std::endl;
7044 }
7045 thread->stats.AddBytes(bytes);
7046 }
7047
7048 void ReadWhileScanning(ThreadState* thread) {
7049 if (thread->tid > 0) {
7050 ReadRandom(thread);
7051 } else {
7052 BGScan(thread);
7053 }
7054 }
7055
7056 void BGScan(ThreadState* thread) {
7057 if (FLAGS_num_multi_db > 0) {
7058 fprintf(stderr, "Not supporting multiple DBs.\n");
7059 abort();
7060 }
7061 assert(db_.db != nullptr);
7062 ReadOptions read_options = read_options_;
7063 std::unique_ptr<char[]> ts_guard;
7064 Slice ts;
7065 if (user_timestamp_size_ > 0) {
7066 ts_guard.reset(new char[user_timestamp_size_]);
7067 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
7068 read_options.timestamp = &ts;
7069 }
7070 Iterator* iter = db_.db->NewIterator(read_options);
7071
7072 fprintf(stderr, "num reads to do %" PRIu64 "\n", reads_);
7073 Duration duration(FLAGS_duration, reads_);
7074 uint64_t num_seek_to_first = 0;
7075 uint64_t num_next = 0;
7076 while (!duration.Done(1)) {
7077 if (!iter->Valid()) {
7078 iter->SeekToFirst();
7079 num_seek_to_first++;
7080 } else if (!iter->status().ok()) {
7081 fprintf(stderr, "Iterator error: %s\n",
7082 iter->status().ToString().c_str());
7083 abort();
7084 } else {
7085 iter->Next();
7086 num_next++;
7087 }
7088
7089 thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
7090 }
7091 delete iter;
7092 }
7093
7094 // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
7095 // in the DB atomically, i.e. in a single batch. Also see GetMany.
7096 Status PutMany(DB* db, const WriteOptions& writeoptions, const Slice& key,
7097 const Slice& value) {
7098 std::string suffixes[3] = {"2", "1", "0"};
7099 std::string keys[3];
7100
7101 WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
7102 FLAGS_write_batch_protection_bytes_per_key,
7103 user_timestamp_size_);
7104 Status s;
7105 for (int i = 0; i < 3; i++) {
7106 keys[i] = key.ToString() + suffixes[i];
7107 batch.Put(keys[i], value);
7108 }
7109
7110 std::unique_ptr<char[]> ts_guard;
7111 if (user_timestamp_size_ > 0) {
7112 ts_guard.reset(new char[user_timestamp_size_]);
7113 Slice ts = mock_app_clock_->Allocate(ts_guard.get());
7114 s = batch.UpdateTimestamps(
7115 ts, [this](uint32_t) { return user_timestamp_size_; });
7116 if (!s.ok()) {
7117 fprintf(stderr, "assign timestamp to batch: %s\n",
7118 s.ToString().c_str());
7119 ErrorExit();
7120 }
7121 }
7122
7123 s = db->Write(writeoptions, &batch);
7124 return s;
7125 }
7126
7127 // Given a key K, this deletes (K+"0", V), (K+"1", V), (K+"2", V)
7128 // from the DB atomically, i.e. in a single batch. Also see GetMany.
7129 Status DeleteMany(DB* db, const WriteOptions& writeoptions,
7130 const Slice& key) {
7131 std::string suffixes[3] = {"1", "2", "0"};
7132 std::string keys[3];
7133
7134 WriteBatch batch(0, 0, FLAGS_write_batch_protection_bytes_per_key,
7135 user_timestamp_size_);
7136 Status s;
7137 for (int i = 0; i < 3; i++) {
7138 keys[i] = key.ToString() + suffixes[i];
7139 batch.Delete(keys[i]);
7140 }
7141
7142 std::unique_ptr<char[]> ts_guard;
7143 if (user_timestamp_size_ > 0) {
7144 ts_guard.reset(new char[user_timestamp_size_]);
7145 Slice ts = mock_app_clock_->Allocate(ts_guard.get());
7146 s = batch.UpdateTimestamps(
7147 ts, [this](uint32_t) { return user_timestamp_size_; });
7148 if (!s.ok()) {
7149 fprintf(stderr, "assign timestamp to batch: %s\n",
7150 s.ToString().c_str());
7151 ErrorExit();
7152 }
7153 }
7154
7155 s = db->Write(writeoptions, &batch);
7156 return s;
7157 }
7158
7159 // Given a key K and value V, this gets values for K+"0", K+"1" and K+"2"
7160 // in the same snapshot, and verifies that all the values are identical.
7161 // ASSUMES that PutMany was used to put (K, V) into the DB.
7162 Status GetMany(DB* db, const Slice& key, std::string* value) {
7163 std::string suffixes[3] = {"0", "1", "2"};
7164 std::string keys[3];
7165 Slice key_slices[3];
7166 std::string values[3];
7167 ReadOptions readoptionscopy = read_options_;
7168
7169 std::unique_ptr<char[]> ts_guard;
7170 Slice ts;
7171 if (user_timestamp_size_ > 0) {
7172 ts_guard.reset(new char[user_timestamp_size_]);
7173 ts = mock_app_clock_->Allocate(ts_guard.get());
7174 readoptionscopy.timestamp = &ts;
7175 }
7176
7177 readoptionscopy.snapshot = db->GetSnapshot();
7178 Status s;
7179 for (int i = 0; i < 3; i++) {
7180 keys[i] = key.ToString() + suffixes[i];
7181 key_slices[i] = keys[i];
7182 s = db->Get(readoptionscopy, key_slices[i], value);
7183 if (!s.ok() && !s.IsNotFound()) {
7184 fprintf(stderr, "get error: %s\n", s.ToString().c_str());
7185 values[i] = "";
7186 // we continue after error rather than exiting so that we can
7187 // find more errors if any
7188 } else if (s.IsNotFound()) {
7189 values[i] = "";
7190 } else {
7191 values[i] = *value;
7192 }
7193 }
7194 db->ReleaseSnapshot(readoptionscopy.snapshot);
7195
7196 if ((values[0] != values[1]) || (values[1] != values[2])) {
7197 fprintf(stderr, "inconsistent values for key %s: %s, %s, %s\n",
7198 key.ToString().c_str(), values[0].c_str(), values[1].c_str(),
7199 values[2].c_str());
7200 // we continue after error rather than exiting so that we can
7201 // find more errors if any
7202 }
7203
7204 return s;
7205 }
7206
7207 // Differs from readrandomwriterandom in the following ways:
7208 // (a) Uses GetMany/PutMany to read/write key values. Refer to those funcs.
7209 // (b) Does deletes as well (per FLAGS_deletepercent)
7210 // (c) In order to achieve high % of 'found' during lookups, and to do
7211 // multiple writes (including puts and deletes), it uses up to
7212 // FLAGS_numdistinct distinct keys instead of FLAGS_num distinct keys.
7213 // (d) Does not have a MultiGet option.
7214 void RandomWithVerify(ThreadState* thread) {
7215 RandomGenerator gen;
7216 std::string value;
7217 int64_t found = 0;
7218 int get_weight = 0;
7219 int put_weight = 0;
7220 int delete_weight = 0;
7221 int64_t gets_done = 0;
7222 int64_t puts_done = 0;
7223 int64_t deletes_done = 0;
7224
7225 std::unique_ptr<const char[]> key_guard;
7226 Slice key = AllocateKey(&key_guard);
7227
7228 // the number of iterations is the larger of read_ or write_
7229 for (int64_t i = 0; i < readwrites_; i++) {
7230 DB* db = SelectDB(thread);
7231 if (get_weight == 0 && put_weight == 0 && delete_weight == 0) {
7232 // one batch completed, reinitialize for next batch
7233 get_weight = FLAGS_readwritepercent;
7234 delete_weight = FLAGS_deletepercent;
7235 put_weight = 100 - get_weight - delete_weight;
7236 }
7237 GenerateKeyFromInt(thread->rand.Next() % FLAGS_numdistinct,
7238 FLAGS_numdistinct, &key);
7239 if (get_weight > 0) {
7240 // do all the gets first
7241 Status s = GetMany(db, key, &value);
7242 if (!s.ok() && !s.IsNotFound()) {
7243 fprintf(stderr, "getmany error: %s\n", s.ToString().c_str());
7244 // we continue after error rather than exiting so that we can
7245 // find more errors if any
7246 } else if (!s.IsNotFound()) {
7247 found++;
7248 }
7249 get_weight--;
7250 gets_done++;
7251 thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
7252 } else if (put_weight > 0) {
7253 // then do all the corresponding number of puts
7254 // for all the gets we have done earlier
7255 Status s = PutMany(db, write_options_, key, gen.Generate());
7256 if (!s.ok()) {
7257 fprintf(stderr, "putmany error: %s\n", s.ToString().c_str());
7258 exit(1);
7259 }
7260 put_weight--;
7261 puts_done++;
7262 thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
7263 } else if (delete_weight > 0) {
7264 Status s = DeleteMany(db, write_options_, key);
7265 if (!s.ok()) {
7266 fprintf(stderr, "deletemany error: %s\n", s.ToString().c_str());
7267 exit(1);
7268 }
7269 delete_weight--;
7270 deletes_done++;
7271 thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
7272 }
7273 }
7274 char msg[128];
7275 snprintf(msg, sizeof(msg),
7276 "( get:%" PRIu64 " put:%" PRIu64 " del:%" PRIu64 " total:%" PRIu64
7277 " found:%" PRIu64 ")",
7278 gets_done, puts_done, deletes_done, readwrites_, found);
7279 thread->stats.AddMessage(msg);
7280 }
7281
7282 // This is different from ReadWhileWriting because it does not use
7283 // an extra thread.
7284 void ReadRandomWriteRandom(ThreadState* thread) {
7285 ReadOptions options = read_options_;
7286 RandomGenerator gen;
7287 std::string value;
7288 int64_t found = 0;
7289 int get_weight = 0;
7290 int put_weight = 0;
7291 int64_t reads_done = 0;
7292 int64_t writes_done = 0;
7293 Duration duration(FLAGS_duration, readwrites_);
7294
7295 std::unique_ptr<const char[]> key_guard;
7296 Slice key = AllocateKey(&key_guard);
7297
7298 std::unique_ptr<char[]> ts_guard;
7299 if (user_timestamp_size_ > 0) {
7300 ts_guard.reset(new char[user_timestamp_size_]);
7301 }
7302
7303 // the number of iterations is the larger of read_ or write_
7304 while (!duration.Done(1)) {
7305 DB* db = SelectDB(thread);
7306 GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
7307 if (get_weight == 0 && put_weight == 0) {
7308 // one batch completed, reinitialize for next batch
7309 get_weight = FLAGS_readwritepercent;
7310 put_weight = 100 - get_weight;
7311 }
7312 if (get_weight > 0) {
7313 // do all the gets first
7314 Slice ts;
7315 if (user_timestamp_size_ > 0) {
7316 ts = mock_app_clock_->GetTimestampForRead(thread->rand,
7317 ts_guard.get());
7318 options.timestamp = &ts;
7319 }
7320 Status s = db->Get(options, key, &value);
7321 if (!s.ok() && !s.IsNotFound()) {
7322 fprintf(stderr, "get error: %s\n", s.ToString().c_str());
7323 // we continue after error rather than exiting so that we can
7324 // find more errors if any
7325 } else if (!s.IsNotFound()) {
7326 found++;
7327 }
7328 get_weight--;
7329 reads_done++;
7330 thread->stats.FinishedOps(nullptr, db, 1, kRead);
7331 } else if (put_weight > 0) {
7332 // then do all the corresponding number of puts
7333 // for all the gets we have done earlier
7334 Status s;
7335 if (user_timestamp_size_ > 0) {
7336 Slice ts = mock_app_clock_->Allocate(ts_guard.get());
7337 s = db->Put(write_options_, key, ts, gen.Generate());
7338 } else {
7339 s = db->Put(write_options_, key, gen.Generate());
7340 }
7341 if (!s.ok()) {
7342 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
7343 ErrorExit();
7344 }
7345 put_weight--;
7346 writes_done++;
7347 thread->stats.FinishedOps(nullptr, db, 1, kWrite);
7348 }
7349 }
7350 char msg[100];
7351 snprintf(msg, sizeof(msg),
7352 "( reads:%" PRIu64 " writes:%" PRIu64 " total:%" PRIu64
7353 " found:%" PRIu64 ")",
7354 reads_done, writes_done, readwrites_, found);
7355 thread->stats.AddMessage(msg);
7356 }
7357
7358 //
7359 // Read-modify-write for random keys
7360 void UpdateRandom(ThreadState* thread) {
7361 ReadOptions options = read_options_;
7362 RandomGenerator gen;
7363 std::string value;
7364 int64_t found = 0;
7365 int64_t bytes = 0;
7366 Duration duration(FLAGS_duration, readwrites_);
7367
7368 std::unique_ptr<const char[]> key_guard;
7369 Slice key = AllocateKey(&key_guard);
7370 std::unique_ptr<char[]> ts_guard;
7371 if (user_timestamp_size_ > 0) {
7372 ts_guard.reset(new char[user_timestamp_size_]);
7373 }
7374 // the number of iterations is the larger of read_ or write_
7375 while (!duration.Done(1)) {
7376 DB* db = SelectDB(thread);
7377 GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
7378 Slice ts;
7379 if (user_timestamp_size_ > 0) {
7380 // Read with newest timestamp because we are doing rmw.
7381 ts = mock_app_clock_->Allocate(ts_guard.get());
7382 options.timestamp = &ts;
7383 }
7384
7385 auto status = db->Get(options, key, &value);
7386 if (status.ok()) {
7387 ++found;
7388 bytes += key.size() + value.size() + user_timestamp_size_;
7389 } else if (!status.IsNotFound()) {
7390 fprintf(stderr, "Get returned an error: %s\n",
7391 status.ToString().c_str());
7392 abort();
7393 }
7394
7395 if (thread->shared->write_rate_limiter) {
7396 thread->shared->write_rate_limiter->Request(
7397 key.size() + value.size(), Env::IO_HIGH, nullptr /*stats*/,
7398 RateLimiter::OpType::kWrite);
7399 }
7400
7401 Slice val = gen.Generate();
7402 Status s;
7403 if (user_timestamp_size_ > 0) {
7404 ts = mock_app_clock_->Allocate(ts_guard.get());
7405 s = db->Put(write_options_, key, ts, val);
7406 } else {
7407 s = db->Put(write_options_, key, val);
7408 }
7409 if (!s.ok()) {
7410 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
7411 exit(1);
7412 }
7413 bytes += key.size() + val.size() + user_timestamp_size_;
7414 thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
7415 }
7416 char msg[100];
7417 snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
7418 readwrites_, found);
7419 thread->stats.AddBytes(bytes);
7420 thread->stats.AddMessage(msg);
7421 }
7422
7423 // Read-XOR-write for random keys. XORs the existing value with a randomly
7424 // generated value and stores the result. Assuming A is the array of bytes
7425 // representing the existing value, we generate an array B of the same size,
7426 // then compute C = A^B as C[i] = A[i]^B[i], and store C.
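// Worked example (illustrative values only):
//
//   existing value A:   {0x0F, 0xF0}
//   generated B:        {0xFF, 0xFF}
//   stored C = A ^ B:   {0xF0, 0x0F}   // XOR-ing C with B again recovers A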
7427 void XORUpdateRandom(ThreadState* thread) {
7428 ReadOptions options = read_options_;
7429 RandomGenerator gen;
7430 std::string existing_value;
7431 int64_t found = 0;
7432 Duration duration(FLAGS_duration, readwrites_);
7433
7434 BytesXOROperator xor_operator;
7435
7436 std::unique_ptr<const char[]> key_guard;
7437 Slice key = AllocateKey(&key_guard);
7438 std::unique_ptr<char[]> ts_guard;
7439 if (user_timestamp_size_ > 0) {
7440 ts_guard.reset(new char[user_timestamp_size_]);
7441 }
7442 // the number of iterations is the larger of read_ or write_
7443 while (!duration.Done(1)) {
7444 DB* db = SelectDB(thread);
7445 GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
7446 Slice ts;
7447 if (user_timestamp_size_ > 0) {
7448 ts = mock_app_clock_->Allocate(ts_guard.get());
7449 options.timestamp = &ts;
7450 }
7451
7452 auto status = db->Get(options, key, &existing_value);
7453 if (status.ok()) {
7454 ++found;
7455 } else if (!status.IsNotFound()) {
7456 fprintf(stderr, "Get returned an error: %s\n",
7457 status.ToString().c_str());
7458 exit(1);
7459 }
7460
7461 Slice value =
7462 gen.Generate(static_cast<unsigned int>(existing_value.size()));
7463 std::string new_value;
7464
7465 if (status.ok()) {
7466 Slice existing_value_slice = Slice(existing_value);
7467 xor_operator.XOR(&existing_value_slice, value, &new_value);
7468 } else {
7469 xor_operator.XOR(nullptr, value, &new_value);
7470 }
7471
7472 Status s;
7473 if (user_timestamp_size_ > 0) {
7474 ts = mock_app_clock_->Allocate(ts_guard.get());
7475 s = db->Put(write_options_, key, ts, Slice(new_value));
7476 } else {
7477 s = db->Put(write_options_, key, Slice(new_value));
7478 }
7479 if (!s.ok()) {
7480 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
7481 ErrorExit();
7482 }
7483 thread->stats.FinishedOps(nullptr, db, 1);
7484 }
7485 char msg[100];
7486 snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
7487 readwrites_, found);
7488 thread->stats.AddMessage(msg);
7489 }
7490
7491 // Read-modify-write for random keys.
7492 // Each operation causes the value to grow by value_size (simulating an append).
7493 // Generally used for benchmarking against merges of a similar type.
7494 void AppendRandom(ThreadState* thread) {
7495 ReadOptions options = read_options_;
7496 RandomGenerator gen;
7497 std::string value;
7498 int64_t found = 0;
7499 int64_t bytes = 0;
7500
7501 std::unique_ptr<const char[]> key_guard;
7502 Slice key = AllocateKey(&key_guard);
7503 std::unique_ptr<char[]> ts_guard;
7504 if (user_timestamp_size_ > 0) {
7505 ts_guard.reset(new char[user_timestamp_size_]);
7506 }
7507 // The number of iterations is the larger of read_ or write_
7508 Duration duration(FLAGS_duration, readwrites_);
7509 while (!duration.Done(1)) {
7510 DB* db = SelectDB(thread);
7511 GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
7512 Slice ts;
7513 if (user_timestamp_size_ > 0) {
7514 ts = mock_app_clock_->Allocate(ts_guard.get());
7515 options.timestamp = &ts;
7516 }
7517
7518 auto status = db->Get(options, key, &value);
7519 if (status.ok()) {
7520 ++found;
7521 bytes += key.size() + value.size() + user_timestamp_size_;
7522 } else if (!status.IsNotFound()) {
7523 fprintf(stderr, "Get returned an error: %s\n",
7524 status.ToString().c_str());
7525 abort();
7526 } else {
7527 // If not existing, then just assume an empty string of data
7528 value.clear();
7529 }
7530
7531 // Update the value (by appending data)
7532 Slice operand = gen.Generate();
7533 if (value.size() > 0) {
7534 // Use a delimiter to match the semantics for StringAppendOperator
7535 value.append(1, ',');
7536 }
7537 value.append(operand.data(), operand.size());
7538
7539 Status s;
7540 if (user_timestamp_size_ > 0) {
7541 ts = mock_app_clock_->Allocate(ts_guard.get());
7542 s = db->Put(write_options_, key, ts, value);
7543 } else {
7544 // Write back to the database
7545 s = db->Put(write_options_, key, value);
7546 }
7547 if (!s.ok()) {
7548 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
7549 ErrorExit();
7550 }
7551 bytes += key.size() + value.size() + user_timestamp_size_;
7552 thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
7553 }
7554
7555 char msg[100];
7556 snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
7557 readwrites_, found);
7558 thread->stats.AddBytes(bytes);
7559 thread->stats.AddMessage(msg);
7560 }
7561
7562 // Read-modify-write for random keys (using MergeOperator)
7563 // The merge operator to use should be defined by FLAGS_merge_operator.
7564 // Adjust FLAGS_value_size so that the values are reasonable for this
7565 // operator. Assumes that the merge operator is non-null (i.e. well-defined).
7566 //
7567 // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8
7568 // to simulate random additions over 64-bit integers using merge.
7569 //
7570 // The number of merges on the same key can be controlled by adjusting
7571 // FLAGS_merge_keys.
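// A minimal sketch of the uint64add case outside of db_bench (assumes the DB
// was opened with Options::merge_operator set to
// MergeOperators::CreateUInt64AddOperator()):
//
//   std::string delta;
//   PutFixed64(&delta, 1);                        // 8-byte encoded "1"
//   db->Merge(WriteOptions(), "counter", delta);  // counter += 1
//   std::string sum;
//   db->Get(ReadOptions(), "counter", &sum);      // decode with DecodeFixed64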
7572 void MergeRandom(ThreadState* thread) {
7573 RandomGenerator gen;
7574 int64_t bytes = 0;
7575 std::unique_ptr<const char[]> key_guard;
7576 Slice key = AllocateKey(&key_guard);
7577 // The number of iterations is the larger of read_ or write_
7578 Duration duration(FLAGS_duration, readwrites_);
7579 while (!duration.Done(1)) {
7580 DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
7581 int64_t key_rand = thread->rand.Next() % merge_keys_;
7582 GenerateKeyFromInt(key_rand, merge_keys_, &key);
7583
7584 Status s;
7585 Slice val = gen.Generate();
7586 if (FLAGS_num_column_families > 1) {
7587 s = db_with_cfh->db->Merge(write_options_,
7588 db_with_cfh->GetCfh(key_rand), key, val);
7589 } else {
7590 s = db_with_cfh->db->Merge(
7591 write_options_, db_with_cfh->db->DefaultColumnFamily(), key, val);
7592 }
7593
7594 if (!s.ok()) {
7595 fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
7596 exit(1);
7597 }
7598 bytes += key.size() + val.size();
7599 thread->stats.FinishedOps(nullptr, db_with_cfh->db, 1, kMerge);
7600 }
7601
7602 // Print some statistics
7603 char msg[100];
7604 snprintf(msg, sizeof(msg), "( updates:%" PRIu64 ")", readwrites_);
7605 thread->stats.AddBytes(bytes);
7606 thread->stats.AddMessage(msg);
7607 }
7608
7609 // Read and merge random keys. The number of reads and merges is controlled
7610 // by adjusting FLAGS_num and FLAGS_mergereadpercent. The number of distinct
7611 // keys (and thus also the number of reads and merges on the same key) can be
7612 // adjusted with FLAGS_merge_keys.
7613 //
7614 // As with MergeRandom, the merge operator to use should be defined by
7615 // FLAGS_merge_operator.
7616 void ReadRandomMergeRandom(ThreadState* thread) {
7617 RandomGenerator gen;
7618 std::string value;
7619 int64_t num_hits = 0;
7620 int64_t num_gets = 0;
7621 int64_t num_merges = 0;
7622 size_t max_length = 0;
7623
7624 std::unique_ptr<const char[]> key_guard;
7625 Slice key = AllocateKey(&key_guard);
7626 // the number of iterations is the larger of read_ or write_
7627 Duration duration(FLAGS_duration, readwrites_);
7628 while (!duration.Done(1)) {
7629 DB* db = SelectDB(thread);
7630 GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);
7631
7632 bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent;
7633
7634 if (do_merge) {
7635 Status s = db->Merge(write_options_, key, gen.Generate());
7636 if (!s.ok()) {
7637 fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
7638 exit(1);
7639 }
7640 num_merges++;
7641 thread->stats.FinishedOps(nullptr, db, 1, kMerge);
7642 } else {
7643 Status s = db->Get(read_options_, key, &value);
7644 if (value.length() > max_length) max_length = value.length();
7645
7646 if (!s.ok() && !s.IsNotFound()) {
7647 fprintf(stderr, "get error: %s\n", s.ToString().c_str());
7648 // we continue after error rather than exiting so that we can
7649 // find more errors if any
7650 } else if (!s.IsNotFound()) {
7651 num_hits++;
7652 }
7653 num_gets++;
7654 thread->stats.FinishedOps(nullptr, db, 1, kRead);
7655 }
7656 }
7657
7658 char msg[100];
7659 snprintf(msg, sizeof(msg),
7660 "(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64
7661 " hits:%" PRIu64 " maxlength:%" ROCKSDB_PRIszt ")",
7662 num_gets, num_merges, readwrites_, num_hits, max_length);
7663 thread->stats.AddMessage(msg);
7664 }
7665
7666 void WriteSeqSeekSeq(ThreadState* thread) {
7667 writes_ = FLAGS_num;
7668 DoWrite(thread, SEQUENTIAL);
7669 // exclude writes from the ops/sec calculation
7670 thread->stats.Start(thread->tid);
7671
7672 DB* db = SelectDB(thread);
7673 ReadOptions read_opts = read_options_;
7674 std::unique_ptr<char[]> ts_guard;
7675 Slice ts;
7676 if (user_timestamp_size_ > 0) {
7677 ts_guard.reset(new char[user_timestamp_size_]);
7678 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
7679 read_opts.timestamp = &ts;
7680 }
7681 std::unique_ptr<Iterator> iter(db->NewIterator(read_opts));
7682
7683 std::unique_ptr<const char[]> key_guard;
7684 Slice key = AllocateKey(&key_guard);
7685 for (int64_t i = 0; i < FLAGS_num; ++i) {
7686 GenerateKeyFromInt(i, FLAGS_num, &key);
7687 iter->Seek(key);
7688 assert(iter->Valid() && iter->key() == key);
7689 thread->stats.FinishedOps(nullptr, db, 1, kSeek);
7690
7691 for (int j = 0; j < FLAGS_seek_nexts && i + 1 < FLAGS_num; ++j) {
7692 if (!FLAGS_reverse_iterator) {
7693 iter->Next();
7694 } else {
7695 iter->Prev();
7696 }
7697 GenerateKeyFromInt(++i, FLAGS_num, &key);
7698 assert(iter->Valid() && iter->key() == key);
7699 thread->stats.FinishedOps(nullptr, db, 1, kSeek);
7700 }
7701
7702 iter->Seek(key);
7703 assert(iter->Valid() && iter->key() == key);
7704 thread->stats.FinishedOps(nullptr, db, 1, kSeek);
7705 }
7706 }
7707
7708 bool binary_search(std::vector<int>& data, int start, int end, int key) {
7709 if (data.empty()) return false;
7710 if (start > end) return false;
7711 int mid = start + (end - start) / 2;
7712 if (mid > static_cast<int>(data.size()) - 1) return false;
7713 if (data[mid] == key) {
7714 return true;
7715 } else if (data[mid] > key) {
7716 return binary_search(data, start, mid - 1, key);
7717 } else {
7718 return binary_search(data, mid + 1, end, key);
7719 }
7720 }
7721
7722 // Does a bunch of merge operations for a key (key1) where each merge operand
7723 // is a sorted list. Then a performance comparison is done between doing a Get
7724 // for key1 followed by searching for another key (key2) in the large sorted
7725 // list, vs calling GetMergeOperands for key1 and then searching for key2
7726 // in each of the sorted sub-lists. The latter case is expected to be a lot faster.
7727 void GetMergeOperands(ThreadState* thread) {
7728 DB* db = SelectDB(thread);
7729 const int kTotalValues = 100000;
7730 const int kListSize = 100;
7731 std::string key = "my_key";
7732 std::string value;
7733
7734 for (int i = 1; i < kTotalValues; i++) {
7735 if (i % kListSize == 0) {
7736 // Remove trailing ','
7737 value.pop_back();
7738 db->Merge(WriteOptions(), key, value);
7739 value.clear();
7740 } else {
7741 value.append(std::to_string(i)).append(",");
7742 }
7743 }
7744
7745 SortList s;
7746 std::vector<int> data;
7747 // This value can be experimented with; it demonstrates the perf
7748 // difference between doing a Get and searching for lookup_key in the
7749 // resulting large sorted list, vs doing GetMergeOperands and searching
7750 // for lookup_key within the resulting sorted sub-lists.
7751 int lookup_key = 1;
7752
7753 // Get API call
7754 std::cout << "--- Get API call --- \n";
7755 PinnableSlice p_slice;
7756 uint64_t st = FLAGS_env->NowNanos();
7757 db->Get(ReadOptions(), db->DefaultColumnFamily(), key, &p_slice);
7758 s.MakeVector(data, p_slice);
7759 bool found =
7760 binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
7761 std::cout << "Found key? " << std::to_string(found) << "\n";
7762 uint64_t sp = FLAGS_env->NowNanos();
7763 std::cout << "Get: " << (sp - st) / 1000000000.0 << " seconds\n";
7764 std::string* dat_ = p_slice.GetSelf();
7765 std::cout << "Sample data from Get API call: " << dat_->substr(0, 10)
7766 << "\n";
7767 data.clear();
7768
7769 // GetMergeOperands API call
7770 std::cout << "--- GetMergeOperands API --- \n";
7771 std::vector<PinnableSlice> a_slice((kTotalValues / kListSize) + 1);
7772 st = FLAGS_env->NowNanos();
7773 int number_of_operands = 0;
7774 GetMergeOperandsOptions get_merge_operands_options;
7775 get_merge_operands_options.expected_max_number_of_operands =
7776 (kTotalValues / 100) + 1;
7777 db->GetMergeOperands(ReadOptions(), db->DefaultColumnFamily(), key,
7778 a_slice.data(), &get_merge_operands_options,
7779 &number_of_operands);
7780 for (PinnableSlice& psl : a_slice) {
7781 s.MakeVector(data, psl);
7782 found =
7783 binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
7784 data.clear();
7785 if (found) break;
7786 }
7787 std::cout << "Found key? " << std::to_string(found) << "\n";
7788 sp = FLAGS_env->NowNanos();
7789 std::cout << "Get Merge operands: " << (sp - st) / 1000000000.0
7790 << " seconds \n";
7791 int to_print = 0;
7792 std::cout << "Sample data from GetMergeOperands API call: ";
7793 for (PinnableSlice& psl : a_slice) {
7794 std::cout << "List: " << to_print << " : " << *psl.GetSelf() << "\n";
7795 if (to_print++ > 2) break;
7796 }
7797 }
7798
7799 #ifndef ROCKSDB_LITE
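// Runs DB::VerifyChecksum() over the whole DB, applying the readahead,
// async_io and rate-limiter read options configured via flags, and exits on
// failure. VerifyFileChecksums() below does the same for whole-file
// checksums.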
7800 void VerifyChecksum(ThreadState* thread) {
7801 DB* db = SelectDB(thread);
7802 ReadOptions ro;
7803 ro.adaptive_readahead = FLAGS_adaptive_readahead;
7804 ro.async_io = FLAGS_async_io;
7805 ro.rate_limiter_priority =
7806 FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
7807 ro.readahead_size = FLAGS_readahead_size;
7808 Status s = db->VerifyChecksum(ro);
7809 if (!s.ok()) {
7810 fprintf(stderr, "VerifyChecksum() failed: %s\n", s.ToString().c_str());
7811 exit(1);
7812 }
7813 }
7814
7815 void VerifyFileChecksums(ThreadState* thread) {
7816 DB* db = SelectDB(thread);
7817 ReadOptions ro;
7818 ro.adaptive_readahead = FLAGS_adaptive_readahead;
7819 ro.async_io = FLAGS_async_io;
7820 ro.rate_limiter_priority =
7821 FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
7822 ro.readahead_size = FLAGS_readahead_size;
7823 Status s = db->VerifyFileChecksums(ro);
7824 if (!s.ok()) {
7825 fprintf(stderr, "VerifyFileChecksums() failed: %s\n",
7826 s.ToString().c_str());
7827 exit(1);
7828 }
7829 }
7830
7831 // This benchmark stress tests Transactions. For a given --duration (or
7832 // total number of --writes), a Transaction will perform a read-modify-write
7833 // to increment the value of a key in each of N (--transaction_sets) sets of
7834 // keys (where each set has --num keys). If --threads is set, this will be
7835 // done in parallel.
7836 //
7837 // To test transactions, use --transaction_db=true (or
7838 // --optimistic_transaction_db=true). Not setting either parameter
7839 // will run the same benchmark without transactions.
7840 //
7841 // RandomTransactionVerify() will then validate the correctness of the results
7842 // by checking if the sum of all keys in each set is the same.
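//
// A typical invocation might look like the following (the
// "randomtransaction"/"randomtransactionverify" benchmark names are assumed
// from the method registration elsewhere in this file):
//   ./db_bench --benchmarks=randomtransaction,randomtransactionverify \
//       --transaction_db=true --transaction_sets=2 --threads=4 --duration=30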
7843 void RandomTransaction(ThreadState* thread) {
7844 Duration duration(FLAGS_duration, readwrites_);
7845 uint16_t num_prefix_ranges = static_cast<uint16_t>(FLAGS_transaction_sets);
7846 uint64_t transactions_done = 0;
7847
7848 if (num_prefix_ranges == 0 || num_prefix_ranges > 9999) {
7849 fprintf(stderr, "invalid value for transaction_sets\n");
7850 abort();
7851 }
7852
7853 TransactionOptions txn_options;
7854 txn_options.lock_timeout = FLAGS_transaction_lock_timeout;
7855 txn_options.set_snapshot = FLAGS_transaction_set_snapshot;
7856
7857 RandomTransactionInserter inserter(&thread->rand, write_options_,
7858 read_options_, FLAGS_num,
7859 num_prefix_ranges);
7860
7861 if (FLAGS_num_multi_db > 1) {
7862 fprintf(stderr,
7863 "Cannot run RandomTransaction benchmark with "
7864 "FLAGS_multi_db > 1.");
7865 abort();
7866 }
7867
7868 while (!duration.Done(1)) {
7869 bool success;
7870
7871 // RandomTransactionInserter will attempt to insert a key into each of
7872 // the FLAGS_transaction_sets key sets.
7873 if (FLAGS_optimistic_transaction_db) {
7874 success = inserter.OptimisticTransactionDBInsert(db_.opt_txn_db);
7875 } else if (FLAGS_transaction_db) {
7876 TransactionDB* txn_db = reinterpret_cast<TransactionDB*>(db_.db);
7877 success = inserter.TransactionDBInsert(txn_db, txn_options);
7878 } else {
7879 success = inserter.DBInsert(db_.db);
7880 }
7881
7882 if (!success) {
7883 fprintf(stderr, "Unexpected error: %s\n",
7884 inserter.GetLastStatus().ToString().c_str());
7885 abort();
7886 }
7887
7888 thread->stats.FinishedOps(nullptr, db_.db, 1, kOthers);
7889 transactions_done++;
7890 }
7891
7892 char msg[100];
7893 if (FLAGS_optimistic_transaction_db || FLAGS_transaction_db) {
7894 snprintf(msg, sizeof(msg),
7895 "( transactions:%" PRIu64 " aborts:%" PRIu64 ")",
7896 transactions_done, inserter.GetFailureCount());
7897 } else {
7898 snprintf(msg, sizeof(msg), "( batches:%" PRIu64 " )", transactions_done);
7899 }
7900 thread->stats.AddMessage(msg);
7901 thread->stats.AddBytes(static_cast<int64_t>(inserter.GetBytesInserted()));
7902 }
7903
7904 // Verifies consistency of data after RandomTransaction() has been run.
7905 // Since each iteration of RandomTransaction() incremented a key in each set
7906 // by the same value, the sum of the keys in each set should be the same.
7907 void RandomTransactionVerify() {
7908 if (!FLAGS_transaction_db && !FLAGS_optimistic_transaction_db) {
7909 // transactions not used, nothing to verify.
7910 return;
7911 }
7912
7913 Status s = RandomTransactionInserter::Verify(
7914 db_.db, static_cast<uint16_t>(FLAGS_transaction_sets));
7915
7916 if (s.ok()) {
7917 fprintf(stdout, "RandomTransactionVerify Success.\n");
7918 } else {
7919 fprintf(stdout, "RandomTransactionVerify FAILED!!\n");
7920 }
7921 }
7922 #endif // ROCKSDB_LITE
7923
7924 // Writes and deletes random keys without overwriting keys.
7925 //
7926 // This benchmark is intended to partially replicate the behavior of MyRocks
7927 // secondary indices: All data is stored in keys and updates happen by
7928 // deleting the old version of the key and inserting the new version.
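//
// A typical invocation might be (the "randomreplacekeys" benchmark name is
// assumed from the method registration elsewhere in this file):
//   ./db_bench --benchmarks=randomreplacekeys --numdistinct=1000 \
//       --use_single_deletes=true --stddev=100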
7929 void RandomReplaceKeys(ThreadState* thread) {
7930 std::unique_ptr<const char[]> key_guard;
7931 Slice key = AllocateKey(&key_guard);
7932 std::unique_ptr<char[]> ts_guard;
7933 if (user_timestamp_size_ > 0) {
7934 ts_guard.reset(new char[user_timestamp_size_]);
7935 }
7936 std::vector<uint32_t> counters(FLAGS_numdistinct, 0);
7937 size_t max_counter = 50;
7938 RandomGenerator gen;
7939
7940 Status s;
7941 DB* db = SelectDB(thread);
7942 for (int64_t i = 0; i < FLAGS_numdistinct; i++) {
7943 GenerateKeyFromInt(i * max_counter, FLAGS_num, &key);
7944 if (user_timestamp_size_ > 0) {
7945 Slice ts = mock_app_clock_->Allocate(ts_guard.get());
7946 s = db->Put(write_options_, key, ts, gen.Generate());
7947 } else {
7948 s = db->Put(write_options_, key, gen.Generate());
7949 }
7950 if (!s.ok()) {
7951 fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
7952 exit(1);
7953 }
7954 }
7955
7956 db->GetSnapshot();
7957
7958 std::default_random_engine generator;
7959 std::normal_distribution<double> distribution(FLAGS_numdistinct / 2.0,
7960 FLAGS_stddev);
7961 Duration duration(FLAGS_duration, FLAGS_num);
7962 while (!duration.Done(1)) {
7963 int64_t rnd_id = static_cast<int64_t>(distribution(generator));
7964 int64_t key_id = std::max(std::min(FLAGS_numdistinct - 1, rnd_id),
7965 static_cast<int64_t>(0));
7966 GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
7967 &key);
7968 if (user_timestamp_size_ > 0) {
7969 Slice ts = mock_app_clock_->Allocate(ts_guard.get());
7970 s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key, ts)
7971 : db->Delete(write_options_, key, ts);
7972 } else {
7973 s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key)
7974 : db->Delete(write_options_, key);
7975 }
7976 if (s.ok()) {
7977 counters[key_id] = (counters[key_id] + 1) % max_counter;
7978 GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
7979 &key);
7980 if (user_timestamp_size_ > 0) {
7981 Slice ts = mock_app_clock_->Allocate(ts_guard.get());
7982 s = db->Put(write_options_, key, ts, Slice());
7983 } else {
7984 s = db->Put(write_options_, key, Slice());
7985 }
7986 }
7987
7988 if (!s.ok()) {
7989 fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
7990 exit(1);
7991 }
7992
7993 thread->stats.FinishedOps(nullptr, db, 1, kOthers);
7994 }
7995
7996 char msg[200];
7997 snprintf(msg, sizeof(msg),
7998 "use single deletes: %d, "
7999 "standard deviation: %lf\n",
8000 FLAGS_use_single_deletes, FLAGS_stddev);
8001 thread->stats.AddMessage(msg);
8002 }
8003
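// Reader/deleter side of the time-series benchmark. Each worker thread
// repeatedly picks a random key id, seeks to that prefix and scans the
// entries for that series; with do_deletion set (--expire_style=delete) it
// instead deletes entries whose emulated timestamp has expired. The loop
// ends once the dedicated write thread (TimeSeriesWrite) is done.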
8004 void TimeSeriesReadOrDelete(ThreadState* thread, bool do_deletion) {
8005 int64_t read = 0;
8006 int64_t found = 0;
8007 int64_t bytes = 0;
8008
8009 Iterator* iter = nullptr;
8010 // Only works on a single database
8011 assert(db_.db != nullptr);
8012 iter = db_.db->NewIterator(read_options_);
8013
8014 std::unique_ptr<const char[]> key_guard;
8015 Slice key = AllocateKey(&key_guard);
8016
8017 char value_buffer[256];
8018 while (true) {
8019 {
8020 MutexLock l(&thread->shared->mu);
8021 if (thread->shared->num_done >= 1) {
8022 // The write thread has finished
8023 break;
8024 }
8025 }
8026 if (!FLAGS_use_tailing_iterator) {
8027 delete iter;
8028 iter = db_.db->NewIterator(read_options_);
8029 }
8030 // Pick an Iterator to use
8031
8032 int64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
8033 GenerateKeyFromInt(key_id, FLAGS_num, &key);
8034 // Reset last 8 bytes to 0
8035 char* start = const_cast<char*>(key.data());
8036 start += key.size() - 8;
8037 memset(start, 0, 8);
8038 ++read;
8039
8040 bool key_found = false;
8041 // Seek the prefix
8042 for (iter->Seek(key); iter->Valid() && iter->key().starts_with(key);
8043 iter->Next()) {
8044 key_found = true;
8045 // Copy out the iterator's value to make sure we actually read it.
8046 if (do_deletion) {
8047 bytes += iter->key().size();
8048 if (KeyExpired(timestamp_emulator_.get(), iter->key())) {
8049 thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
8050 db_.db->Delete(write_options_, iter->key());
8051 } else {
8052 break;
8053 }
8054 } else {
8055 bytes += iter->key().size() + iter->value().size();
8056 thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
8057 Slice value = iter->value();
8058 memcpy(value_buffer, value.data(),
8059 std::min(value.size(), sizeof(value_buffer)));
8060
8061 assert(iter->status().ok());
8062 }
8063 }
8064 found += key_found;
8065
8066 if (thread->shared->read_rate_limiter.get() != nullptr) {
8067 thread->shared->read_rate_limiter->Request(
8068 1, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
8069 }
8070 }
8071 delete iter;
8072
8073 char msg[100];
8074 snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found,
8075 read);
8076 thread->stats.AddBytes(bytes);
8077 thread->stats.AddMessage(msg);
8078 }
8079
8080 void TimeSeriesWrite(ThreadState* thread) {
8081 // Special thread that keeps writing until other threads are done.
8082 RandomGenerator gen;
8083 int64_t bytes = 0;
8084
8085 // Don't merge stats from this thread with the readers.
8086 thread->stats.SetExcludeFromMerge();
8087
8088 std::unique_ptr<RateLimiter> write_rate_limiter;
8089 if (FLAGS_benchmark_write_rate_limit > 0) {
8090 write_rate_limiter.reset(
8091 NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
8092 }
8093
8094 std::unique_ptr<const char[]> key_guard;
8095 Slice key = AllocateKey(&key_guard);
8096
8097 Duration duration(FLAGS_duration, writes_);
8098 while (!duration.Done(1)) {
8099 DB* db = SelectDB(thread);
8100
8101 uint64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
8102 // Write key id
8103 GenerateKeyFromInt(key_id, FLAGS_num, &key);
8104 // Write timestamp
8105
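// Key layout (assuming the default 16-byte keys): the leading 8 bytes hold
// the key id written by GenerateKeyFromInt above, and the following bytes
// hold the emulated timestamp, most-significant byte first, so entries for
// one series sort in time order.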
8106 char* start = const_cast<char*>(key.data());
8107 char* pos = start + 8;
8108 int bytes_to_fill =
8109 std::min(key_size_ - static_cast<int>(pos - start), 8);
8110 uint64_t timestamp_value = timestamp_emulator_->Get();
8111 if (port::kLittleEndian) {
8112 for (int i = 0; i < bytes_to_fill; ++i) {
8113 pos[i] = (timestamp_value >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
8114 }
8115 } else {
8116 memcpy(pos, static_cast<void*>(&timestamp_value), bytes_to_fill);
8117 }
8118
8119 timestamp_emulator_->Inc();
8120
8121 Status s;
8122 Slice val = gen.Generate();
8123 s = db->Put(write_options_, key, val);
8124
8125 if (!s.ok()) {
8126 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
8127 ErrorExit();
8128 }
8129 bytes = key.size() + val.size();
8130 thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
8131 thread->stats.AddBytes(bytes);
8132
8133 if (FLAGS_benchmark_write_rate_limit > 0) {
8134 write_rate_limiter->Request(key.size() + val.size(), Env::IO_HIGH,
8135 nullptr /* stats */,
8136 RateLimiter::OpType::kWrite);
8137 }
8138 }
8139 }
8140
8141 void TimeSeries(ThreadState* thread) {
8142 if (thread->tid > 0) {
8143 bool do_deletion = FLAGS_expire_style == "delete" &&
8144 thread->tid <= FLAGS_num_deletion_threads;
8145 TimeSeriesReadOrDelete(thread, do_deletion);
8146 } else {
8147 TimeSeriesWrite(thread);
8148 thread->stats.Stop();
8149 thread->stats.Report("timeseries write");
8150 }
8151 }
8152
8153 void Compact(ThreadState* thread) {
8154 DB* db = SelectDB(thread);
8155 CompactRangeOptions cro;
8156 cro.bottommost_level_compaction =
8157 BottommostLevelCompaction::kForceOptimized;
8158 db->CompactRange(cro, nullptr, nullptr);
8159 }
8160
8161 void CompactAll() {
8162 if (db_.db != nullptr) {
8163 db_.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
8164 }
8165 for (const auto& db_with_cfh : multi_dbs_) {
8166 db_with_cfh.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
8167 }
8168 }
8169
8170 #ifndef ROCKSDB_LITE
8171 void WaitForCompactionHelper(DBWithColumnFamilies& db) {
8172 // This is an imperfect way of waiting for compaction. The loop and sleep
8173 // are done because a thread that finishes a compaction job should get a
8174 // chance to pick up a new compaction job.
8175
8176 std::vector<std::string> keys = {DB::Properties::kMemTableFlushPending,
8177 DB::Properties::kNumRunningFlushes,
8178 DB::Properties::kCompactionPending,
8179 DB::Properties::kNumRunningCompactions};
8180
8181 fprintf(stdout, "waitforcompaction(%s): started\n",
8182 db.db->GetName().c_str());
8183
8184 while (true) {
8185 bool retry = false;
8186
8187 for (const auto& k : keys) {
8188 uint64_t v;
8189 if (!db.db->GetIntProperty(k, &v)) {
8190 fprintf(stderr, "waitforcompaction(%s): GetIntProperty(%s) failed\n",
8191 db.db->GetName().c_str(), k.c_str());
8192 exit(1);
8193 } else if (v > 0) {
8194 fprintf(stdout,
8195 "waitforcompaction(%s): active(%s). Sleep 10 seconds\n",
8196 db.db->GetName().c_str(), k.c_str());
8197 FLAGS_env->SleepForMicroseconds(10 * 1000000);
8198 retry = true;
8199 break;
8200 }
8201 }
8202
8203 if (!retry) {
8204 fprintf(stdout, "waitforcompaction(%s): finished\n",
8205 db.db->GetName().c_str());
8206 return;
8207 }
8208 }
8209 }
8210
8211 void WaitForCompaction() {
8212 // Give background threads a chance to wake
8213 FLAGS_env->SleepForMicroseconds(5 * 1000000);
8214
8215 // I am skeptical that this check is race free. I hope that checking twice
8216 // reduces the chance of a race.
8217 if (db_.db != nullptr) {
8218 WaitForCompactionHelper(db_);
8219 WaitForCompactionHelper(db_);
8220 } else {
8221 for (auto& db_with_cfh : multi_dbs_) {
8222 WaitForCompactionHelper(db_with_cfh);
8223 WaitForCompactionHelper(db_with_cfh);
8224 }
8225 }
8226 }
8227
8228 bool CompactLevelHelper(DBWithColumnFamilies& db_with_cfh, int from_level) {
8229 std::vector<LiveFileMetaData> files;
8230 db_with_cfh.db->GetLiveFilesMetaData(&files);
8231
8232 assert(from_level == 0 || from_level == 1);
8233
8234 int real_from_level = from_level;
8235 if (real_from_level > 0) {
8236 // With dynamic leveled compaction the first level with data beyond L0
8237 // might not be L1.
8238 real_from_level = std::numeric_limits<int>::max();
8239
8240 for (auto& f : files) {
8241 if (f.level > 0 && f.level < real_from_level) real_from_level = f.level;
8242 }
8243
8244 if (real_from_level == std::numeric_limits<int>::max()) {
8245 fprintf(stdout, "compact%d found 0 files to compact\n", from_level);
8246 return true;
8247 }
8248 }
8249
8250 // The goal is to compact from from_level to the level that follows it,
8251 // and with dynamic leveled compaction the next level might not be
8252 // real_from_level+1
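// Worked example (hypothetical LSM shape): with dynamic leveled compaction
// the data may live only in L0, L5 and L6. For from_level=1 the loop above
// picks real_from_level=5, the loop below finds next_level=6, and the L5
// files are then compacted into L6.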
8253 int next_level = std::numeric_limits<int>::max();
8254
8255 std::vector<std::string> files_to_compact;
8256 for (auto& f : files) {
8257 if (f.level == real_from_level)
8258 files_to_compact.push_back(f.name);
8259 else if (f.level > real_from_level && f.level < next_level)
8260 next_level = f.level;
8261 }
8262
8263 if (files_to_compact.empty()) {
8264 fprintf(stdout, "compact%d found 0 files to compact\n", from_level);
8265 return true;
8266 } else if (next_level == std::numeric_limits<int>::max()) {
8267 // There is no data beyond real_from_level. So we are done.
8268 fprintf(stdout, "compact%d found no data beyond L%d\n", from_level,
8269 real_from_level);
8270 return true;
8271 }
8272
8273 fprintf(stdout, "compact%d found %d files to compact from L%d to L%d\n",
8274 from_level, static_cast<int>(files_to_compact.size()),
8275 real_from_level, next_level);
8276
8277 ROCKSDB_NAMESPACE::CompactionOptions options;
8278 // Lets RocksDB use the configured compression for this level
8279 options.compression = ROCKSDB_NAMESPACE::kDisableCompressionOption;
8280
8281 ROCKSDB_NAMESPACE::ColumnFamilyDescriptor cfDesc;
8282 db_with_cfh.db->DefaultColumnFamily()->GetDescriptor(&cfDesc);
8283 options.output_file_size_limit = cfDesc.options.target_file_size_base;
8284
8285 Status status =
8286 db_with_cfh.db->CompactFiles(options, files_to_compact, next_level);
8287 if (!status.ok()) {
8288 // This can fail for valid reasons, including the operation being aborted
8289 // or a filename becoming invalid because background compaction removed it.
8290 // Having reviewed the current cases for which an error is raised, I prefer
8291 // not to decide here whether an exception should be thrown.
8292 fprintf(stderr, "compact%d CompactFiles failed: %s\n", from_level,
8293 status.ToString().c_str());
8294 return false;
8295 }
8296 return true;
8297 }
8298
8299 void CompactLevel(int from_level) {
8300 if (db_.db != nullptr) {
8301 while (!CompactLevelHelper(db_, from_level)) WaitForCompaction();
8302 }
8303 for (auto& db_with_cfh : multi_dbs_) {
8304 while (!CompactLevelHelper(db_with_cfh, from_level)) WaitForCompaction();
8305 }
8306 }
8307 #endif
8308
8309 void Flush() {
8310 FlushOptions flush_opt;
8311 flush_opt.wait = true;
8312
8313 if (db_.db != nullptr) {
8314 Status s;
8315 if (FLAGS_num_column_families > 1) {
8316 s = db_.db->Flush(flush_opt, db_.cfh);
8317 } else {
8318 s = db_.db->Flush(flush_opt, db_.db->DefaultColumnFamily());
8319 }
8320
8321 if (!s.ok()) {
8322 fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str());
8323 exit(1);
8324 }
8325 } else {
8326 for (const auto& db_with_cfh : multi_dbs_) {
8327 Status s;
8328 if (FLAGS_num_column_families > 1) {
8329 s = db_with_cfh.db->Flush(flush_opt, db_with_cfh.cfh);
8330 } else {
8331 s = db_with_cfh.db->Flush(flush_opt,
8332 db_with_cfh.db->DefaultColumnFamily());
8333 }
8334
8335 if (!s.ok()) {
8336 fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str());
8337 exit(1);
8338 }
8339 }
8340 }
8341 fprintf(stdout, "flush memtable\n");
8342 }
8343
8344 void ResetStats() {
8345 if (db_.db != nullptr) {
8346 db_.db->ResetStats();
8347 }
8348 for (const auto& db_with_cfh : multi_dbs_) {
8349 db_with_cfh.db->ResetStats();
8350 }
8351 }
8352
8353 void PrintStatsHistory() {
8354 if (db_.db != nullptr) {
8355 PrintStatsHistoryImpl(db_.db, false);
8356 }
8357 for (const auto& db_with_cfh : multi_dbs_) {
8358 PrintStatsHistoryImpl(db_with_cfh.db, true);
8359 }
8360 }
8361
8362 void PrintStatsHistoryImpl(DB* db, bool print_header) {
8363 if (print_header) {
8364 fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
8365 }
8366
8367 std::unique_ptr<StatsHistoryIterator> shi;
8368 Status s =
8369 db->GetStatsHistory(0, std::numeric_limits<uint64_t>::max(), &shi);
8370 if (!s.ok()) {
8371 fprintf(stdout, "%s\n", s.ToString().c_str());
8372 return;
8373 }
8374 assert(shi);
8375 while (shi->Valid()) {
8376 uint64_t stats_time = shi->GetStatsTime();
8377 fprintf(stdout, "------ %s ------\n",
8378 TimeToHumanString(static_cast<int>(stats_time)).c_str());
8379 for (auto& entry : shi->GetStatsMap()) {
8380 fprintf(stdout, " %" PRIu64 " %s %" PRIu64 "\n", stats_time,
8381 entry.first.c_str(), entry.second);
8382 }
8383 shi->Next();
8384 }
8385 }
8386
8387 void PrintStats(const char* key) {
8388 if (db_.db != nullptr) {
8389 PrintStats(db_.db, key, false);
8390 }
8391 for (const auto& db_with_cfh : multi_dbs_) {
8392 PrintStats(db_with_cfh.db, key, true);
8393 }
8394 }
8395
8396 void PrintStats(DB* db, const char* key, bool print_header = false) {
8397 if (print_header) {
8398 fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
8399 }
8400 std::string stats;
8401 if (!db->GetProperty(key, &stats)) {
8402 stats = "(failed)";
8403 }
8404 fprintf(stdout, "\n%s\n", stats.c_str());
8405 }
8406
8407 void PrintStats(const std::vector<std::string>& keys) {
8408 if (db_.db != nullptr) {
8409 PrintStats(db_.db, keys);
8410 }
8411 for (const auto& db_with_cfh : multi_dbs_) {
8412 PrintStats(db_with_cfh.db, keys, true);
8413 }
8414 }
8415
8416 void PrintStats(DB* db, const std::vector<std::string>& keys,
8417 bool print_header = false) {
8418 if (print_header) {
8419 fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
8420 }
8421
8422 for (const auto& key : keys) {
8423 std::string stats;
8424 if (!db->GetProperty(key, &stats)) {
8425 stats = "(failed)";
8426 }
8427 fprintf(stdout, "%s: %s\n", key.c_str(), stats.c_str());
8428 }
8429 }
8430
8431 #ifndef ROCKSDB_LITE
8432
8433 void Replay(ThreadState* thread) {
8434 if (db_.db != nullptr) {
8435 Replay(thread, &db_);
8436 }
8437 }
8438
8439 void Replay(ThreadState* /*thread*/, DBWithColumnFamilies* db_with_cfh) {
8440 Status s;
8441 std::unique_ptr<TraceReader> trace_reader;
8442 s = NewFileTraceReader(FLAGS_env, EnvOptions(), FLAGS_trace_file,
8443 &trace_reader);
8444 if (!s.ok()) {
8445 fprintf(
8446 stderr,
8447 "Encountered an error creating a TraceReader from the trace file. "
8448 "Error: %s\n",
8449 s.ToString().c_str());
8450 exit(1);
8451 }
8452 std::unique_ptr<Replayer> replayer;
8453 s = db_with_cfh->db->NewDefaultReplayer(db_with_cfh->cfh,
8454 std::move(trace_reader), &replayer);
8455 if (!s.ok()) {
8456 fprintf(stderr,
8457 "Encountered an error creating a default Replayer. "
8458 "Error: %s\n",
8459 s.ToString().c_str());
8460 exit(1);
8461 }
8462 s = replayer->Prepare();
8463 if (!s.ok()) {
8464 fprintf(stderr, "Prepare for replay failed. Error: %s\n",
8465 s.ToString().c_str());
8466 }
8467 s = replayer->Replay(
8468 ReplayOptions(static_cast<uint32_t>(FLAGS_trace_replay_threads),
8469 FLAGS_trace_replay_fast_forward),
8470 nullptr);
8471 replayer.reset();
8472 if (s.ok()) {
8473 fprintf(stdout, "Replay completed from trace_file: %s\n",
8474 FLAGS_trace_file.c_str());
8475 } else {
8476 fprintf(stderr, "Replay failed. Error: %s\n", s.ToString().c_str());
8477 }
8478 }
8479
8480 void Backup(ThreadState* thread) {
8481 DB* db = SelectDB(thread);
8482 std::unique_ptr<BackupEngineOptions> engine_options(
8483 new BackupEngineOptions(FLAGS_backup_dir));
8484 Status s;
8485 BackupEngine* backup_engine;
8486 if (FLAGS_backup_rate_limit > 0) {
8487 engine_options->backup_rate_limiter.reset(NewGenericRateLimiter(
8488 FLAGS_backup_rate_limit, 100000 /* refill_period_us */,
8489 10 /* fairness */, RateLimiter::Mode::kAllIo));
8490 }
8491 // Build new backup of the entire DB
8492 engine_options->destroy_old_data = true;
8493 s = BackupEngine::Open(FLAGS_env, *engine_options, &backup_engine);
8494 assert(s.ok());
8495 s = backup_engine->CreateNewBackup(db);
8496 assert(s.ok());
8497 std::vector<BackupInfo> backup_info;
8498 backup_engine->GetBackupInfo(&backup_info);
8499 // Verify that a new backup is created
8500 assert(backup_info.size() == 1);
8501 }
8502
8503 void Restore(ThreadState* /* thread */) {
8504 std::unique_ptr<BackupEngineOptions> engine_options(
8505 new BackupEngineOptions(FLAGS_backup_dir));
8506 if (FLAGS_restore_rate_limit > 0) {
8507 engine_options->restore_rate_limiter.reset(NewGenericRateLimiter(
8508 FLAGS_restore_rate_limit, 100000 /* refill_period_us */,
8509 10 /* fairness */, RateLimiter::Mode::kAllIo));
8510 }
8511 BackupEngineReadOnly* backup_engine;
8512 Status s =
8513 BackupEngineReadOnly::Open(FLAGS_env, *engine_options, &backup_engine);
8514 assert(s.ok());
8515 s = backup_engine->RestoreDBFromLatestBackup(FLAGS_restore_dir,
8516 FLAGS_restore_dir);
8517 assert(s.ok());
8518 delete backup_engine;
8519 }
8520
8521 #endif // ROCKSDB_LITE
8522 };
8523
8524 int db_bench_tool(int argc, char** argv) {
8525 ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
8526 ConfigOptions config_options;
8527 static bool initialized = false;
8528 if (!initialized) {
8529 SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
8530 " [OPTIONS]...");
8531 SetVersionString(GetRocksVersionAsString(true));
8532 initialized = true;
8533 }
8534 ParseCommandLineFlags(&argc, &argv, true);
8535 FLAGS_compaction_style_e =
8536 (ROCKSDB_NAMESPACE::CompactionStyle)FLAGS_compaction_style;
8537 #ifndef ROCKSDB_LITE
8538 if (FLAGS_statistics && !FLAGS_statistics_string.empty()) {
8539 fprintf(stderr,
8540 "Cannot provide both --statistics and --statistics_string.\n");
8541 exit(1);
8542 }
8543 if (!FLAGS_statistics_string.empty()) {
8544 Status s = Statistics::CreateFromString(config_options,
8545 FLAGS_statistics_string, &dbstats);
8546 if (dbstats == nullptr) {
8547 fprintf(stderr,
8548 "No Statistics registered matching string: %s status=%s\n",
8549 FLAGS_statistics_string.c_str(), s.ToString().c_str());
8550 exit(1);
8551 }
8552 }
8553 #endif // ROCKSDB_LITE
8554 if (FLAGS_statistics) {
8555 dbstats = ROCKSDB_NAMESPACE::CreateDBStatistics();
8556 }
8557 if (dbstats) {
8558 dbstats->set_stats_level(static_cast<StatsLevel>(FLAGS_stats_level));
8559 }
8560 FLAGS_compaction_pri_e =
8561 (ROCKSDB_NAMESPACE::CompactionPri)FLAGS_compaction_pri;
8562
8563 std::vector<std::string> fanout = ROCKSDB_NAMESPACE::StringSplit(
8564 FLAGS_max_bytes_for_level_multiplier_additional, ',');
8565 for (size_t j = 0; j < fanout.size(); j++) {
8566 FLAGS_max_bytes_for_level_multiplier_additional_v.push_back(
8567 #ifndef CYGWIN
8568 std::stoi(fanout[j]));
8569 #else
8570 stoi(fanout[j]));
8571 #endif
8572 }
8573
8574 FLAGS_compression_type_e =
8575 StringToCompressionType(FLAGS_compression_type.c_str());
8576
8577 FLAGS_wal_compression_e =
8578 StringToCompressionType(FLAGS_wal_compression.c_str());
8579
8580 FLAGS_compressed_secondary_cache_compression_type_e = StringToCompressionType(
8581 FLAGS_compressed_secondary_cache_compression_type.c_str());
8582
8583 #ifndef ROCKSDB_LITE
8584 // Stacked BlobDB
8585 FLAGS_blob_db_compression_type_e =
8586 StringToCompressionType(FLAGS_blob_db_compression_type.c_str());
8587
8588 int env_opts = !FLAGS_env_uri.empty() + !FLAGS_fs_uri.empty();
8589 if (env_opts > 1) {
8590 fprintf(stderr, "Error: --env_uri and --fs_uri are mutually exclusive\n");
8591 exit(1);
8592 }
8593
8594 if (env_opts == 1) {
8595 Status s = Env::CreateFromUri(config_options, FLAGS_env_uri, FLAGS_fs_uri,
8596 &FLAGS_env, &env_guard);
8597 if (!s.ok()) {
8598 fprintf(stderr, "Failed creating env: %s\n", s.ToString().c_str());
8599 exit(1);
8600 }
8601 } else if (FLAGS_simulate_hdd || FLAGS_simulate_hybrid_fs_file != "") {
8602 //**TODO: Make the simulated fs something that can be loaded
8603 // from the ObjectRegistry...
8604 static std::shared_ptr<ROCKSDB_NAMESPACE::Env> composite_env =
8605 NewCompositeEnv(std::make_shared<SimulatedHybridFileSystem>(
8606 FileSystem::Default(), FLAGS_simulate_hybrid_fs_file,
8607 /*throughput_multiplier=*/
8608 int{FLAGS_simulate_hybrid_hdd_multipliers},
8609 /*is_full_fs_warm=*/FLAGS_simulate_hdd));
8610 FLAGS_env = composite_env.get();
8611 }
8612
8613 // Let -readonly imply -use_existing_db
8614 FLAGS_use_existing_db |= FLAGS_readonly;
8615 #endif // ROCKSDB_LITE
8616
8617 if (FLAGS_build_info) {
8618 std::string build_info;
8619 std::cout << GetRocksBuildInfoAsString(build_info, true) << std::endl;
8620 // Similar to --version, nothing else will be done when this flag is set
8621 exit(0);
8622 }
8623
8624 if (!FLAGS_seed) {
8625 uint64_t now = FLAGS_env->GetSystemClock()->NowMicros();
8626 seed_base = static_cast<int64_t>(now);
8627 fprintf(stdout, "Set seed to %" PRIu64 " because --seed was 0\n",
8628 seed_base);
8629 } else {
8630 seed_base = FLAGS_seed;
8631 }
8632
8633 if (FLAGS_use_existing_keys && !FLAGS_use_existing_db) {
8634 fprintf(stderr,
8635 "`-use_existing_db` must be true for `-use_existing_keys` to be "
8636 "settable\n");
8637 exit(1);
8638 }
8639
8640 if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NONE"))
8641 FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NONE;
8642 else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NORMAL"))
8643 FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NORMAL;
8644 else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "SEQUENTIAL"))
8645 FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::SEQUENTIAL;
8646 else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "WILLNEED"))
8647 FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::WILLNEED;
8648 else {
8649 fprintf(stdout, "Unknown compaction fadvice:%s\n",
8650 FLAGS_compaction_fadvice.c_str());
8651 exit(1);
8652 }
8653
8654 FLAGS_value_size_distribution_type_e =
8655 StringToDistributionType(FLAGS_value_size_distribution_type.c_str());
8656
8657 // Note options sanitization may increase thread pool sizes according to
8658 // max_background_flushes/max_background_compactions/max_background_jobs
8659 FLAGS_env->SetBackgroundThreads(FLAGS_num_high_pri_threads,
8660 ROCKSDB_NAMESPACE::Env::Priority::HIGH);
8661 FLAGS_env->SetBackgroundThreads(FLAGS_num_bottom_pri_threads,
8662 ROCKSDB_NAMESPACE::Env::Priority::BOTTOM);
8663 FLAGS_env->SetBackgroundThreads(FLAGS_num_low_pri_threads,
8664 ROCKSDB_NAMESPACE::Env::Priority::LOW);
8665
8666 // Choose a location for the test database if none given with --db=<path>
8667 if (FLAGS_db.empty()) {
8668 std::string default_db_path;
8669 FLAGS_env->GetTestDirectory(&default_db_path);
8670 default_db_path += "/dbbench";
8671 FLAGS_db = default_db_path;
8672 }
8673
8674 if (FLAGS_backup_dir.empty()) {
8675 FLAGS_backup_dir = FLAGS_db + "/backup";
8676 }
8677
8678 if (FLAGS_restore_dir.empty()) {
8679 FLAGS_restore_dir = FLAGS_db + "/restore";
8680 }
8681
8682 if (FLAGS_stats_interval_seconds > 0) {
8683 // When both are set, FLAGS_stats_interval determines how often (in ops)
8684 // the timer is checked against FLAGS_stats_interval_seconds.
8685 FLAGS_stats_interval = 1000;
8686 }
8687
8688 if (FLAGS_seek_missing_prefix && FLAGS_prefix_size <= 8) {
8689 fprintf(stderr, "prefix_size > 8 required by --seek_missing_prefix\n");
8690 exit(1);
8691 }
8692
8693 ROCKSDB_NAMESPACE::Benchmark benchmark;
8694 benchmark.Run();
8695
8696 #ifndef ROCKSDB_LITE
8697 if (FLAGS_print_malloc_stats) {
8698 std::string stats_string;
8699 ROCKSDB_NAMESPACE::DumpMallocStats(&stats_string);
8700 fprintf(stdout, "Malloc stats:\n%s\n", stats_string.c_str());
8701 }
8702 #endif // ROCKSDB_LITE
8703
8704 return 0;
8705 }
8706 } // namespace ROCKSDB_NAMESPACE
8707 #endif