// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <sys/types.h>
#ifdef __APPLE__
#include <mach/host_info.h>
#include <mach/mach_host.h>
#include <sys/sysctl.h>
#endif
#ifdef __FreeBSD__
#include <sys/sysctl.h>
#endif

#include <condition_variable>
#include <unordered_map>
40 #include "db/db_impl/db_impl.h"
41 #include "db/malloc_stats.h"
42 #include "db/version_set.h"
43 #include "monitoring/histogram.h"
44 #include "monitoring/statistics.h"
45 #include "options/cf_options.h"
46 #include "port/port.h"
47 #include "port/stack_trace.h"
48 #include "rocksdb/cache.h"
49 #include "rocksdb/convenience.h"
50 #include "rocksdb/db.h"
51 #include "rocksdb/env.h"
52 #include "rocksdb/filter_policy.h"
53 #include "rocksdb/memtablerep.h"
54 #include "rocksdb/options.h"
55 #include "rocksdb/perf_context.h"
56 #include "rocksdb/persistent_cache.h"
57 #include "rocksdb/rate_limiter.h"
58 #include "rocksdb/secondary_cache.h"
59 #include "rocksdb/slice.h"
60 #include "rocksdb/slice_transform.h"
61 #include "rocksdb/stats_history.h"
62 #include "rocksdb/table.h"
63 #include "rocksdb/utilities/backup_engine.h"
64 #include "rocksdb/utilities/object_registry.h"
65 #include "rocksdb/utilities/optimistic_transaction_db.h"
66 #include "rocksdb/utilities/options_type.h"
67 #include "rocksdb/utilities/options_util.h"
69 #include "rocksdb/utilities/replayer.h"
70 #endif // ROCKSDB_LITE
71 #include "rocksdb/utilities/sim_cache.h"
72 #include "rocksdb/utilities/transaction.h"
73 #include "rocksdb/utilities/transaction_db.h"
74 #include "rocksdb/write_batch.h"
75 #include "test_util/testutil.h"
76 #include "test_util/transaction_test_util.h"
77 #include "tools/simulated_hybrid_file_system.h"
78 #include "util/cast_util.h"
79 #include "util/compression.h"
80 #include "util/crc32c.h"
81 #include "util/file_checksum_helper.h"
82 #include "util/gflags_compat.h"
83 #include "util/mutexlock.h"
84 #include "util/random.h"
85 #include "util/stderr_logger.h"
86 #include "util/string_util.h"
87 #include "util/xxhash.h"
88 #include "utilities/blob_db/blob_db.h"
89 #include "utilities/counted_fs.h"
90 #include "utilities/merge_operators.h"
91 #include "utilities/merge_operators/bytesxor.h"
92 #include "utilities/merge_operators/sortlist.h"
93 #include "utilities/persistent_cache/block_cache_tier.h"
96 #include "memory/memkind_kmem_allocator.h"
100 #include <io.h> // open/close
using GFLAGS_NAMESPACE::ParseCommandLineFlags;
using GFLAGS_NAMESPACE::RegisterFlagValidator;
using GFLAGS_NAMESPACE::SetUsageMessage;
using GFLAGS_NAMESPACE::SetVersionString;
#ifdef ROCKSDB_LITE
#define IF_ROCKSDB_LITE(Then, Else) Then
#else
#define IF_ROCKSDB_LITE(Then, Else) Else
#endif
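// Illustrative note (added for clarity, not in the original source):
// IF_ROCKSDB_LITE("", "replay,") expands to "" in LITE builds and to
// "replay," otherwise, so LITE-excluded entries can be spliced directly
// into the benchmark list and help text below.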
117 "fillseqdeterministic,"
120 "filluniquerandomdeterministic,"
124 "newiteratorwhilewriting,"
126 "seekrandomwhilewriting,"
127 "seekrandomwhilemerging,"
147 "readrandomwriterandom,"
150 "approximatesizerandom,"
165 "readrandomoperands,"
169 "Comma-separated list of operations to run in the specified"
170 " order. Available benchmarks:\n"
171 "\tfillseq -- write N values in sequential key"
172 " order in async mode\n"
173 "\tfillseqdeterministic -- write N values in the specified"
174 " key order and keep the shape of the LSM tree\n"
175 "\tfillrandom -- write N values in random key order in async"
177 "\tfilluniquerandomdeterministic -- write N values in a random"
178 " key order and keep the shape of the LSM tree\n"
179 "\toverwrite -- overwrite N values in random key order in "
181 "\tfillsync -- write N/1000 values in random key order in "
183 "\tfill100K -- write N/1000 100K values in random order in"
185 "\tdeleteseq -- delete N keys in sequential order\n"
186 "\tdeleterandom -- delete N keys in random order\n"
187 "\treadseq -- read N times sequentially\n"
188 "\treadtocache -- 1 thread reading database sequentially\n"
189 "\treadreverse -- read N times in reverse order\n"
190 "\treadrandom -- read N times in random order\n"
191 "\treadmissing -- read N missing keys in random order\n"
192 "\treadwhilewriting -- 1 writer, N threads doing random "
194 "\treadwhilemerging -- 1 merger, N threads doing random "
196 "\treadwhilescanning -- 1 thread doing full table scan, "
197 "N threads doing random reads\n"
198 "\treadrandomwriterandom -- N threads doing random-read, "
200 "\tupdaterandom -- N threads doing read-modify-write for random "
202 "\txorupdaterandom -- N threads doing read-XOR-write for "
204 "\tappendrandom -- N threads doing read-modify-write with "
206 "\tmergerandom -- same as updaterandom/appendrandom using merge"
208 "Must be used with merge_operator\n"
209 "\treadrandommergerandom -- perform N random read-or-merge "
210 "operations. Must be used with merge_operator\n"
211 "\tnewiterator -- repeated iterator creation\n"
212 "\tseekrandom -- N random seeks, call Next seek_nexts times "
214 "\tseekrandomwhilewriting -- seekrandom and 1 thread doing "
216 "\tseekrandomwhilemerging -- seekrandom and 1 thread doing "
218 "\tcrc32c -- repeated crc32c of <block size> data\n"
219 "\txxhash -- repeated xxHash of <block size> data\n"
220 "\txxhash64 -- repeated xxHash64 of <block size> data\n"
221 "\txxh3 -- repeated XXH3 of <block size> data\n"
222 "\tacquireload -- load N*1000 times\n"
223 "\tfillseekseq -- write N values in sequential key, then read "
224 "them by seeking to each key\n"
225 "\trandomtransaction -- execute N random transactions and "
226 "verify correctness\n"
227 "\trandomreplacekeys -- randomly replaces N keys by deleting "
228 "the old version and putting the new version\n\n"
229 "\ttimeseries -- 1 writer generates time series data "
230 "and multiple readers doing random reads on id\n\n"
232 "\tcompact -- Compact the entire DB; If multiple, randomly choose one\n"
233 "\tcompactall -- Compact the entire DB\n"
235 "\tcompact0 -- compact L0 into L1\n"
236 "\tcompact1 -- compact L1 into L2\n"
237 "\twaitforcompaction - pause until compaction is (probably) done\n"
239 "\tflush - flush the memtable\n"
240 "\tstats -- Print DB stats\n"
241 "\tresetstats -- Reset DB stats\n"
242 "\tlevelstats -- Print the number of files and bytes per level\n"
243 "\tmemstats -- Print memtable stats\n"
244 "\tsstables -- Print sstable info\n"
245 "\theapprofile -- Dump a heap profile (if supported by this port)\n"
247 "\treplay -- replay the trace file specified with trace_file\n"
249 "\tgetmergeoperands -- Insert lots of merge records which are a list of "
250 "sorted ints for a key and then compare performance of lookup for another "
251 "key by doing a Get followed by binary searching in the large sorted list "
252 "vs doing a GetMergeOperands and binary searching in the operands which "
253 "are sorted sub-lists. The MergeOperator used is sortlist.h\n"
254 "\treadrandomoperands -- read random keys using `GetMergeOperands()`. An "
255 "operation includes a rare but possible retry in case it got "
256 "`Status::Incomplete()`. This happens upon encountering more keys than "
257 "have ever been seen by the thread (or eight initially)\n"
258 "\tbackup -- Create a backup of the current DB and verify that a new backup is corrected. "
259 "Rate limit can be specified through --backup_rate_limit\n"
260 "\trestore -- Restore the DB from the latest backup available, rate limit can be specified through --restore_rate_limit\n");
DEFINE_int64(num, 1000000, "Number of key/values to place in database");

DEFINE_int64(numdistinct, 1000,
             "Number of distinct keys to use. Used in RandomWithVerify to "
             "read/write on fewer keys so that gets are more likely to find "
             "the key and puts are more likely to update the same key");

DEFINE_int64(merge_keys, -1,
             "Number of distinct keys to use for MergeRandom and "
             "ReadRandomMergeRandom. "
             "If negative, there will be FLAGS_num keys.");

DEFINE_int32(num_column_families, 1, "Number of Column Families to use.");
DEFINE_int32(num_hot_column_families, 0,
             "Number of Hot Column Families. If more than 0, only write to "
             "this number of column families. After finishing all the writes "
             "to them, create a new set of column families and insert into "
             "them. Only used when num_column_families > 1.");
DEFINE_string(column_family_distribution, "",
              "Comma-separated list of percentages, where the ith element "
              "indicates the probability of an op using the ith column family. "
              "The number of elements must be `num_hot_column_families` if "
              "specified; otherwise, it must be `num_column_families`. The "
              "sum of elements must be 100. E.g., if `num_column_families=4`, "
              "and `num_hot_column_families=0`, a valid list could be "
              "\"10,20,30,40\".");
DEFINE_int64(reads, -1,
             "Number of read operations to do. "
             "If negative, do FLAGS_num reads.");

DEFINE_int64(deletes, -1,
             "Number of delete operations to do. "
             "If negative, do FLAGS_num deletions.");

DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality");

DEFINE_int64(seed, 0,
             "Seed base for random number generators. "
             "When 0 it is derived from the current time.");
static int64_t seed_base;

DEFINE_int32(threads, 1, "Number of concurrent threads to run.");

DEFINE_int32(duration, 0,
             "Time in seconds for the random-ops tests to run."
             " When 0 then num & reads determine the test duration");

DEFINE_string(value_size_distribution_type, "fixed",
              "Value size distribution type: fixed, uniform, normal");

DEFINE_int32(value_size, 100, "Size of each value in fixed distribution");
static unsigned int value_size = 100;

DEFINE_int32(value_size_min, 100, "Min size of random value");

DEFINE_int32(value_size_max, 102400, "Max size of random value");

DEFINE_int32(seek_nexts, 0,
             "How many times to call Next() after Seek() in "
             "fillseekseq, seekrandom, seekrandomwhilewriting and "
             "seekrandomwhilemerging");

DEFINE_bool(reverse_iterator, false,
            "When true use Prev rather than Next for iterators that do "
            "Seek and then Next");

DEFINE_bool(auto_prefix_mode, false, "Set auto_prefix_mode for seek benchmark");

DEFINE_int64(max_scan_distance, 0,
             "Used to define iterate_upper_bound (or iterate_lower_bound "
             "if FLAGS_reverse_iterator is set to true) when value is nonzero");

DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator");

DEFINE_int64(batch_size, 1, "Batch size");
static bool ValidateKeySize(const char* /*flagname*/, int32_t /*value*/) {
  return true;
}

static bool ValidateUint32Range(const char* flagname, uint64_t value) {
  if (value > std::numeric_limits<uint32_t>::max()) {
    fprintf(stderr, "Invalid value for --%s: %lu, overflow\n", flagname,
            (unsigned long)value);
    return false;
  }
  return true;
}
DEFINE_int32(key_size, 16, "size of each key");

DEFINE_int32(user_timestamp_size, 0,
             "number of bytes in a user-defined timestamp");

DEFINE_int32(num_multi_db, 0,
             "Number of DBs used in the benchmark. 0 means single DB.");

DEFINE_double(compression_ratio, 0.5,
              "Arrange to generate values that shrink to this fraction of "
              "their original size after compression");
DEFINE_double(overwrite_probability, 0.0,
              "Used in 'filluniquerandom' benchmark: for each write operation, "
              "we give a probability to perform an overwrite instead. The key "
              "used for the overwrite is randomly chosen from the last "
              "'overwrite_window_size' keys previously inserted into the DB. "
              "Valid overwrite_probability values: [0.0, 1.0].");
DEFINE_uint32(overwrite_window_size, 1,
              "Used in 'filluniquerandom' benchmark. For each write operation,"
              " when the overwrite_probability flag is set by the user, the "
              "key used to perform an overwrite is randomly chosen from the "
              "last 'overwrite_window_size' keys previously inserted into DB. "
              "Warning: large values can affect throughput. "
              "Valid overwrite_window_size values: [1, kMaxUint32].");
DEFINE_uint64(disposable_entries_delete_delay, 0,
              "Minimum delay in microseconds for the series of Deletes "
              "to be issued. When 0 the insertion of the last disposable entry "
              "is immediately followed by the issuance of the Deletes. "
              "(only compatible with fillanddeleteuniquerandom benchmark).");
DEFINE_uint64(disposable_entries_batch_size, 0,
              "Number of consecutively inserted disposable KV entries "
              "that will be deleted after 'delete_delay' microseconds. "
              "A series of Deletes is always issued once all the "
              "disposable KV entries it targets have been inserted "
              "into the DB. When 0 no deletes are issued and a "
              "regular 'filluniquerandom' benchmark occurs. "
              "(only compatible with fillanddeleteuniquerandom benchmark)");

DEFINE_int32(disposable_entries_value_size, 64,
             "Size of the values (in bytes) of the entries targeted by "
             "selective deletes. "
             "(only compatible with fillanddeleteuniquerandom benchmark)");
DEFINE_uint64(persistent_entries_batch_size, 0,
              "Number of KV entries being inserted right before the deletes "
              "targeting the disposable KV entries are issued. These "
              "persistent keys are not targeted by the deletes, and will "
              "always remain valid in the DB. (only compatible with "
              "--benchmarks='fillanddeleteuniquerandom' "
              "and used when --disposable_entries_batch_size is > 0).");

DEFINE_int32(persistent_entries_value_size, 64,
             "Size of the values (in bytes) of the entries not targeted by "
             "deletes. (only compatible with "
             "--benchmarks='fillanddeleteuniquerandom' "
             "and used when --disposable_entries_batch_size is > 0).");
DEFINE_double(read_random_exp_range, 0.0,
              "Read random's key will be generated using distribution of "
              "num * exp(-r) where r is uniform number from 0 to this value. "
              "The larger the number is, the more skewed the reads are. "
              "Only used in readrandom and multireadrandom benchmarks.");

DEFINE_bool(histogram, false, "Print histogram of operation timings");
DEFINE_bool(confidence_interval_only, false,
            "Print 95% confidence interval upper and lower bounds only for "
            "aggregate stats.");
DEFINE_bool(enable_numa, false,
            "Make operations aware of NUMA architecture and bind memory "
            "and cpus corresponding to nodes together. In NUMA, memory "
            "in the same node as the CPUs is closer than memory in "
            "other nodes. Reads can be faster when the process is bound to "
            "CPU and memory of the same node. Use \"$numactl --hardware\" "
            "command to see NUMA memory architecture.");
DEFINE_int64(db_write_buffer_size,
             ROCKSDB_NAMESPACE::Options().db_write_buffer_size,
             "Number of bytes to buffer in all memtables before compacting");
DEFINE_bool(cost_write_buffer_to_cache, false,
            "Charge the usage of memtables to the block cache");
DEFINE_int64(arena_block_size, ROCKSDB_NAMESPACE::Options().arena_block_size,
             "The size, in bytes, of one block in arena memory allocation.");

DEFINE_int64(write_buffer_size, ROCKSDB_NAMESPACE::Options().write_buffer_size,
             "Number of bytes to buffer in memtable before compacting");

DEFINE_int32(max_write_buffer_number,
             ROCKSDB_NAMESPACE::Options().max_write_buffer_number,
             "The number of in-memory memtables. Each memtable is of size "
             "write_buffer_size bytes.");
DEFINE_int32(min_write_buffer_number_to_merge,
             ROCKSDB_NAMESPACE::Options().min_write_buffer_number_to_merge,
             "The minimum number of write buffers that will be merged together "
             "before writing to storage. This is cheap because it is an "
             "in-memory merge. If this feature is not enabled, then all these "
             "write buffers are flushed to L0 as separate files and this "
             "increases read amplification because a get request has to check "
             "in all of these files. Also, an in-memory merge may result in "
             "writing less data to storage if there are duplicate records "
             "in each of these individual write buffers.");
DEFINE_int32(max_write_buffer_number_to_maintain,
             ROCKSDB_NAMESPACE::Options().max_write_buffer_number_to_maintain,
             "The total maximum number of write buffers to maintain in memory "
             "including copies of buffers that have already been flushed. "
             "Unlike max_write_buffer_number, this parameter does not affect "
             "flushing. This controls the minimum amount of write history "
             "that will be available in memory for conflict checking when "
             "Transactions are used. If this value is too low, some "
             "transactions may fail at commit time due to not being able to "
             "determine whether there were any write conflicts. Setting this "
             "value to 0 will cause write buffers to be freed immediately "
             "after they are flushed. If this value is set to -1, "
             "'max_write_buffer_number' will be used.");

DEFINE_int64(max_write_buffer_size_to_maintain,
             ROCKSDB_NAMESPACE::Options().max_write_buffer_size_to_maintain,
             "The total maximum size of write buffers to maintain in memory "
             "including copies of buffers that have already been flushed. "
             "Unlike max_write_buffer_number, this parameter does not affect "
             "flushing. This controls the minimum amount of write history "
             "that will be available in memory for conflict checking when "
             "Transactions are used. If this value is too low, some "
             "transactions may fail at commit time due to not being able to "
             "determine whether there were any write conflicts. Setting this "
             "value to 0 will cause write buffers to be freed immediately "
             "after they are flushed. If this value is set to -1, "
             "'max_write_buffer_number' will be used.");
DEFINE_int32(max_background_jobs,
             ROCKSDB_NAMESPACE::Options().max_background_jobs,
             "The maximum number of concurrent background jobs that can occur "
             "in parallel.");
DEFINE_int32(num_bottom_pri_threads, 0,
             "The number of threads in the bottom-priority thread pool (used "
             "by universal compaction only).");

DEFINE_int32(num_high_pri_threads, 0,
             "The maximum number of concurrent background compactions"
             " that can occur in parallel.");

DEFINE_int32(num_low_pri_threads, 0,
             "The maximum number of concurrent background compactions"
             " that can occur in parallel.");

DEFINE_int32(max_background_compactions,
             ROCKSDB_NAMESPACE::Options().max_background_compactions,
             "The maximum number of concurrent background compactions"
             " that can occur in parallel.");
DEFINE_uint64(subcompactions, 1,
              "Maximum number of subcompactions to divide L0-L1 compactions "
              "into.");
static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range);
DEFINE_int32(max_background_flushes,
             ROCKSDB_NAMESPACE::Options().max_background_flushes,
             "The maximum number of concurrent background flushes"
             " that can occur in parallel.");

static ROCKSDB_NAMESPACE::CompactionStyle FLAGS_compaction_style_e;
DEFINE_int32(compaction_style,
             (int32_t)ROCKSDB_NAMESPACE::Options().compaction_style,
             "style of compaction: level-based, universal and fifo");

static ROCKSDB_NAMESPACE::CompactionPri FLAGS_compaction_pri_e;
DEFINE_int32(compaction_pri,
             (int32_t)ROCKSDB_NAMESPACE::Options().compaction_pri,
             "priority of files to compact: by size or by data age");
DEFINE_int32(universal_size_ratio, 0,
             "Percentage flexibility while comparing file size "
             "(for universal compaction only).");

DEFINE_int32(universal_min_merge_width, 0,
             "The minimum number of files in a single compaction run "
             "(for universal compaction only).");

DEFINE_int32(universal_max_merge_width, 0,
             "The max number of files to compact in universal style "
             "compaction.");
DEFINE_int32(universal_max_size_amplification_percent, 0,
             "The max size amplification for universal style compaction");

DEFINE_int32(universal_compression_size_percent, -1,
             "The percentage of the database to compress for universal "
             "compaction. -1 means compress everything.");

DEFINE_bool(universal_allow_trivial_move, false,
            "Allow trivial move in universal compaction.");

DEFINE_bool(universal_incremental, false,
            "Enable incremental compactions in universal compaction.");

DEFINE_int64(cache_size, 8 << 20,  // 8MB
             "Number of bytes to use as a cache of uncompressed data");
DEFINE_int32(cache_numshardbits, -1,
             "Number of shards for the block cache"
             " is 2 ** cache_numshardbits. Negative means use default settings."
             " This is applied only if FLAGS_cache_size is non-negative.");
DEFINE_double(cache_high_pri_pool_ratio, 0.0,
              "Ratio of block cache reserve for high pri blocks. "
              "If > 0.0, we also enable "
              "cache_index_and_filter_blocks_with_high_priority.");

DEFINE_double(cache_low_pri_pool_ratio, 0.0,
              "Ratio of block cache reserve for low pri blocks.");

DEFINE_string(cache_type, "lru_cache", "Type of block cache.");

DEFINE_bool(use_compressed_secondary_cache, false,
            "Use the CompressedSecondaryCache as the secondary cache.");

DEFINE_int64(compressed_secondary_cache_size, 8 << 20,  // 8MB
             "Number of bytes to use as a cache of data");

DEFINE_int32(compressed_secondary_cache_numshardbits, 6,
             "Number of shards for the block cache"
             " is 2 ** compressed_secondary_cache_numshardbits."
             " Negative means use default settings."
             " This is applied only if FLAGS_cache_size is non-negative.");

DEFINE_double(compressed_secondary_cache_high_pri_pool_ratio, 0.0,
              "Ratio of block cache reserve for high pri blocks. "
              "If > 0.0, we also enable "
              "cache_index_and_filter_blocks_with_high_priority.");

DEFINE_double(compressed_secondary_cache_low_pri_pool_ratio, 0.0,
              "Ratio of block cache reserve for low pri blocks.");

DEFINE_string(compressed_secondary_cache_compression_type, "lz4",
              "The compression algorithm to use for large "
              "values stored in CompressedSecondaryCache.");
static enum ROCKSDB_NAMESPACE::CompressionType
    FLAGS_compressed_secondary_cache_compression_type_e =
        ROCKSDB_NAMESPACE::kLZ4Compression;
DEFINE_uint32(compressed_secondary_cache_compress_format_version, 2,
              "compress_format_version can have two values: "
              "compress_format_version == 1 -- decompressed size is not "
              "included in the block header. "
              "compress_format_version == 2 -- decompressed size is included "
              "in the block header in varint32 format.");

DEFINE_int64(simcache_size, -1,
             "Number of bytes to use as a simcache of "
             "uncompressed data. Negative value disables simcache.");
DEFINE_bool(cache_index_and_filter_blocks, false,
            "Cache index/filter blocks in block cache.");

DEFINE_bool(use_cache_jemalloc_no_dump_allocator, false,
            "Use JemallocNodumpAllocator for block/blob cache.");

DEFINE_bool(use_cache_memkind_kmem_allocator, false,
            "Use memkind kmem allocator for block/blob cache.");

DEFINE_bool(partition_index_and_filters, false,
            "Partition index and filter blocks.");

DEFINE_bool(partition_index, false, "Partition index blocks");

DEFINE_bool(index_with_first_key, false, "Include first key in the index");
DEFINE_bool(
    optimize_filters_for_memory,
    ROCKSDB_NAMESPACE::BlockBasedTableOptions().optimize_filters_for_memory,
    "Minimize memory footprint of filters");

DEFINE_int32(
    index_shortening_mode, 2,
    "mode to shorten index: 0 for no shortening; 1 for shortening "
    "separators only; 2 for shortening separators and successor");
DEFINE_int64(metadata_block_size,
             ROCKSDB_NAMESPACE::BlockBasedTableOptions().metadata_block_size,
             "Max partition size when partitioning index/filters");

// The default reduces the overhead of reading time with flash. With HDD, which
// offers much less throughput, however, this number is better set to 1.
DEFINE_int32(ops_between_duration_checks, 1000,
             "Check duration limit every x ops");
DEFINE_bool(pin_l0_filter_and_index_blocks_in_cache, false,
            "Pin index/filter blocks of L0 files in block cache.");

DEFINE_bool(
    pin_top_level_index_and_filter, false,
    "Pin top-level index of partitioned index/filter blocks in block cache.");
DEFINE_int32(block_size,
             static_cast<int32_t>(
                 ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_size),
             "Number of bytes in a block.");

DEFINE_int32(format_version,
             static_cast<int32_t>(
                 ROCKSDB_NAMESPACE::BlockBasedTableOptions().format_version),
             "Format version of SST files.");

DEFINE_int32(block_restart_interval,
             ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_restart_interval,
             "Number of keys between restart points "
             "for delta encoding of keys in data block.");
DEFINE_int32(
    index_block_restart_interval,
    ROCKSDB_NAMESPACE::BlockBasedTableOptions().index_block_restart_interval,
    "Number of keys between restart points "
    "for delta encoding of keys in index block.");

DEFINE_int32(read_amp_bytes_per_bit,
             ROCKSDB_NAMESPACE::BlockBasedTableOptions().read_amp_bytes_per_bit,
             "Number of bytes per bit to be used in block read-amp bitmap");

DEFINE_bool(
    enable_index_compression,
    ROCKSDB_NAMESPACE::BlockBasedTableOptions().enable_index_compression,
    "Compress the index block");
DEFINE_bool(block_align,
            ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_align,
            "Align data blocks on page size");

DEFINE_int64(prepopulate_block_cache, 0,
             "Pre-populate hot/warm blocks in block cache. 0 to disable and 1 "
             "to insert during flush");
DEFINE_bool(use_data_block_hash_index, false,
            "If true, use kDataBlockBinaryAndHash "
            "instead of kDataBlockBinarySearch. "
            "This is valid only if we use BlockBasedTable");

DEFINE_double(data_block_hash_table_util_ratio, 0.75,
              "util ratio for data block hash index table. "
              "This is only valid if use_data_block_hash_index is "
              "enabled.");

DEFINE_int64(compressed_cache_size, -1,
             "Number of bytes to use as a cache of compressed data.");
DEFINE_int64(row_cache_size, 0,
             "Number of bytes to use as a cache of individual rows "
             "(0 = disabled).");

DEFINE_int32(open_files, ROCKSDB_NAMESPACE::Options().max_open_files,
             "Maximum number of files to keep open at the same time"
             " (use default if == 0)");

DEFINE_int32(file_opening_threads,
             ROCKSDB_NAMESPACE::Options().max_file_opening_threads,
             "If open_files is set to -1, this option sets the number of "
             "threads that will be used to open files during DB::Open()");

DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size");

DEFINE_int32(log_readahead_size, 0, "WAL and manifest readahead size");

DEFINE_int32(random_access_max_buffer_size, 1024 * 1024,
             "Maximum windows randomaccess buffer size");

DEFINE_int32(writable_file_max_buffer_size, 1024 * 1024,
             "Maximum write buffer for Writable File");
DEFINE_int32(bloom_bits, -1,
             "Bloom filter bits per key. Negative means use default.");

DEFINE_bool(use_ribbon_filter, false, "Use Ribbon instead of Bloom filter");

DEFINE_double(memtable_bloom_size_ratio, 0,
              "Ratio of memtable size used for bloom filter. 0 means no bloom "
              "filter.");
DEFINE_bool(memtable_whole_key_filtering, false,
            "Try to use whole key bloom filter in memtables.");
DEFINE_bool(memtable_use_huge_page, false,
            "Try to use huge page in memtables.");

DEFINE_bool(whole_key_filtering,
            ROCKSDB_NAMESPACE::BlockBasedTableOptions().whole_key_filtering,
            "Use whole keys (in addition to prefixes) in SST bloom filter.");
DEFINE_bool(use_existing_db, false,
            "If true, do not destroy the existing database. If you set this "
            "flag and also specify a benchmark that wants a fresh database, "
            "that benchmark will fail.");

DEFINE_bool(use_existing_keys, false,
            "If true, uses existing keys in the DB, "
            "rather than generating new ones. This involves some startup "
            "latency to load all keys into memory. It is supported for the "
            "same read/overwrite benchmarks as `-use_existing_db=true`, which "
            "must also be set for this flag to be enabled. When this flag is "
            "set, the value for `-num` will be ignored.");

DEFINE_bool(show_table_properties, false,
            "If true, then per-level table"
            " properties will be printed on every stats-interval when"
            " stats_interval is set and stats_per_interval is on.");

DEFINE_string(db, "", "Use the db with the following name.");

DEFINE_bool(progress_reports, true,
            "If true, db_bench will report number of finished operations.");

DEFINE_string(read_cache_path, "",
              "If not empty string, a read cache will be used in this path");

DEFINE_int64(read_cache_size, 4LL * 1024 * 1024 * 1024,
             "Maximum size of the read cache");

DEFINE_bool(read_cache_direct_write, true,
            "Whether to use Direct IO for writing to the read cache");

DEFINE_bool(read_cache_direct_read, true,
            "Whether to use Direct IO for reading from read cache");

DEFINE_bool(use_keep_filter, false, "Whether to use a noop compaction filter");
static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) {
  if (value >= 20) {
    fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n", flagname,
            value);
    return false;
  }
  return true;
}
DEFINE_bool(verify_checksum, true,
            "Verify checksum for every block read from storage");

DEFINE_int32(checksum_type,
             ROCKSDB_NAMESPACE::BlockBasedTableOptions().checksum,
             "ChecksumType as an int");

DEFINE_bool(statistics, false, "Database statistics");
DEFINE_int32(stats_level, ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers,
             "stats level for statistics");
DEFINE_string(statistics_string, "", "Serialized statistics string");
static std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats;
DEFINE_int64(writes, -1,
             "Number of write operations to do. If negative, do --num writes.");

DEFINE_bool(finish_after_writes, false,
            "Write thread terminates after all writes are finished");

DEFINE_bool(sync, false, "Sync all writes to disk");

DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");

DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");

DEFINE_bool(manual_wal_flush, false,
            "If true, buffer WAL until buffer is full or a manual FlushWAL().");

DEFINE_string(wal_compression, "none",
              "Algorithm to use for WAL compression. none to disable.");
static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_wal_compression_e =
    ROCKSDB_NAMESPACE::kNoCompression;
DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL");

DEFINE_string(truth_db, "/dev/shm/truth_db/dbbench",
              "Truth key/values used when using verify");

DEFINE_int32(num_levels, 7, "The total number of levels");

DEFINE_int64(target_file_size_base,
             ROCKSDB_NAMESPACE::Options().target_file_size_base,
             "Target file size at level-1");

DEFINE_int32(target_file_size_multiplier,
             ROCKSDB_NAMESPACE::Options().target_file_size_multiplier,
             "A multiplier to compute target level-N file size (N >= 2)");

DEFINE_uint64(max_bytes_for_level_base,
              ROCKSDB_NAMESPACE::Options().max_bytes_for_level_base,
              "Max bytes for level-1");

DEFINE_bool(level_compaction_dynamic_level_bytes, false,
            "Whether level size base is dynamic");

DEFINE_double(max_bytes_for_level_multiplier, 10,
              "A multiplier to compute max bytes for level-N (N >= 2)");
static std::vector<int> FLAGS_max_bytes_for_level_multiplier_additional_v;
DEFINE_string(max_bytes_for_level_multiplier_additional, "",
              "A vector that specifies additional fanout per level");

DEFINE_int32(level0_stop_writes_trigger,
             ROCKSDB_NAMESPACE::Options().level0_stop_writes_trigger,
             "Number of files in level-0 that will trigger put stop.");

DEFINE_int32(level0_slowdown_writes_trigger,
             ROCKSDB_NAMESPACE::Options().level0_slowdown_writes_trigger,
             "Number of files in level-0 that will slow down writes.");

DEFINE_int32(level0_file_num_compaction_trigger,
             ROCKSDB_NAMESPACE::Options().level0_file_num_compaction_trigger,
             "Number of files in level-0 when compactions start.");

DEFINE_uint64(periodic_compaction_seconds,
              ROCKSDB_NAMESPACE::Options().periodic_compaction_seconds,
              "Files older than this will be picked up for compaction and"
              " rewritten to the same level");

DEFINE_uint64(ttl_seconds, ROCKSDB_NAMESPACE::Options().ttl, "Set options.ttl");

static bool ValidateInt32Percent(const char* flagname, int32_t value) {
  if (value <= 0 || value >= 100) {
    fprintf(stderr, "Invalid value for --%s: %d, 0 < pct < 100\n", flagname,
            value);
    return false;
  }
  return true;
}
DEFINE_int32(readwritepercent, 90,
             "Ratio of reads to reads/writes (expressed as percentage) for "
             "the ReadRandomWriteRandom workload. The default value 90 means "
             "90% of all read and write operations are reads. In other "
             "words, 9 gets for every 1 put.");

DEFINE_int32(mergereadpercent, 70,
             "Ratio of merges to merges&reads (expressed as percentage) for "
             "the ReadRandomMergeRandom workload. The default value 70 means "
             "70% out of all read and merge operations are merges. In other "
             "words, 7 merges for every 3 gets.");

DEFINE_int32(deletepercent, 2,
             "Percentage of deletes out of reads/writes/deletes (used in "
             "RandomWithVerify only). RandomWithVerify "
             "calculates writepercent as (100 - FLAGS_readwritepercent - "
             "deletepercent), so deletepercent must be smaller than (100 - "
             "FLAGS_readwritepercent)");
DEFINE_bool(optimize_filters_for_hits,
            ROCKSDB_NAMESPACE::Options().optimize_filters_for_hits,
            "Optimizes bloom filters for workloads where most lookups return "
            "a value. For now this doesn't create bloom filters for the max "
            "level of the LSM to reduce metadata that should fit in RAM. ");

DEFINE_bool(paranoid_checks, ROCKSDB_NAMESPACE::Options().paranoid_checks,
            "RocksDB will aggressively check consistency of the data.");

DEFINE_bool(force_consistency_checks,
            ROCKSDB_NAMESPACE::Options().force_consistency_checks,
            "Runs consistency checks on the LSM every time a change is "
            "applied.");

DEFINE_bool(check_flush_compaction_key_order,
            ROCKSDB_NAMESPACE::Options().check_flush_compaction_key_order,
            "During flush or compaction, check whether keys inserted into "
            "output files are in order.");
DEFINE_uint64(delete_obsolete_files_period_micros, 0,
              "Ignored. Left here for backward compatibility");

DEFINE_int64(writes_before_delete_range, 0,
             "Number of writes before DeleteRange is called regularly.");

DEFINE_int64(writes_per_range_tombstone, 0,
             "Number of writes between range tombstones");

DEFINE_int64(range_tombstone_width, 100, "Number of keys in tombstone's range");

DEFINE_int64(max_num_range_tombstones, 0,
             "Maximum number of range tombstones to insert.");

DEFINE_bool(expand_range_tombstones, false,
            "Expand range tombstone into sequential regular tombstones.");
#ifndef ROCKSDB_LITE
// Transactions Options
DEFINE_bool(optimistic_transaction_db, false,
            "Open an OptimisticTransactionDB instance. "
            "Required for randomtransaction benchmark.");

DEFINE_bool(transaction_db, false,
            "Open a TransactionDB instance. "
            "Required for randomtransaction benchmark.");

DEFINE_uint64(transaction_sets, 2,
              "Number of keys each transaction will "
              "modify (use in RandomTransaction only). Max: 9999");

DEFINE_bool(transaction_set_snapshot, false,
            "Setting to true will have each transaction call SetSnapshot()"
            " upon creation.");

DEFINE_int32(transaction_sleep, 0,
             "Max microseconds to sleep in between "
             "reading and writing a value (used in RandomTransaction only). ");

DEFINE_uint64(transaction_lock_timeout, 100,
              "If using a transaction_db, specifies the lock wait timeout in"
              " milliseconds before failing a transaction waiting on a lock");
973 "The path to a RocksDB options file. If specified, then db_bench will "
974 "run with the RocksDB options in the default column family of the "
975 "specified options file. "
976 "Note that with this setting, db_bench will ONLY accept the following "
977 "RocksDB options related command-line arguments, all other arguments "
978 "that are related to RocksDB options will be ignored:\n"
979 "\t--use_existing_db\n"
980 "\t--use_existing_keys\n"
982 "\t--row_cache_size\n"
983 "\t--row_cache_numshardbits\n"
984 "\t--enable_io_prio\n"
985 "\t--dump_malloc_stats\n"
986 "\t--num_multi_db\n");
// FIFO Compaction Options
DEFINE_uint64(fifo_compaction_max_table_files_size_mb, 0,
              "The limit of total table file sizes to trigger FIFO compaction");

DEFINE_bool(fifo_compaction_allow_compaction, true,
            "Allow compaction in FIFO compaction.");

DEFINE_uint64(fifo_compaction_ttl, 0, "TTL for the SST Files in seconds.");

DEFINE_uint64(fifo_age_for_warm, 0, "age_for_warm for FIFO compaction.");
// Stacked BlobDB Options
DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Open a BlobDB instance.");
DEFINE_bool(
    blob_db_enable_gc,
    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection,
    "[Stacked BlobDB] Enable BlobDB garbage collection.");

DEFINE_double(
    blob_db_gc_cutoff,
    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff,
    "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection.");
DEFINE_bool(blob_db_is_fifo, ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().is_fifo,
            "[Stacked BlobDB] Enable FIFO eviction strategy in BlobDB.");

DEFINE_uint64(blob_db_max_db_size,
              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().max_db_size,
              "[Stacked BlobDB] Max size limit of the directory where blob "
              "files are stored.");

DEFINE_uint64(blob_db_max_ttl_range, 0,
              "[Stacked BlobDB] TTL range to generate BlobDB data (in "
              "seconds). 0 means no TTL.");

DEFINE_uint64(blob_db_ttl_range_secs,
              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().ttl_range_secs,
              "[Stacked BlobDB] TTL bucket size to use when creating blob "
              "files.");

DEFINE_uint64(blob_db_min_blob_size,
              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size,
              "[Stacked BlobDB] Smallest blob to store in a file. Blobs "
              "smaller than this will be inlined with the key in the LSM tree.");

DEFINE_uint64(blob_db_bytes_per_sync,
              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync,
              "[Stacked BlobDB] Bytes to sync blob file at.");

DEFINE_uint64(blob_db_file_size,
              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size,
              "[Stacked BlobDB] Target size of each blob file.");

DEFINE_string(
    blob_db_compression_type, "snappy",
    "[Stacked BlobDB] Algorithm to use to compress blobs in blob files.");
static enum ROCKSDB_NAMESPACE::CompressionType
    FLAGS_blob_db_compression_type_e = ROCKSDB_NAMESPACE::kSnappyCompression;
#endif  // ROCKSDB_LITE

// Integrated BlobDB options
DEFINE_bool(
    enable_blob_files,
    ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().enable_blob_files,
    "[Integrated BlobDB] Enable writing large values to separate blob files.");

DEFINE_uint64(min_blob_size,
              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().min_blob_size,
              "[Integrated BlobDB] The size of the smallest value to be stored "
              "separately in a blob file.");

DEFINE_uint64(blob_file_size,
              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_size,
              "[Integrated BlobDB] The size limit for blob files.");

DEFINE_string(blob_compression_type, "none",
              "[Integrated BlobDB] The compression algorithm to use for large "
              "values stored in blob files.");

DEFINE_bool(enable_blob_garbage_collection,
            ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
                .enable_blob_garbage_collection,
            "[Integrated BlobDB] Enable blob garbage collection.");

DEFINE_double(blob_garbage_collection_age_cutoff,
              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
                  .blob_garbage_collection_age_cutoff,
              "[Integrated BlobDB] The cutoff in terms of blob file age for "
              "garbage collection.");

DEFINE_double(blob_garbage_collection_force_threshold,
              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
                  .blob_garbage_collection_force_threshold,
              "[Integrated BlobDB] The threshold for the ratio of garbage in "
              "the oldest blob files for forcing garbage collection.");

DEFINE_uint64(blob_compaction_readahead_size,
              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
                  .blob_compaction_readahead_size,
              "[Integrated BlobDB] Compaction readahead for blob files.");

DEFINE_int32(
    blob_file_starting_level,
    ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_starting_level,
    "[Integrated BlobDB] The starting level for blob files.");

DEFINE_bool(use_blob_cache, false, "[Integrated BlobDB] Enable blob cache.");

DEFINE_bool(
    use_shared_block_and_blob_cache, true,
    "[Integrated BlobDB] Use a shared backing cache for both block "
    "cache and blob cache. It only takes effect if use_blob_cache is enabled.");

DEFINE_uint64(
    blob_cache_size, 8 << 20,
    "[Integrated BlobDB] Number of bytes to use as a cache of blobs. It only "
    "takes effect if the block and blob caches are different "
    "(use_shared_block_and_blob_cache = false).");

DEFINE_int32(blob_cache_numshardbits, 6,
             "[Integrated BlobDB] Number of shards for the blob cache is 2 ** "
             "blob_cache_numshardbits. Negative means use default settings. "
             "It only takes effect if blob_cache_size is greater than 0, and "
             "the block and blob caches are different "
             "(use_shared_block_and_blob_cache = false).");

DEFINE_int32(prepopulate_blob_cache, 0,
             "[Integrated BlobDB] Pre-populate hot/warm blobs in blob cache. 0 "
             "to disable and 1 to insert during flush.");
#ifndef ROCKSDB_LITE

// Secondary DB instance Options
DEFINE_bool(use_secondary_db, false,
            "Open a RocksDB secondary instance. A primary instance can be "
            "running in another db_bench process.");

DEFINE_string(secondary_path, "",
              "Path to a directory used by the secondary instance to store "
              "private files, e.g. info log.");

DEFINE_int32(secondary_update_interval, 5,
             "Secondary instance attempts to catch up with the primary every "
             "secondary_update_interval seconds.");

#endif  // ROCKSDB_LITE
DEFINE_bool(report_bg_io_stats, false,
            "Measure time spent on I/O while in compactions. ");

DEFINE_bool(use_stderr_info_logger, false,
            "Write info logs to stderr instead of to LOG file. ");
#ifndef ROCKSDB_LITE

DEFINE_string(trace_file, "", "Trace workload to a file. ");

DEFINE_double(trace_replay_fast_forward, 1.0,
              "Fast forward trace replay, must be > 0.0.");
DEFINE_int32(block_cache_trace_sampling_frequency, 1,
             "Block cache trace sampling frequency, termed s. It uses spatial "
             "downsampling and samples accesses to one out of s blocks.");
DEFINE_uint64(
    block_cache_trace_max_trace_file_size_in_bytes,
    uint64_t{64} * 1024 * 1024 * 1024,
    "The maximum block cache trace file size in bytes. Block cache accesses "
    "will not be logged if the trace file size exceeds this threshold. Default "
    "is 64 GB.");
DEFINE_string(block_cache_trace_file, "", "Block cache trace file path.");
DEFINE_int32(trace_replay_threads, 1,
             "The number of threads to replay, must be >= 1.");

DEFINE_bool(io_uring_enabled, true,
            "If true, enable the use of IO uring if the platform supports it");
extern "C" bool RocksDbIOUringEnable() { return FLAGS_io_uring_enabled; }
#endif  // ROCKSDB_LITE
DEFINE_bool(adaptive_readahead, false,
            "carry forward internal auto readahead size from one file to next "
            "file at each level during iteration");

DEFINE_bool(rate_limit_user_ops, false,
            "When true use Env::IO_USER priority level to charge internal rate "
            "limiter for reads associated with user operations.");

DEFINE_bool(file_checksum, false,
            "When true use FileChecksumGenCrc32cFactory for "
            "file_checksum_gen_factory.");

DEFINE_bool(rate_limit_auto_wal_flush, false,
            "When true use Env::IO_USER priority level to charge internal rate "
            "limiter for automatic WAL flush (`Options::manual_wal_flush` == "
            "false) after the user write operation.");

DEFINE_bool(async_io, false,
            "When set true, RocksDB does asynchronous reads for internal auto "
            "readahead prefetching.");

DEFINE_bool(optimize_multiget_for_io, true,
            "When set true, RocksDB does asynchronous reads for SST files in "
            "multiple levels for MultiGet.");
DEFINE_bool(charge_compression_dictionary_building_buffer, false,
            "Setting for "
            "CacheEntryRoleOptions::charged of "
            "CacheEntryRole::kCompressionDictionaryBuildingBuffer");

DEFINE_bool(charge_filter_construction, false,
            "Setting for "
            "CacheEntryRoleOptions::charged of "
            "CacheEntryRole::kFilterConstruction");

DEFINE_bool(charge_table_reader, false,
            "Setting for "
            "CacheEntryRoleOptions::charged of "
            "CacheEntryRole::kBlockBasedTableReader");

DEFINE_bool(charge_file_metadata, false,
            "Setting for "
            "CacheEntryRoleOptions::charged of "
            "CacheEntryRole::kFileMetadata");

DEFINE_bool(charge_blob_cache, false,
            "Setting for "
            "CacheEntryRoleOptions::charged of "
            "CacheEntryRole::kBlobCache");
DEFINE_uint64(backup_rate_limit, 0ull,
              "If non-zero, db_bench will rate limit reads and writes for DB "
              "backup. This "
              "is the global rate in ops/second.");

DEFINE_uint64(restore_rate_limit, 0ull,
              "If non-zero, db_bench will rate limit reads and writes for DB "
              "restore. This "
              "is the global rate in ops/second.");

DEFINE_string(backup_dir, "",
              "If not empty string, use the given dir for backup.");

DEFINE_string(restore_dir, "",
              "If not empty string, use the given dir for restore.");
DEFINE_uint64(
    initial_auto_readahead_size,
    ROCKSDB_NAMESPACE::BlockBasedTableOptions().initial_auto_readahead_size,
    "RocksDB does auto-readahead for iterators on noticing more than two reads "
    "for a table file if user doesn't provide readahead_size. The readahead "
    "size starts at initial_auto_readahead_size");

DEFINE_uint64(
    max_auto_readahead_size,
    ROCKSDB_NAMESPACE::BlockBasedTableOptions().max_auto_readahead_size,
    "Rocksdb implicit readahead starts at "
    "BlockBasedTableOptions.initial_auto_readahead_size and doubles on every "
    "additional read up to max_auto_readahead_size");

DEFINE_uint64(
    num_file_reads_for_auto_readahead,
    ROCKSDB_NAMESPACE::BlockBasedTableOptions()
        .num_file_reads_for_auto_readahead,
    "Rocksdb implicit readahead is enabled if reads are sequential and "
    "num_file_reads_for_auto_readahead indicates after how many sequential "
    "reads into that file internal auto prefetching should start.");
static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType(
    const char* ctype) {
  if (!strcasecmp(ctype, "none"))
    return ROCKSDB_NAMESPACE::kNoCompression;
  else if (!strcasecmp(ctype, "snappy"))
    return ROCKSDB_NAMESPACE::kSnappyCompression;
  else if (!strcasecmp(ctype, "zlib"))
    return ROCKSDB_NAMESPACE::kZlibCompression;
  else if (!strcasecmp(ctype, "bzip2"))
    return ROCKSDB_NAMESPACE::kBZip2Compression;
  else if (!strcasecmp(ctype, "lz4"))
    return ROCKSDB_NAMESPACE::kLZ4Compression;
  else if (!strcasecmp(ctype, "lz4hc"))
    return ROCKSDB_NAMESPACE::kLZ4HCCompression;
  else if (!strcasecmp(ctype, "xpress"))
    return ROCKSDB_NAMESPACE::kXpressCompression;
  else if (!strcasecmp(ctype, "zstd"))
    return ROCKSDB_NAMESPACE::kZSTD;
  else {
    fprintf(stderr, "Cannot parse compression type '%s'\n", ctype);
    exit(1);
  }
}
static std::string ColumnFamilyName(size_t i) {
  if (i == 0) {
    return ROCKSDB_NAMESPACE::kDefaultColumnFamilyName;
  } else {
    char name[100];
    snprintf(name, sizeof(name), "column_family_name_%06zu", i);
    return std::string(name);
  }
}
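// Illustrative usage (added for clarity, not in the original source):
// ColumnFamilyName(0) returns the default column family name, while e.g.
// ColumnFamilyName(1) returns "column_family_name_000001".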
DEFINE_string(compression_type, "snappy",
              "Algorithm to use to compress the database");
static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_compression_type_e =
    ROCKSDB_NAMESPACE::kSnappyCompression;
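// Illustrative sketch (an assumption; the resolution step is not shown in
// this excerpt): the string flag is typically mapped onto the enum above at
// startup, e.g.
//   FLAGS_compression_type_e =
//       StringToCompressionType(FLAGS_compression_type.c_str());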
DEFINE_int64(sample_for_compression, 0, "Sample every N block for compression");

DEFINE_int32(compression_level, ROCKSDB_NAMESPACE::CompressionOptions().level,
             "Compression level. The meaning of this value is library-"
             "dependent. If unset, we try to use the default for the library "
             "specified in `--compression_type`");

DEFINE_int32(compression_max_dict_bytes,
             ROCKSDB_NAMESPACE::CompressionOptions().max_dict_bytes,
             "Maximum size of dictionary used to prime the compression "
             "library.");

DEFINE_int32(compression_zstd_max_train_bytes,
             ROCKSDB_NAMESPACE::CompressionOptions().zstd_max_train_bytes,
             "Maximum size of training data passed to zstd's dictionary "
             "trainer.");

DEFINE_int32(min_level_to_compress, -1,
             "If non-negative, compression starts"
             " from this level. Levels with number < min_level_to_compress are"
             " not compressed. Otherwise, apply compression_type to "
             "all levels.");

DEFINE_int32(compression_parallel_threads, 1,
             "Number of threads for parallel compression.");

DEFINE_uint64(compression_max_dict_buffer_bytes,
              ROCKSDB_NAMESPACE::CompressionOptions().max_dict_buffer_bytes,
              "Maximum bytes to buffer to collect samples for dictionary.");

DEFINE_bool(compression_use_zstd_dict_trainer,
            ROCKSDB_NAMESPACE::CompressionOptions().use_zstd_dict_trainer,
            "If true, use ZSTD_TrainDictionary() to create dictionary, else "
            "use ZSTD_FinalizeDictionary() to create dictionary");
static bool ValidateTableCacheNumshardbits(const char* flagname,
                                           int32_t value) {
  if (0 >= value || value >= 20) {
    fprintf(stderr, "Invalid value for --%s: %d, must be 0 < val < 20\n",
            flagname, value);
    return false;
  }
  return true;
}
DEFINE_int32(table_cache_numshardbits, 4, "");
#ifndef ROCKSDB_LITE
DEFINE_string(env_uri, "",
              "URI for registry Env lookup. Mutually exclusive with --fs_uri");
DEFINE_string(fs_uri, "",
              "URI for registry Filesystem lookup. Mutually exclusive"
              " with --env_uri."
              " Creates a default environment with the specified filesystem.");
#endif  // ROCKSDB_LITE
DEFINE_string(simulate_hybrid_fs_file, "",
              "File to store metadata for the simulated hybrid FS. Empty means "
              "disable the feature. If it is set, last_level_temperature "
              "is set to kWarm.");
DEFINE_int32(simulate_hybrid_hdd_multipliers, 1,
             "In simulate_hybrid_fs_file or simulate_hdd mode, how many HDDs "
             "are simulated.");
DEFINE_bool(simulate_hdd, false, "Simulate read/write latency on HDD.");
DEFINE_int64(
    preclude_last_level_data_seconds, 0,
    "Preclude the latest data from the last level. (Used for tiered storage)");

DEFINE_int64(preserve_internal_time_seconds, 0,
             "Preserve the internal time information, which is stored with "
             "the SST files.");

static std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard;

static ROCKSDB_NAMESPACE::Env* FLAGS_env = ROCKSDB_NAMESPACE::Env::Default();
DEFINE_int64(stats_interval, 0,
             "Stats are reported every N operations when this is greater than "
             "zero. When 0 the interval grows over time.");

DEFINE_int64(stats_interval_seconds, 0,
             "Report stats every N seconds. This overrides stats_interval when"
             " both are > 0.");

DEFINE_int32(stats_per_interval, 0,
             "Reports additional stats per interval when this is greater than "
             "0.");

DEFINE_uint64(slow_usecs, 1000000,
              "A message is printed for operations that take at least this "
              "many microseconds.");

DEFINE_int64(report_interval_seconds, 0,
             "If greater than zero, it will write simple stats in CSV format "
             "to --report_file every N seconds");

DEFINE_string(report_file, "report.csv",
              "Filename where some simple stats are reported to (if "
              "--report_interval_seconds is bigger than 0)");

DEFINE_int32(thread_status_per_interval, 0,
             "Takes and reports a snapshot of the current status of each "
             "thread when this is greater than 0.");
DEFINE_int32(perf_level, ROCKSDB_NAMESPACE::PerfLevel::kDisable,
             "Level of perf collection");

DEFINE_uint64(soft_pending_compaction_bytes_limit, 64ull * 1024 * 1024 * 1024,
              "Slowdown writes if pending compaction bytes exceed this number");

DEFINE_uint64(hard_pending_compaction_bytes_limit, 128ull * 1024 * 1024 * 1024,
              "Stop writes if pending compaction bytes exceed this number");

DEFINE_uint64(delayed_write_rate, 8388608u,
              "Limited bytes allowed to DB when soft_rate_limit or "
              "level0_slowdown_writes_trigger triggers");

DEFINE_bool(enable_pipelined_write, true,
            "Allow WAL and memtable writes to be pipelined");
DEFINE_bool(
    unordered_write, false,
    "Enable the unordered write feature, which provides higher throughput but "
    "relaxes the guarantees around atomic reads and immutable snapshots");
DEFINE_bool(allow_concurrent_memtable_write, true,
            "Allow multi-writers to update mem tables in parallel.");

DEFINE_double(experimental_mempurge_threshold, 0.0,
              "Maximum useful payload ratio estimate that triggers a mempurge "
              "(memtable garbage collection).");

DEFINE_bool(inplace_update_support,
            ROCKSDB_NAMESPACE::Options().inplace_update_support,
            "Support in-place memtable update for smaller or same-size values");

DEFINE_uint64(inplace_update_num_locks,
              ROCKSDB_NAMESPACE::Options().inplace_update_num_locks,
              "Number of RW locks to protect in-place memtable updates");

DEFINE_bool(enable_write_thread_adaptive_yield, true,
            "Use a yielding spin loop for brief writer thread waits.");

DEFINE_uint64(
    write_thread_max_yield_usec, 100,
    "Maximum microseconds for enable_write_thread_adaptive_yield operation.");

DEFINE_uint64(write_thread_slow_yield_usec, 3,
              "The threshold at which a slow yield is considered a signal that "
              "other processes or threads want the core.");

DEFINE_uint64(rate_limiter_bytes_per_sec, 0, "Set options.rate_limiter value.");

DEFINE_int64(rate_limiter_refill_period_us, 100 * 1000,
             "Set refill period on rate limiter.");
DEFINE_bool(rate_limiter_auto_tuned, false,
            "Enable dynamic adjustment of rate limit according to demand for "
            "background I/O");
DEFINE_bool(sine_write_rate, false, "Use a sine wave write_rate_limit");

DEFINE_uint64(
    sine_write_rate_interval_milliseconds, 10000,
    "Interval of which the sine wave write_rate_limit is recalculated");

DEFINE_double(sine_a, 1, "A in f(x) = A sin(bx + c) + d");

DEFINE_double(sine_b, 1, "B in f(x) = A sin(bx + c) + d");

DEFINE_double(sine_c, 0, "C in f(x) = A sin(bx + c) + d");

DEFINE_double(sine_d, 1, "D in f(x) = A sin(bx + c) + d");
DEFINE_bool(rate_limit_bg_reads, false,
            "Use options.rate_limiter on compaction reads");

DEFINE_uint64(
    benchmark_write_rate_limit, 0,
    "If non-zero, db_bench will rate-limit the writes going into RocksDB. This "
    "is the global rate in bytes/second.");
// the parameters of mix_graph
DEFINE_double(keyrange_dist_a, 0.0,
              "The parameter 'a' of prefix average access distribution "
              "f(x)=a*exp(b*x)+c*exp(d*x)");
DEFINE_double(keyrange_dist_b, 0.0,
              "The parameter 'b' of prefix average access distribution "
              "f(x)=a*exp(b*x)+c*exp(d*x)");
DEFINE_double(keyrange_dist_c, 0.0,
              "The parameter 'c' of prefix average access distribution "
              "f(x)=a*exp(b*x)+c*exp(d*x)");
DEFINE_double(keyrange_dist_d, 0.0,
              "The parameter 'd' of prefix average access distribution "
              "f(x)=a*exp(b*x)+c*exp(d*x)");
DEFINE_int64(keyrange_num, 1,
             "The number of key ranges that are in the same prefix "
             "group, each prefix range will have its key access distribution");
DEFINE_double(key_dist_a, 0.0,
              "The parameter 'a' of key access distribution model f(x)=a*x^b");
DEFINE_double(key_dist_b, 0.0,
              "The parameter 'b' of key access distribution model f(x)=a*x^b");
DEFINE_double(value_theta, 0.0,
              "The parameter 'theta' of Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
// Use reasonable defaults based on the mixgraph paper
DEFINE_double(value_k, 0.2615,
              "The parameter 'k' of Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
// Use reasonable defaults based on the mixgraph paper
DEFINE_double(value_sigma, 25.45,
              "The parameter 'sigma' of Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(iter_theta, 0.0,
              "The parameter 'theta' of Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
// Use reasonable defaults based on the mixgraph paper
DEFINE_double(iter_k, 2.517,
              "The parameter 'k' of Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
// Use reasonable defaults based on the mixgraph paper
DEFINE_double(iter_sigma, 14.236,
              "The parameter 'sigma' of Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1520 DEFINE_double(mix_get_ratio
, 1.0,
1521 "The ratio of Get queries of mix_graph workload");
1522 DEFINE_double(mix_put_ratio
, 0.0,
1523 "The ratio of Put queries of mix_graph workload");
1524 DEFINE_double(mix_seek_ratio
, 0.0,
1525 "The ratio of Seek queries of mix_graph workload");
1526 DEFINE_int64(mix_max_scan_len
, 10000, "The max scan length of Iterator");
1527 DEFINE_int64(mix_max_value_size
, 1024, "The max value size of this workload");
1529 sine_mix_rate_noise
, 0.0,
1530 "Add the noise ratio to the sine rate, it is between 0.0 and 1.0");
1531 DEFINE_bool(sine_mix_rate
, false,
1532 "Enable the sine QPS control on the mix workload");
1534 sine_mix_rate_interval_milliseconds
, 10000,
1535 "Interval of which the sine wave read_rate_limit is recalculated");
1536 DEFINE_int64(mix_accesses
, -1,
1537 "The total query accesses of mix_graph workload");
DEFINE_uint64(
    benchmark_read_rate_limit, 0,
    "If non-zero, db_bench will rate-limit the reads from RocksDB. This "
    "is the global rate in ops/second.");

DEFINE_uint64(max_compaction_bytes,
              ROCKSDB_NAMESPACE::Options().max_compaction_bytes,
              "Max bytes allowed in one compaction");

#ifndef ROCKSDB_LITE
DEFINE_bool(readonly, false, "Run read only benchmarks.");

DEFINE_bool(print_malloc_stats, false,
            "Print malloc stats to stdout after benchmarks finish.");
#endif  // ROCKSDB_LITE

DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions");

DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds.");
DEFINE_uint64(wal_size_limit_MB, 0,
              "Set the size limit for the WAL Files in MB.");
DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size");

DEFINE_bool(mmap_read, ROCKSDB_NAMESPACE::Options().allow_mmap_reads,
            "Allow reads to occur via mmap-ing files");

DEFINE_bool(mmap_write, ROCKSDB_NAMESPACE::Options().allow_mmap_writes,
            "Allow writes to occur via mmap-ing files");

DEFINE_bool(use_direct_reads, ROCKSDB_NAMESPACE::Options().use_direct_reads,
            "Use O_DIRECT for reading data");

DEFINE_bool(use_direct_io_for_flush_and_compaction,
            ROCKSDB_NAMESPACE::Options().use_direct_io_for_flush_and_compaction,
            "Use O_DIRECT for background flush and compaction writes");

DEFINE_bool(advise_random_on_open,
            ROCKSDB_NAMESPACE::Options().advise_random_on_open,
            "Advise random access on table file open");
DEFINE_string(compaction_fadvice, "NORMAL",
              "Access pattern advice when a file is compacted");
static auto FLAGS_compaction_fadvice_e =
    ROCKSDB_NAMESPACE::Options().access_hint_on_compaction_start;

DEFINE_bool(use_tailing_iterator, false,
            "Use tailing iterator to access a series of keys instead of get");

DEFINE_bool(use_adaptive_mutex, ROCKSDB_NAMESPACE::Options().use_adaptive_mutex,
            "Use adaptive mutex");

DEFINE_uint64(bytes_per_sync, ROCKSDB_NAMESPACE::Options().bytes_per_sync,
              "Allows OS to incrementally sync SST files to disk while they are"
              " being written, in the background. Issue one request for every"
              " bytes_per_sync written. 0 turns it off.");

DEFINE_uint64(wal_bytes_per_sync,
              ROCKSDB_NAMESPACE::Options().wal_bytes_per_sync,
              "Allows OS to incrementally sync WAL files to disk while they are"
              " being written, in the background. Issue one request for every"
              " wal_bytes_per_sync written. 0 turns it off.");

DEFINE_bool(use_single_deletes, true,
            "Use single deletes (used in RandomReplaceKeys only).");

DEFINE_double(stddev, 2000.0,
              "Standard deviation of normal distribution used for picking keys"
              " (used in RandomReplaceKeys only).");

DEFINE_int32(key_id_range, 100000,
             "Range of possible value of key id (used in TimeSeries only).");

DEFINE_string(expire_style, "none",
              "Style to remove expired time entries. Can be one of the options "
              "below: none (do not expire data), compaction_filter (use a "
              "compaction filter to remove expired data), delete (seek IDs and "
              "remove expired data) (used in TimeSeries only).");

DEFINE_uint64(time_range, 100000,
              "Range of timestamps stored in the database (used in TimeSeries"
              " only).");

DEFINE_int32(num_deletion_threads, 1,
             "Number of threads to do deletion (used in TimeSeries and delete "
             "expire_style only).");

DEFINE_int32(max_successive_merges, 0,
             "Maximum number of successive merge operations on a key in the "
             "memtable");
static bool ValidatePrefixSize(const char* flagname, int32_t value) {
  if (value < 0 || value >= 2000000000) {
    fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n",
            flagname, value);
    return false;
  }
  return true;
}
DEFINE_int32(prefix_size, 0,
             "control the prefix size for HashSkipList and plain table");
DEFINE_int64(keys_per_prefix, 0,
             "control average number of keys generated per prefix, 0 means no "
             "special handling of the prefix, i.e. use the prefix that comes "
             "with the generated random number.");
DEFINE_bool(total_order_seek, false,
            "Enable total order seek regardless of index format.");
DEFINE_bool(prefix_same_as_start, false,
            "Enforce iterator to return keys with prefix same as seek key.");
DEFINE_bool(
    seek_missing_prefix, false,
    "Iterator seek to keys with non-existent prefixes. Requires prefix_size "
    "> 8");

DEFINE_int32(memtable_insert_with_hint_prefix_size, 0,
             "If non-zero, enable "
             "memtable insert with hint with the given prefix size.");
DEFINE_bool(enable_io_prio, false,
            "Lower the background flush/compaction threads' IO priority");
DEFINE_bool(enable_cpu_prio, false,
            "Lower the background flush/compaction threads' CPU priority");
DEFINE_bool(identity_as_first_hash, false,
            "The first hash function of cuckoo table becomes an identity "
            "function. This is only valid when key is 8 bytes");
DEFINE_bool(dump_malloc_stats, true, "Dump malloc stats in LOG");
DEFINE_uint64(stats_dump_period_sec,
              ROCKSDB_NAMESPACE::Options().stats_dump_period_sec,
              "Gap between printing stats to log in seconds");
DEFINE_uint64(stats_persist_period_sec,
              ROCKSDB_NAMESPACE::Options().stats_persist_period_sec,
              "Gap between persisting stats in seconds");
DEFINE_bool(persist_stats_to_disk,
            ROCKSDB_NAMESPACE::Options().persist_stats_to_disk,
            "Whether to persist stats to disk");
DEFINE_uint64(stats_history_buffer_size,
              ROCKSDB_NAMESPACE::Options().stats_history_buffer_size,
              "Max number of stats snapshots to keep in memory");
DEFINE_bool(avoid_flush_during_recovery,
            ROCKSDB_NAMESPACE::Options().avoid_flush_during_recovery,
            "If true, avoids flushing the recovered WAL data where possible.");
DEFINE_int64(multiread_stride, 0,
             "Stride length for the keys in a MultiGet batch");
DEFINE_bool(multiread_batched, false, "Use the new MultiGet API");
DEFINE_string(memtablerep, "skip_list", "");
DEFINE_int64(hash_bucket_count, 1024 * 1024, "hash bucket count");
DEFINE_bool(use_plain_table, false,
            "if true, use plain table instead of block-based table format");
DEFINE_bool(use_cuckoo_table, false, "if true, use cuckoo table format");
DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table.");
DEFINE_bool(use_hash_search, false,
            "if true, use kHashSearch instead of kBinarySearch. "
            "This is only valid with the block-based table format");
DEFINE_string(merge_operator, "",
              "The merge operator to use with the database. "
              "If a new merge operator is specified, be sure to use a fresh "
              "database. The possible merge operators are defined in "
              "utilities/merge_operators.h");
DEFINE_int32(skip_list_lookahead, 0,
             "Used with skip_list memtablerep; try linear search first for "
             "this many steps from the previous position");
DEFINE_bool(report_file_operations, false,
            "if true, report the number of file operations");
DEFINE_bool(report_open_timing, false, "if true, report DB open timing");
DEFINE_int32(readahead_size, 0, "Iterator readahead size");

DEFINE_bool(read_with_latest_user_timestamp, true,
            "If true, always use the current latest timestamp for read. If "
            "false, choose a random timestamp from the past.");
#ifndef ROCKSDB_LITE
DEFINE_string(secondary_cache_uri, "",
              "Full URI for creating a custom secondary cache object");
static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
#endif  // ROCKSDB_LITE

static const bool FLAGS_prefix_size_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);

static const bool FLAGS_key_size_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize);

static const bool FLAGS_cache_numshardbits_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_cache_numshardbits,
                          &ValidateCacheNumshardbits);

static const bool FLAGS_readwritepercent_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_readwritepercent, &ValidateInt32Percent);

DEFINE_int32(disable_seek_compaction, false,
             "Not used, left here for backwards compatibility");

DEFINE_bool(allow_data_in_errors,
            ROCKSDB_NAMESPACE::Options().allow_data_in_errors,
            "If true, allow logging data, e.g. key, value in LOG files.");

static const bool FLAGS_deletepercent_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_deletepercent, &ValidateInt32Percent);
static const bool FLAGS_table_cache_numshardbits_dummy
    __attribute__((__unused__)) = RegisterFlagValidator(
        &FLAGS_table_cache_numshardbits, &ValidateTableCacheNumshardbits);
DEFINE_uint32(write_batch_protection_bytes_per_key, 0,
              "Size of per-key-value checksum in each write batch. Currently "
              "only values 0 and 8 are supported.");

DEFINE_uint32(
    memtable_protection_bytes_per_key, 0,
    "Enable memtable per key-value checksum protection. "
    "Each entry in memtable will be suffixed by a per key-value checksum. "
    "This option determines the size of such checksums. "
    "Supported values: 0, 1, 2, 4, 8.");

DEFINE_bool(build_info, false,
            "Print the build info via GetRocksBuildInfoAsString");

DEFINE_bool(track_and_verify_wals_in_manifest, false,
            "If true, enable WAL tracking in the MANIFEST");
namespace ROCKSDB_NAMESPACE {
static Status CreateMemTableRepFactory(
    const ConfigOptions& config_options,
    std::shared_ptr<MemTableRepFactory>* factory) {
  Status s;
  if (!strcasecmp(FLAGS_memtablerep.c_str(), SkipListFactory::kNickName())) {
    factory->reset(new SkipListFactory(FLAGS_skip_list_lookahead));
#ifndef ROCKSDB_LITE
  } else if (!strcasecmp(FLAGS_memtablerep.c_str(), "prefix_hash")) {
    factory->reset(NewHashSkipListRepFactory(FLAGS_hash_bucket_count));
  } else if (!strcasecmp(FLAGS_memtablerep.c_str(),
                         VectorRepFactory::kNickName())) {
    factory->reset(new VectorRepFactory());
  } else if (!strcasecmp(FLAGS_memtablerep.c_str(), "hash_linkedlist")) {
    factory->reset(NewHashLinkListRepFactory(FLAGS_hash_bucket_count));
#endif  // ROCKSDB_LITE
  } else {
    std::unique_ptr<MemTableRepFactory> unique;
    s = MemTableRepFactory::CreateFromString(config_options, FLAGS_memtablerep,
                                             &unique);
    if (s.ok()) {
      factory->reset(unique.release());
    }
  }
  return s;
}
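
// Illustrative sketch (hypothetical caller, not taken from this file): the
// factory produced above is typically plugged into Options before opening
// the DB, e.g.:
//
//   ConfigOptions config_options;
//   std::shared_ptr<MemTableRepFactory> factory;
//   Status s = CreateMemTableRepFactory(config_options, &factory);
//   if (s.ok()) {
//     options.memtable_factory = factory;
//   }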
enum DistributionType : unsigned char { kFixed = 0, kUniform, kNormal };

static enum DistributionType FLAGS_value_size_distribution_type_e = kFixed;

static enum DistributionType StringToDistributionType(const char* ctype) {
  assert(ctype);

  if (!strcasecmp(ctype, "fixed"))
    return kFixed;
  else if (!strcasecmp(ctype, "uniform"))
    return kUniform;
  else if (!strcasecmp(ctype, "normal"))
    return kNormal;

  fprintf(stdout, "Cannot parse distribution type '%s'\n", ctype);
  exit(1);
}
class BaseDistribution {
 public:
  BaseDistribution(unsigned int _min, unsigned int _max)
      : min_value_size_(_min), max_value_size_(_max) {}
  virtual ~BaseDistribution() {}

  unsigned int Generate() {
    auto val = Get();
    if (NeedTruncate()) {
      val = std::max(min_value_size_, val);
      val = std::min(max_value_size_, val);
    }
    return val;
  }

 private:
  virtual unsigned int Get() = 0;
  virtual bool NeedTruncate() { return true; }
  unsigned int min_value_size_;
  unsigned int max_value_size_;
};
class FixedDistribution : public BaseDistribution {
 public:
  FixedDistribution(unsigned int size)
      : BaseDistribution(size, size), size_(size) {}

 private:
  virtual unsigned int Get() override { return size_; }
  virtual bool NeedTruncate() override { return false; }
  unsigned int size_;
};
class NormalDistribution : public BaseDistribution,
                           public std::normal_distribution<double> {
 public:
  NormalDistribution(unsigned int _min, unsigned int _max)
      : BaseDistribution(_min, _max),
        // 99.7% values within the range [min, max].
        std::normal_distribution<double>(
            (double)(_min + _max) / 2.0 /*mean*/,
            (double)(_max - _min) / 6.0 /*stddev*/),
        gen_(rd_()) {}

 private:
  virtual unsigned int Get() override {
    return static_cast<unsigned int>((*this)(gen_));
  }
  std::random_device rd_;
  std::mt19937 gen_;
};

class UniformDistribution : public BaseDistribution,
                            public std::uniform_int_distribution<unsigned int> {
 public:
  UniformDistribution(unsigned int _min, unsigned int _max)
      : BaseDistribution(_min, _max),
        std::uniform_int_distribution<unsigned int>(_min, _max),
        gen_(rd_()) {}

 private:
  virtual unsigned int Get() override { return (*this)(gen_); }
  virtual bool NeedTruncate() override { return false; }
  std::random_device rd_;
  std::mt19937 gen_;
};
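
// Illustrative sketch: picking value sizes from one of the distributions
// above. The NormalDistribution constructor maps [min, max] to a mean of
// (min + max) / 2 and a standard deviation of (max - min) / 6, so roughly
// 99.7% of draws land inside [min, max]; Generate() clamps the rare outliers.
//
//   std::unique_ptr<BaseDistribution> d(new NormalDistribution(100, 700));
//   unsigned int value_size = d->Generate();  // almost always in [100, 700]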
// Helper for quickly generating random data.
class RandomGenerator {
 private:
  std::string data_;
  unsigned int pos_;
  std::unique_ptr<BaseDistribution> dist_;

 public:
  RandomGenerator() {
    auto max_value_size = FLAGS_value_size_max;
    switch (FLAGS_value_size_distribution_type_e) {
      case kUniform:
        dist_.reset(new UniformDistribution(FLAGS_value_size_min,
                                            FLAGS_value_size_max));
        break;
      case kNormal:
        dist_.reset(
            new NormalDistribution(FLAGS_value_size_min, FLAGS_value_size_max));
        break;
      case kFixed:
      default:
        dist_.reset(new FixedDistribution(value_size));
        max_value_size = value_size;
    }
    // We use a limited amount of data over and over again and ensure
    // that it is larger than the compression window (32KB), and also
    // large enough to serve all typical value sizes we want to write.
    Random rnd(301);
    std::string piece;
    while (data_.size() < (unsigned)std::max(1048576, max_value_size)) {
      // Add a short fragment that is as compressible as specified
      // by FLAGS_compression_ratio.
      test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
      data_.append(piece);
    }
    pos_ = 0;
  }

  Slice Generate(unsigned int len) {
    assert(len <= data_.size());
    if (pos_ + len > data_.size()) {
      pos_ = 0;
    }
    pos_ += len;
    return Slice(data_.data() + pos_ - len, len);
  }

  Slice Generate() {
    auto len = dist_->Generate();
    return Generate(len);
  }
};
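
// Illustrative sketch: RandomGenerator hands out overlapping windows of a
// single pre-built compressible buffer, so producing a value is just pointer
// arithmetic rather than fresh random generation on every write:
//
//   RandomGenerator gen;
//   Slice value = gen.Generate(1024);  // a 1 KB slice of the shared buffer
//
// Reusing one buffer keeps value generation off the benchmark's critical
// path while FLAGS_compression_ratio still controls how compressible the
// bytes are.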
static void AppendWithSpace(std::string* str, Slice msg) {
  if (msg.empty()) return;
  if (!str->empty()) {
    str->push_back(' ');
  }
  str->append(msg.data(), msg.size());
}
struct DBWithColumnFamilies {
  std::vector<ColumnFamilyHandle*> cfh;
  DB* db;
#ifndef ROCKSDB_LITE
  OptimisticTransactionDB* opt_txn_db;
#endif  // ROCKSDB_LITE
  std::atomic<size_t> num_created;  // Need to be updated after all the
                                    // new entries in cfh are set.
  size_t num_hot;  // Number of column families to be queried at each moment.
                   // After each CreateNewCf(), another num_hot number of new
                   // Column families will be created and used to be queried.
  port::Mutex create_cf_mutex;  // Only one thread can execute CreateNewCf()
  std::vector<int> cfh_idx_to_prob;  // ith index holds probability of operating
                                     // on cfh[i].

  DBWithColumnFamilies()
      : db(nullptr)
#ifndef ROCKSDB_LITE
        ,
        opt_txn_db(nullptr)
#endif  // ROCKSDB_LITE
  {
    cfh.clear();
    num_created = 0;
    num_hot = 0;
  }

  DBWithColumnFamilies(const DBWithColumnFamilies& other)
      : cfh(other.cfh),
        db(other.db),
#ifndef ROCKSDB_LITE
        opt_txn_db(other.opt_txn_db),
#endif  // ROCKSDB_LITE
        num_created(other.num_created.load()),
        num_hot(other.num_hot),
        cfh_idx_to_prob(other.cfh_idx_to_prob) {
  }

  void DeleteDBs() {
    std::for_each(cfh.begin(), cfh.end(),
                  [](ColumnFamilyHandle* cfhi) { delete cfhi; });
    cfh.clear();
#ifndef ROCKSDB_LITE
    if (opt_txn_db) {
      delete opt_txn_db;
      opt_txn_db = nullptr;
    } else {
      delete db;
      db = nullptr;
    }
#else
    delete db;
    db = nullptr;
#endif  // ROCKSDB_LITE
  }
  ColumnFamilyHandle* GetCfh(int64_t rand_num) {
    assert(num_hot > 0);
    size_t rand_offset = 0;
    if (!cfh_idx_to_prob.empty()) {
      assert(cfh_idx_to_prob.size() == num_hot);
      int sum = 0;
      while (sum + cfh_idx_to_prob[rand_offset] < rand_num % 100) {
        sum += cfh_idx_to_prob[rand_offset];
        ++rand_offset;
      }
      assert(rand_offset < cfh_idx_to_prob.size());
    } else {
      rand_offset = rand_num % num_hot;
    }
    return cfh[num_created.load(std::memory_order_acquire) - num_hot +
               rand_offset];
  }
  // stage: assume CFs from 0 to stage * num_hot have been created. Need to
  // create stage * num_hot + 1 to stage * (num_hot + 1).
  void CreateNewCf(ColumnFamilyOptions options, int64_t stage) {
    MutexLock l(&create_cf_mutex);
    if ((stage + 1) * num_hot <= num_created) {
      // Already created.
      return;
    }
    auto new_num_created = num_created + num_hot;
    assert(new_num_created <= cfh.size());
    for (size_t i = num_created; i < new_num_created; i++) {
      Status s =
          db->CreateColumnFamily(options, ColumnFamilyName(i), &(cfh[i]));
      if (!s.ok()) {
        fprintf(stderr, "create column family error: %s\n",
                s.ToString().c_str());
        abort();
      }
    }
    num_created.store(new_num_created, std::memory_order_release);
  }
};
// A class that reports stats to CSV file.
class ReporterAgent {
 public:
  ReporterAgent(Env* env, const std::string& fname,
                uint64_t report_interval_secs)
      : env_(env),
        total_ops_done_(0),
        last_report_(0),
        report_interval_secs_(report_interval_secs),
        stop_(false) {
    auto s = env_->NewWritableFile(fname, &report_file_, EnvOptions());
    if (s.ok()) {
      s = report_file_->Append(Header() + "\n");
    }
    if (s.ok()) {
      s = report_file_->Flush();
    }
    if (!s.ok()) {
      fprintf(stderr, "Can't open %s: %s\n", fname.c_str(),
              s.ToString().c_str());
      abort();
    }
    reporting_thread_ = port::Thread([&]() { SleepAndReport(); });
  }

  ~ReporterAgent() {
    {
      std::unique_lock<std::mutex> lk(mutex_);
      stop_ = true;
      stop_cv_.notify_all();
    }
    reporting_thread_.join();
  }

  // Thread-safe.
  void ReportFinishedOps(int64_t num_ops) {
    total_ops_done_.fetch_add(num_ops);
  }

 private:
  std::string Header() const { return "secs_elapsed,interval_qps"; }
  void SleepAndReport() {
    auto* clock = env_->GetSystemClock().get();
    auto time_started = clock->NowMicros();
    while (true) {
      {
        std::unique_lock<std::mutex> lk(mutex_);
        if (stop_ ||
            stop_cv_.wait_for(lk, std::chrono::seconds(report_interval_secs_),
                              [&]() { return stop_; })) {
          // stopping
          break;
        }
        // else -> timeout, which means time for a report!
      }
      auto total_ops_done_snapshot = total_ops_done_.load();
      // round the seconds elapsed
      auto secs_elapsed =
          (clock->NowMicros() - time_started + kMicrosInSecond / 2) /
          kMicrosInSecond;
      std::string report =
          std::to_string(secs_elapsed) + "," +
          std::to_string(total_ops_done_snapshot - last_report_) + "\n";
      auto s = report_file_->Append(report);
      if (s.ok()) {
        s = report_file_->Flush();
      }
      if (!s.ok()) {
        fprintf(stderr,
                "Can't write to report file (%s), stopping the reporting\n",
                s.ToString().c_str());
        break;
      }
      last_report_ = total_ops_done_snapshot;
    }
  }

  Env* env_;
  std::unique_ptr<WritableFile> report_file_;
  std::atomic<int64_t> total_ops_done_;
  int64_t last_report_;
  const uint64_t report_interval_secs_;
  ROCKSDB_NAMESPACE::port::Thread reporting_thread_;
  std::mutex mutex_;
  // will notify on stop
  std::condition_variable stop_cv_;
  bool stop_;
};
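
// Illustrative sketch: when db_bench is started with --report_file and
// --report_interval_seconds (flags defined elsewhere in this file), the agent
// above appends one CSV row per interval, e.g. (hypothetical numbers):
//
//   secs_elapsed,interval_qps
//   1,152390
//   2,148722
//
// interval_qps is the delta of finished ops since the previous row.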
enum OperationType : unsigned char {
  kRead = 0,
  kWrite,
  kDelete,
  kSeek,
  kMerge,
  kUpdate,
  kCompress,
  kUncompress,
  kCrc,
  kHash,
  kOthers
};

static std::unordered_map<OperationType, std::string, std::hash<unsigned char>>
    OperationTypeString = {{kRead, "read"},
                           {kWrite, "write"},
                           {kDelete, "delete"},
                           {kSeek, "seek"},
                           {kMerge, "merge"},
                           {kUpdate, "update"},
                           {kCompress, "compress"},
                           {kUncompress, "uncompress"},
                           {kCrc, "crc"},
                           {kHash, "hash"},
                           {kOthers, "op"}};
class CombinedStats;

class Stats {
 private:
  SystemClock* clock_;
  int id_;
  uint64_t start_ = 0;
  uint64_t sine_interval_;
  uint64_t finish_;
  double seconds_;
  uint64_t done_;
  uint64_t last_report_done_;
  uint64_t next_report_;
  uint64_t bytes_;
  uint64_t last_op_finish_;
  uint64_t last_report_finish_;
  std::unordered_map<OperationType, std::shared_ptr<HistogramImpl>,
                     std::hash<unsigned char>>
      hist_;
  std::string message_;
  bool exclude_from_merge_;
  ReporterAgent* reporter_agent_;  // does not own
  friend class CombinedStats;

 public:
  Stats() : clock_(FLAGS_env->GetSystemClock().get()) { Start(-1); }

  void SetReporterAgent(ReporterAgent* reporter_agent) {
    reporter_agent_ = reporter_agent;
  }

  void Start(int id) {
    id_ = id;
    next_report_ = FLAGS_stats_interval ? FLAGS_stats_interval : 100;
    last_op_finish_ = start_;
    hist_.clear();
    done_ = 0;
    last_report_done_ = 0;
    bytes_ = 0;
    seconds_ = 0;
    start_ = clock_->NowMicros();
    sine_interval_ = clock_->NowMicros();
    finish_ = start_;
    last_report_finish_ = start_;
    message_.clear();
    // When set, stats from this thread won't be merged with others.
    exclude_from_merge_ = false;
  }
  void Merge(const Stats& other) {
    if (other.exclude_from_merge_) return;

    for (auto it = other.hist_.begin(); it != other.hist_.end(); ++it) {
      auto this_it = hist_.find(it->first);
      if (this_it != hist_.end()) {
        this_it->second->Merge(*(other.hist_.at(it->first)));
      } else {
        hist_.insert({it->first, it->second});
      }
    }

    done_ += other.done_;
    bytes_ += other.bytes_;
    seconds_ += other.seconds_;
    if (other.start_ < start_) start_ = other.start_;
    if (other.finish_ > finish_) finish_ = other.finish_;

    // Just keep the messages from one thread.
    if (message_.empty()) message_ = other.message_;
  }

  void Stop() {
    finish_ = clock_->NowMicros();
    seconds_ = (finish_ - start_) * 1e-6;
  }
  void AddMessage(Slice msg) { AppendWithSpace(&message_, msg); }

  void SetId(int id) { id_ = id; }
  void SetExcludeFromMerge() { exclude_from_merge_ = true; }
  void PrintThreadStatus() {
    std::vector<ThreadStatus> thread_list;
    FLAGS_env->GetThreadList(&thread_list);

    fprintf(stderr, "\n%18s %10s %12s %20s %13s %45s %12s %s\n", "ThreadID",
            "ThreadType", "cfName", "Operation", "ElapsedTime", "Stage",
            "State", "OperationProperties");

    int64_t current_time = 0;
    clock_->GetCurrentTime(&current_time).PermitUncheckedError();
    for (auto ts : thread_list) {
      fprintf(stderr, "%18" PRIu64 " %10s %12s %20s %13s %45s %12s",
              ts.thread_id,
              ThreadStatus::GetThreadTypeName(ts.thread_type).c_str(),
              ts.cf_name.c_str(),
              ThreadStatus::GetOperationName(ts.operation_type).c_str(),
              ThreadStatus::MicrosToString(ts.op_elapsed_micros).c_str(),
              ThreadStatus::GetOperationStageName(ts.operation_stage).c_str(),
              ThreadStatus::GetStateName(ts.state_type).c_str());

      auto op_properties = ThreadStatus::InterpretOperationProperties(
          ts.operation_type, ts.op_properties);
      for (const auto& op_prop : op_properties) {
        fprintf(stderr, " %s %" PRIu64 " |", op_prop.first.c_str(),
                op_prop.second);
      }
      fprintf(stderr, "\n");
    }
  }
  void ResetSineInterval() { sine_interval_ = clock_->NowMicros(); }

  uint64_t GetSineInterval() { return sine_interval_; }

  uint64_t GetStart() { return start_; }

  void ResetLastOpTime() {
    // Set to now to avoid latency from calls to SleepForMicroseconds.
    last_op_finish_ = clock_->NowMicros();
  }
  void FinishedOps(DBWithColumnFamilies* db_with_cfh, DB* db, int64_t num_ops,
                   enum OperationType op_type = kOthers) {
    if (reporter_agent_) {
      reporter_agent_->ReportFinishedOps(num_ops);
    }
    if (FLAGS_histogram) {
      uint64_t now = clock_->NowMicros();
      uint64_t micros = now - last_op_finish_;

      if (hist_.find(op_type) == hist_.end()) {
        auto hist_temp = std::make_shared<HistogramImpl>();
        hist_.insert({op_type, std::move(hist_temp)});
      }
      hist_[op_type]->Add(micros);

      if (micros >= FLAGS_slow_usecs && !FLAGS_stats_interval) {
        fprintf(stderr, "long op: %" PRIu64 " micros%30s\r", micros, "");
      }
      last_op_finish_ = now;
    }

    done_ += num_ops;
    if (done_ >= next_report_ && FLAGS_progress_reports) {
      if (!FLAGS_stats_interval) {
        if (next_report_ < 1000)
          next_report_ += 100;
        else if (next_report_ < 5000)
          next_report_ += 500;
        else if (next_report_ < 10000)
          next_report_ += 1000;
        else if (next_report_ < 50000)
          next_report_ += 5000;
        else if (next_report_ < 100000)
          next_report_ += 10000;
        else if (next_report_ < 500000)
          next_report_ += 50000;
        else
          next_report_ += 100000;
        fprintf(stderr, "... finished %" PRIu64 " ops%30s\r", done_, "");
      } else {
        uint64_t now = clock_->NowMicros();
        int64_t usecs_since_last = now - last_report_finish_;

        // Determine whether to print status where interval is either
        // each N operations or each N seconds.

        if (FLAGS_stats_interval_seconds &&
            usecs_since_last < (FLAGS_stats_interval_seconds * 1000000)) {
          // Don't check again for this many operations.
          next_report_ += FLAGS_stats_interval;
        } else {
          fprintf(stderr,
                  "%s ... thread %d: (%" PRIu64 ",%" PRIu64
                  ") ops and "
                  "(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n",
                  clock_->TimeToString(now / 1000000).c_str(), id_,
                  done_ - last_report_done_, done_,
                  (done_ - last_report_done_) / (usecs_since_last / 1000000.0),
                  done_ / ((now - start_) / 1000000.0),
                  (now - last_report_finish_) / 1000000.0,
                  (now - start_) / 1000000.0);

          if (id_ == 0 && FLAGS_stats_per_interval) {
            std::string stats;

            if (db_with_cfh && db_with_cfh->num_created.load()) {
              for (size_t i = 0; i < db_with_cfh->num_created.load(); ++i) {
                if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats",
                                    &stats))
                  fprintf(stderr, "%s\n", stats.c_str());
                if (FLAGS_show_table_properties) {
                  for (int level = 0; level < FLAGS_num_levels; ++level) {
                    if (db->GetProperty(
                            db_with_cfh->cfh[i],
                            "rocksdb.aggregated-table-properties-at-level" +
                                std::to_string(level),
                            &stats)) {
                      if (stats.find("# entries=0") == std::string::npos) {
                        fprintf(stderr, "Level[%d]: %s\n", level,
                                stats.c_str());
                      }
                    }
                  }
                }
              }
            } else if (db) {
              if (db->GetProperty("rocksdb.stats", &stats)) {
                fprintf(stderr, "%s", stats.c_str());
              }
              if (db->GetProperty("rocksdb.num-running-compactions", &stats)) {
                fprintf(stderr, "num-running-compactions: %s\n", stats.c_str());
              }
              if (db->GetProperty("rocksdb.num-running-flushes", &stats)) {
                fprintf(stderr, "num-running-flushes: %s\n\n", stats.c_str());
              }
              if (FLAGS_show_table_properties) {
                for (int level = 0; level < FLAGS_num_levels; ++level) {
                  if (db->GetProperty(
                          "rocksdb.aggregated-table-properties-at-level" +
                              std::to_string(level),
                          &stats)) {
                    if (stats.find("# entries=0") == std::string::npos) {
                      fprintf(stderr, "Level[%d]: %s\n", level, stats.c_str());
                    }
                  }
                }
              }
            }
          }

          next_report_ += FLAGS_stats_interval;
          last_report_finish_ = now;
          last_report_done_ = done_;
        }
      }
      if (id_ == 0 && FLAGS_thread_status_per_interval) {
        PrintThreadStatus();
      }
      fflush(stderr);
    }
  }
  void AddBytes(int64_t n) { bytes_ += n; }

  void Report(const Slice& name) {
    // Pretend at least one op was done in case we are running a benchmark
    // that does not call FinishedOps().
    if (done_ < 1) done_ = 1;

    std::string extra;
    double elapsed = (finish_ - start_) * 1e-6;
    if (bytes_ > 0) {
      // Rate is computed on actual elapsed time, not the sum of per-thread
      // elapsed times.
      char rate[100];
      snprintf(rate, sizeof(rate), "%6.1f MB/s",
               (bytes_ / 1048576.0) / elapsed);
      extra = rate;
    }
    AppendWithSpace(&extra, message_);
    double throughput = (double)done_ / elapsed;

    fprintf(stdout,
            "%-12s : %11.3f micros/op %ld ops/sec %.3f seconds %" PRIu64
            " operations;%s%s\n",
            name.ToString().c_str(), seconds_ * 1e6 / done_, (long)throughput,
            elapsed, done_, (extra.empty() ? "" : " "), extra.c_str());
    if (FLAGS_histogram) {
      for (auto it = hist_.begin(); it != hist_.end(); ++it) {
        fprintf(stdout, "Microseconds per %s:\n%s\n",
                OperationTypeString[it->first].c_str(),
                it->second->ToString().c_str());
      }
    }
    if (FLAGS_report_file_operations) {
      auto* counted_fs =
          FLAGS_env->GetFileSystem()->CheckedCast<CountedFileSystem>();
      assert(counted_fs);
      fprintf(stdout, "%s", counted_fs->PrintCounters().c_str());
      counted_fs->ResetCounters();
    }
    fflush(stdout);
  }
};
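
// Illustrative sample of the summary line Report() prints, matching the
// format string above (all numbers hypothetical; shown wrapped here, printed
// as one line):
//
//   fillseq      :       2.500 micros/op 400000 ops/sec 10.000 seconds
//       4000000 operations;  390.6 MB/s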
class CombinedStats {
 public:
  void AddStats(const Stats& stat) {
    uint64_t total_ops = stat.done_;
    uint64_t total_bytes_ = stat.bytes_;
    double elapsed;

    if (total_ops < 1) {
      total_ops = 1;
    }

    elapsed = (stat.finish_ - stat.start_) * 1e-6;
    throughput_ops_.emplace_back(total_ops / elapsed);

    if (total_bytes_ > 0) {
      double mbs = (total_bytes_ / 1048576.0);
      throughput_mbs_.emplace_back(mbs / elapsed);
    }
  }

  void Report(const std::string& bench_name) {
    if (throughput_ops_.size() < 2) {
      // skip if there are not enough samples
      return;
    }

    const char* name = bench_name.c_str();
    int num_runs = static_cast<int>(throughput_ops_.size());

    if (throughput_mbs_.size() == throughput_ops_.size()) {
      // \xC2\xB1 is +/- character in UTF-8
      fprintf(stdout,
              "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec; %6.1f (\xC2\xB1 "
              "%.1f) MB/sec\n",
              name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
              static_cast<int>(CalcConfidence95(throughput_ops_)),
              CalcAvg(throughput_mbs_), CalcConfidence95(throughput_mbs_));
    } else {
      fprintf(stdout, "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec\n", name,
              num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
              static_cast<int>(CalcConfidence95(throughput_ops_)));
    }
  }

  void ReportWithConfidenceIntervals(const std::string& bench_name) {
    if (throughput_ops_.size() < 2) {
      // skip if there are not enough samples
      return;
    }

    const char* name = bench_name.c_str();
    int num_runs = static_cast<int>(throughput_ops_.size());

    int ops_avg = static_cast<int>(CalcAvg(throughput_ops_));
    int ops_confidence_95 = static_cast<int>(CalcConfidence95(throughput_ops_));

    if (throughput_mbs_.size() == throughput_ops_.size()) {
      double mbs_avg = CalcAvg(throughput_mbs_);
      double mbs_confidence_95 = CalcConfidence95(throughput_mbs_);
      fprintf(stdout,
              "%s [CI95 %d runs] : (%d, %d) ops/sec; (%.1f, %.1f) MB/sec\n",
              name, num_runs, ops_avg - ops_confidence_95,
              ops_avg + ops_confidence_95, mbs_avg - mbs_confidence_95,
              mbs_avg + mbs_confidence_95);
    } else {
      fprintf(stdout, "%s [CI95 %d runs] : (%d, %d) ops/sec\n", name, num_runs,
              ops_avg - ops_confidence_95, ops_avg + ops_confidence_95);
    }
  }

  void ReportFinal(const std::string& bench_name) {
    if (throughput_ops_.size() < 2) {
      // skip if there are not enough samples
      return;
    }

    const char* name = bench_name.c_str();
    int num_runs = static_cast<int>(throughput_ops_.size());

    if (throughput_mbs_.size() == throughput_ops_.size()) {
      // \xC2\xB1 is +/- character in UTF-8
      fprintf(stdout,
              "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec; %6.1f (\xC2\xB1 "
              "%.1f) MB/sec\n"
              "%s [MEDIAN %d runs] : %d ops/sec; %6.1f MB/sec\n",
              name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
              static_cast<int>(CalcConfidence95(throughput_ops_)),
              CalcAvg(throughput_mbs_), CalcConfidence95(throughput_mbs_), name,
              num_runs, static_cast<int>(CalcMedian(throughput_ops_)),
              CalcMedian(throughput_mbs_));
    } else {
      fprintf(stdout,
              "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec\n"
              "%s [MEDIAN %d runs] : %d ops/sec\n",
              name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
              static_cast<int>(CalcConfidence95(throughput_ops_)), name,
              num_runs, static_cast<int>(CalcMedian(throughput_ops_)));
    }
  }
 private:
  double CalcAvg(std::vector<double>& data) {
    double avg = 0;
    for (double x : data) {
      avg += x;
    }
    avg = avg / data.size();
    return avg;
  }

  // Calculates 95% CI assuming a normal distribution of samples.
  // Samples are not from a normal distribution, but it still
  // provides useful approximation.
  double CalcConfidence95(std::vector<double>& data) {
    assert(data.size() > 1);
    double avg = CalcAvg(data);
    double std_error = CalcStdDev(data, avg) / std::sqrt(data.size());

    // Z score for the 97.5 percentile
    // see https://en.wikipedia.org/wiki/1.96
    return 1.959964 * std_error;
  }

  double CalcMedian(std::vector<double>& data) {
    assert(data.size() > 0);
    std::sort(data.begin(), data.end());

    size_t mid = data.size() / 2;
    if (data.size() % 2 == 1) {
      // Odd number of entries
      return data[mid];
    } else {
      // Even number of entries
      return (data[mid] + data[mid - 1]) / 2;
    }
  }

  double CalcStdDev(std::vector<double>& data, double average) {
    assert(data.size() > 1);
    double squared_sum = 0.0;
    for (double x : data) {
      squared_sum += std::pow(x - average, 2);
    }

    // using samples count - 1 following Bessel's correction
    // see https://en.wikipedia.org/wiki/Bessel%27s_correction
    return std::sqrt(squared_sum / (data.size() - 1));
  }

  std::vector<double> throughput_ops_;
  std::vector<double> throughput_mbs_;
};
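
// Worked example (hypothetical numbers) for the CI math above: for five runs
// with throughputs {100000, 102000, 98000, 101000, 99000} ops/sec, the mean
// is 100000 and the sample standard deviation (with Bessel's correction) is
// ~1581. The standard error is 1581 / sqrt(5) ~= 707, so the reported 95%
// interval is 100000 +/- 1.96 * 707 ~= +/- 1386 ops/sec.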
class TimestampEmulator {
 private:
  std::atomic<uint64_t> timestamp_;

 public:
  TimestampEmulator() : timestamp_(0) {}
  uint64_t Get() const { return timestamp_.load(); }
  void Inc() { timestamp_++; }
  Slice Allocate(char* scratch) {
    // TODO: support larger timestamp sizes
    assert(FLAGS_user_timestamp_size == 8);
    uint64_t ts = timestamp_.fetch_add(1);
    EncodeFixed64(scratch, ts);
    return Slice(scratch, FLAGS_user_timestamp_size);
  }
  Slice GetTimestampForRead(Random64& rand, char* scratch) {
    assert(FLAGS_user_timestamp_size == 8);
    if (FLAGS_read_with_latest_user_timestamp) {
      return Allocate(scratch);
    }
    // Choose a random timestamp from the past.
    uint64_t ts = rand.Next() % Get();
    EncodeFixed64(scratch, ts);
    return Slice(scratch, FLAGS_user_timestamp_size);
  }
};
// State shared by all concurrent executions of the same benchmark.
struct SharedState {
  port::Mutex mu;
  port::CondVar cv;
  int total;
  int perf_level;
  std::shared_ptr<RateLimiter> write_rate_limiter;
  std::shared_ptr<RateLimiter> read_rate_limiter;

  // Each thread goes through the following states:
  //    (1) initializing
  //    (2) waiting for others to be initialized
  //    (3) running
  //    (4) done

  long num_initialized;
  long num_done;
  bool start;

  SharedState() : cv(&mu), perf_level(FLAGS_perf_level) {}
};

// Per-thread state for concurrent executions of the same benchmark.
struct ThreadState {
  int tid;        // 0..n-1 when running in n threads
  Random64 rand;  // Has different seeds for different threads
  Stats stats;
  SharedState* shared;

  explicit ThreadState(int index, int my_seed)
      : tid(index), rand(seed_base + my_seed) {}
};
class Duration {
 public:
  Duration(uint64_t max_seconds, int64_t max_ops, int64_t ops_per_stage = 0) {
    max_seconds_ = max_seconds;
    max_ops_ = max_ops;
    ops_per_stage_ = (ops_per_stage > 0) ? ops_per_stage : max_ops;
    ops_ = 0;
    start_at_ = FLAGS_env->NowMicros();
  }

  int64_t GetStage() { return std::min(ops_, max_ops_ - 1) / ops_per_stage_; }

  bool Done(int64_t increment) {
    if (increment <= 0) increment = 1;  // avoid Done(0) and infinite loops
    ops_ += increment;

    if (max_seconds_) {
      // Recheck every approx 1000 ops (exact iff increment is factor of 1000)
      auto granularity = FLAGS_ops_between_duration_checks;
      if ((ops_ / granularity) != ((ops_ - increment) / granularity)) {
        uint64_t now = FLAGS_env->NowMicros();
        return ((now - start_at_) / 1000000) >= max_seconds_;
      } else {
        return false;
      }
    } else {
      return ops_ > max_ops_;
    }
  }

 private:
  uint64_t max_seconds_;
  int64_t max_ops_;
  int64_t ops_per_stage_;
  int64_t ops_;
  uint64_t start_at_;
};
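
// Illustrative sketch: a benchmark loop bounded either by time or by op
// count, matching how Duration is used throughout this file:
//
//   Duration duration(FLAGS_duration, num_ops);  // seconds take precedence
//   while (!duration.Done(1)) {
//     // ... perform one operation ...
//   }
//
// When max_seconds is non-zero the clock is only consulted roughly every
// FLAGS_ops_between_duration_checks ops, keeping NowMicros() off the hot path.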
class Benchmark {
 private:
  std::shared_ptr<Cache> cache_;
  std::shared_ptr<Cache> compressed_cache_;
  std::shared_ptr<const SliceTransform> prefix_extractor_;
  DBWithColumnFamilies db_;
  std::vector<DBWithColumnFamilies> multi_dbs_;
  int64_t num_;
  int key_size_;
  int user_timestamp_size_;
  int prefix_size_;
  int total_thread_count_;
  int64_t keys_per_prefix_;
  int64_t entries_per_batch_;
  int64_t writes_before_delete_range_;
  int64_t writes_per_range_tombstone_;
  int64_t range_tombstone_width_;
  int64_t max_num_range_tombstones_;
  ReadOptions read_options_;
  WriteOptions write_options_;
  Options open_options_;  // keep options around to properly destroy db later
#ifndef ROCKSDB_LITE
  TraceOptions trace_options_;
  TraceOptions block_cache_trace_options_;
#endif  // ROCKSDB_LITE
  int64_t reads_;
  int64_t deletes_;
  double read_random_exp_range_;
  int64_t writes_;
  int64_t readwrites_;
  int64_t merge_keys_;
  bool report_file_operations_;
  bool use_blob_db_;    // Stacked BlobDB
  bool read_operands_;  // read via GetMergeOperands()
  std::vector<std::string> keys_;
  class ErrorHandlerListener : public EventListener {
   public:
#ifndef ROCKSDB_LITE
    ErrorHandlerListener()
        : mutex_(),
          cv_(&mutex_),
          no_auto_recovery_(false),
          recovery_complete_(false) {}

    ~ErrorHandlerListener() override {}

    const char* Name() const override { return kClassName(); }
    static const char* kClassName() { return "ErrorHandlerListener"; }

    void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/,
                              Status /*bg_error*/,
                              bool* auto_recovery) override {
      if (*auto_recovery && no_auto_recovery_) {
        *auto_recovery = false;
      }
    }

    void OnErrorRecoveryCompleted(Status /*old_bg_error*/) override {
      InstrumentedMutexLock l(&mutex_);
      recovery_complete_ = true;
      cv_.SignalAll();
    }

    bool WaitForRecovery(uint64_t abs_time_us) {
      InstrumentedMutexLock l(&mutex_);
      if (!recovery_complete_) {
        cv_.TimedWait(abs_time_us);
      }
      if (recovery_complete_) {
        recovery_complete_ = false;
        return true;
      }
      return false;
    }

    void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; }

   private:
    InstrumentedMutex mutex_;
    InstrumentedCondVar cv_;
    bool no_auto_recovery_;
    bool recovery_complete_;
#else   // ROCKSDB_LITE
    bool WaitForRecovery(uint64_t /*abs_time_us*/) { return true; }
    void EnableAutoRecovery(bool /*enable*/) {}
#endif  // ROCKSDB_LITE
  };
  std::shared_ptr<ErrorHandlerListener> listener_;

  std::unique_ptr<TimestampEmulator> mock_app_clock_;

  bool SanityCheck() {
    if (FLAGS_compression_ratio > 1) {
      fprintf(stderr, "compression_ratio should be between 0 and 1\n");
      return false;
    }
    return true;
  }

  inline bool CompressSlice(const CompressionInfo& compression_info,
                            const Slice& input, std::string* compressed) {
    constexpr uint32_t compress_format_version = 2;

    return CompressData(input, compression_info, compress_format_version,
                        compressed);
  }
  void PrintHeader(const Options& options) {
    PrintEnvironment();
    fprintf(stdout,
            "Keys: %d bytes each (+ %d bytes user-defined timestamp)\n",
            FLAGS_key_size, FLAGS_user_timestamp_size);
    auto avg_value_size = FLAGS_value_size;
    if (FLAGS_value_size_distribution_type_e == kFixed) {
      fprintf(stdout,
              "Values: %d bytes each (%d bytes after compression)\n",
              avg_value_size,
              static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
    } else {
      avg_value_size = (FLAGS_value_size_min + FLAGS_value_size_max) / 2;
      fprintf(stdout,
              "Values: %d avg bytes each (%d bytes after compression)\n",
              avg_value_size,
              static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
      fprintf(stdout, "Values Distribution: %s (min: %d, max: %d)\n",
              FLAGS_value_size_distribution_type.c_str(), FLAGS_value_size_min,
              FLAGS_value_size_max);
    }
    fprintf(stdout, "Entries: %" PRIu64 "\n", num_);
    fprintf(stdout, "Prefix: %d bytes\n", FLAGS_prefix_size);
    fprintf(stdout, "Keys per prefix: %" PRIu64 "\n", keys_per_prefix_);
    fprintf(stdout, "RawSize: %.1f MB (estimated)\n",
            ((static_cast<int64_t>(FLAGS_key_size + avg_value_size) * num_) /
             1048576.0));
    fprintf(
        stdout, "FileSize: %.1f MB (estimated)\n",
        (((FLAGS_key_size + avg_value_size * FLAGS_compression_ratio) * num_) /
         1048576.0));
    fprintf(stdout, "Write rate: %" PRIu64 " bytes/second\n",
            FLAGS_benchmark_write_rate_limit);
    fprintf(stdout, "Read rate: %" PRIu64 " ops/second\n",
            FLAGS_benchmark_read_rate_limit);
    if (FLAGS_enable_numa) {
      fprintf(stderr, "Running in NUMA enabled mode.\n");
#ifndef NUMA
      fprintf(stderr, "NUMA is not defined in the system.\n");
      exit(1);
#else
      if (numa_available() == -1) {
        fprintf(stderr, "NUMA is not supported by the system.\n");
        exit(1);
      }
#endif
    }

    auto compression = CompressionTypeToString(FLAGS_compression_type_e);
    fprintf(stdout, "Compression: %s\n", compression.c_str());
    fprintf(stdout, "Compression sampling rate: %" PRId64 "\n",
            FLAGS_sample_for_compression);
    if (options.memtable_factory != nullptr) {
      fprintf(stdout, "Memtablerep: %s\n",
              options.memtable_factory->GetId().c_str());
    }
    fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);

    PrintWarnings(compression.c_str());
    fprintf(stdout, "------------------------------------------------\n");
  }
  void PrintWarnings(const char* compression) {
#if defined(__GNUC__) && !defined(__OPTIMIZE__)
    fprintf(
        stdout,
        "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n");
#endif
#ifndef NDEBUG
    fprintf(stdout,
            "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
#endif
    if (FLAGS_compression_type_e != ROCKSDB_NAMESPACE::kNoCompression) {
      // The test string should not be too small.
      const int len = FLAGS_block_size;
      std::string input_str(len, 'y');
      std::string compressed;
      CompressionOptions opts;
      CompressionContext context(FLAGS_compression_type_e);
      CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
                           FLAGS_compression_type_e,
                           FLAGS_sample_for_compression);
      bool result = CompressSlice(info, Slice(input_str), &compressed);

      if (!result) {
        fprintf(stdout, "WARNING: %s compression is not enabled\n",
                compression);
      } else if (compressed.size() >= input_str.size()) {
        fprintf(stdout, "WARNING: %s compression is not effective\n",
                compression);
      }
    }
  }
// Currently, the following isn't equivalent to OS_LINUX.
#if defined(__linux)
  static Slice TrimSpace(Slice s) {
    unsigned int start = 0;
    while (start < s.size() && isspace(s[start])) {
      start++;
    }
    unsigned int limit = static_cast<unsigned int>(s.size());
    while (limit > start && isspace(s[limit - 1])) {
      limit--;
    }
    return Slice(s.data() + start, limit - start);
  }
#endif
  void PrintEnvironment() {
    fprintf(stderr, "RocksDB: version %s\n",
            GetRocksVersionAsString(true).c_str());

#if defined(__linux) || defined(__APPLE__) || defined(__FreeBSD__)
    time_t now = time(nullptr);
    char buf[52];
    // Lint complains about ctime() usage, so replace it with ctime_r(). The
    // requirement is to provide a buffer which is at least 26 bytes.
    fprintf(stderr, "Date: %s",
            ctime_r(&now, buf));  // ctime_r() adds newline
#endif
#if defined(__linux)
    FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
    if (cpuinfo != nullptr) {
      char line[1000];
      int num_cpus = 0;
      std::string cpu_type;
      std::string cache_size;
      while (fgets(line, sizeof(line), cpuinfo) != nullptr) {
        const char* sep = strchr(line, ':');
        if (sep == nullptr) {
          continue;
        }
        Slice key = TrimSpace(Slice(line, sep - 1 - line));
        Slice val = TrimSpace(Slice(sep + 1));
        if (key == "model name") {
          ++num_cpus;
          cpu_type = val.ToString();
        } else if (key == "cache size") {
          cache_size = val.ToString();
        }
      }
      fclose(cpuinfo);
      fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str());
      fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
    }
#elif defined(__APPLE__)
    struct host_basic_info h;
    size_t hlen = HOST_BASIC_INFO_COUNT;
    if (host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&h,
                  (uint32_t*)&hlen) == KERN_SUCCESS) {
      std::string cpu_type;
      std::string cache_size;
      size_t hcache_size;
      hlen = sizeof(hcache_size);
      if (sysctlbyname("hw.cachelinesize", &hcache_size, &hlen, NULL, 0) == 0) {
        cache_size = std::to_string(hcache_size);
      }
      switch (h.cpu_type) {
        case CPU_TYPE_X86_64:
          cpu_type = "x86_64";
          break;
        case CPU_TYPE_ARM64:
          cpu_type = "arm64";
          break;
        default:
          break;
      }
      fprintf(stderr, "CPU: %d * %s\n", h.max_cpus, cpu_type.c_str());
      fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
    }
#elif defined(__FreeBSD__)
    int ncpus;
    size_t len = sizeof(ncpus);
    int mib[2] = {CTL_HW, HW_NCPU};
    if (sysctl(mib, 2, &ncpus, &len, nullptr, 0) == 0) {
      char cpu_type[16];
      len = sizeof(cpu_type) - 1;
      mib[1] = HW_MACHINE;
      if (sysctl(mib, 2, cpu_type, &len, nullptr, 0) == 0) cpu_type[len] = 0;

      fprintf(stderr, "CPU: %d * %s\n", ncpus, cpu_type);
      // no programmatic way to get the cache line size except on PPC
    }
#endif
  }
  static bool KeyExpired(const TimestampEmulator* timestamp_emulator,
                         const Slice& key) {
    const char* pos = key.data();
    pos += 8;
    uint64_t timestamp = 0;
    if (port::kLittleEndian) {
      int bytes_to_fill = 8;
      for (int i = 0; i < bytes_to_fill; ++i) {
        timestamp |= (static_cast<uint64_t>(static_cast<unsigned char>(pos[i]))
                      << ((bytes_to_fill - i - 1) << 3));
      }
    } else {
      memcpy(&timestamp, pos, sizeof(timestamp));
    }
    return timestamp_emulator->Get() - timestamp > FLAGS_time_range;
  }
  class ExpiredTimeFilter : public CompactionFilter {
   public:
    explicit ExpiredTimeFilter(
        const std::shared_ptr<TimestampEmulator>& timestamp_emulator)
        : timestamp_emulator_(timestamp_emulator) {}
    bool Filter(int /*level*/, const Slice& key,
                const Slice& /*existing_value*/, std::string* /*new_value*/,
                bool* /*value_changed*/) const override {
      return KeyExpired(timestamp_emulator_.get(), key);
    }
    const char* Name() const override { return "ExpiredTimeFilter"; }

   private:
    std::shared_ptr<TimestampEmulator> timestamp_emulator_;
  };
  class KeepFilter : public CompactionFilter {
   public:
    bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
                std::string* /*new_value*/,
                bool* /*value_changed*/) const override {
      return false;
    }

    const char* Name() const override { return "KeepFilter"; }
  };

  static std::shared_ptr<MemoryAllocator> GetCacheAllocator() {
    std::shared_ptr<MemoryAllocator> allocator;

    if (FLAGS_use_cache_jemalloc_no_dump_allocator) {
      JemallocAllocatorOptions jemalloc_options;
      if (!NewJemallocNodumpAllocator(jemalloc_options, &allocator).ok()) {
        fprintf(stderr, "JemallocNodumpAllocator not supported.\n");
        exit(1);
      }
    } else if (FLAGS_use_cache_memkind_kmem_allocator) {
#ifdef MEMKIND
      allocator = std::make_shared<MemkindKmemAllocator>();
#else
      fprintf(stderr, "Memkind library is not linked with the binary.\n");
      exit(1);
#endif
    }

    return allocator;
  }
  static std::shared_ptr<Cache> NewCache(int64_t capacity) {
    if (capacity <= 0) {
      return nullptr;
    }
    if (FLAGS_cache_type == "clock_cache") {
      fprintf(stderr, "Old clock cache implementation has been removed.\n");
      exit(1);
    } else if (FLAGS_cache_type == "hyper_clock_cache") {
      return HyperClockCacheOptions(static_cast<size_t>(capacity),
                                    FLAGS_block_size /*estimated_entry_charge*/,
                                    FLAGS_cache_numshardbits)
          .MakeSharedCache();
    } else if (FLAGS_cache_type == "lru_cache") {
      LRUCacheOptions opts(
          static_cast<size_t>(capacity), FLAGS_cache_numshardbits,
          false /*strict_capacity_limit*/, FLAGS_cache_high_pri_pool_ratio,
          GetCacheAllocator(), kDefaultToAdaptiveMutex,
          kDefaultCacheMetadataChargePolicy, FLAGS_cache_low_pri_pool_ratio);
#ifndef ROCKSDB_LITE
      if (!FLAGS_secondary_cache_uri.empty()) {
        Status s = SecondaryCache::CreateFromString(
            ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache);
        if (secondary_cache == nullptr) {
          fprintf(
              stderr,
              "No secondary cache registered matching string: %s status=%s\n",
              FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str());
          exit(1);
        }
        opts.secondary_cache = secondary_cache;
      }
#endif  // ROCKSDB_LITE

      if (FLAGS_use_compressed_secondary_cache) {
        CompressedSecondaryCacheOptions secondary_cache_opts;
        secondary_cache_opts.capacity = FLAGS_compressed_secondary_cache_size;
        secondary_cache_opts.num_shard_bits =
            FLAGS_compressed_secondary_cache_numshardbits;
        secondary_cache_opts.high_pri_pool_ratio =
            FLAGS_compressed_secondary_cache_high_pri_pool_ratio;
        secondary_cache_opts.low_pri_pool_ratio =
            FLAGS_compressed_secondary_cache_low_pri_pool_ratio;
        secondary_cache_opts.compression_type =
            FLAGS_compressed_secondary_cache_compression_type_e;
        secondary_cache_opts.compress_format_version =
            FLAGS_compressed_secondary_cache_compress_format_version;
        opts.secondary_cache =
            NewCompressedSecondaryCache(secondary_cache_opts);
      }

      return NewLRUCache(opts);
    } else {
      fprintf(stderr, "Cache type not supported.");
      exit(1);
    }
  }
 public:
  Benchmark()
      : cache_(NewCache(FLAGS_cache_size)),
        compressed_cache_(NewCache(FLAGS_compressed_cache_size)),
        prefix_extractor_(FLAGS_prefix_size != 0
                              ? NewFixedPrefixTransform(FLAGS_prefix_size)
                              : nullptr),
        num_(FLAGS_num),
        key_size_(FLAGS_key_size),
        user_timestamp_size_(FLAGS_user_timestamp_size),
        prefix_size_(FLAGS_prefix_size),
        total_thread_count_(0),
        keys_per_prefix_(FLAGS_keys_per_prefix),
        entries_per_batch_(1),
        reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
        read_random_exp_range_(0.0),
        writes_(FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes),
        readwrites_(
            (FLAGS_writes < 0 && FLAGS_reads < 0)
                ? FLAGS_num
                : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)),
        merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys),
        report_file_operations_(FLAGS_report_file_operations),
#ifndef ROCKSDB_LITE
        use_blob_db_(FLAGS_use_blob_db),  // Stacked BlobDB
#else
        use_blob_db_(false),  // Stacked BlobDB
#endif  // !ROCKSDB_LITE
        read_operands_(false) {
    // use simcache instead of cache
    if (FLAGS_simcache_size >= 0) {
      if (FLAGS_cache_numshardbits >= 1) {
        cache_ =
            NewSimCache(cache_, FLAGS_simcache_size, FLAGS_cache_numshardbits);
      } else {
        cache_ = NewSimCache(cache_, FLAGS_simcache_size, 0);
      }
    }

    if (report_file_operations_) {
      FLAGS_env = new CompositeEnvWrapper(
          FLAGS_env,
          std::make_shared<CountedFileSystem>(FLAGS_env->GetFileSystem()));
    }
    if (FLAGS_prefix_size > FLAGS_key_size) {
      fprintf(stderr, "prefix size is larger than key size");
      exit(1);
    }

    std::vector<std::string> files;
    FLAGS_env->GetChildren(FLAGS_db, &files);
    for (size_t i = 0; i < files.size(); i++) {
      if (Slice(files[i]).starts_with("heap-")) {
        FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
      }
    }
    if (!FLAGS_use_existing_db) {
      Options options;
      options.env = FLAGS_env;
      if (!FLAGS_wal_dir.empty()) {
        options.wal_dir = FLAGS_wal_dir;
      }
#ifndef ROCKSDB_LITE
      if (use_blob_db_) {
        // Stacked BlobDB
        blob_db::DestroyBlobDB(FLAGS_db, options, blob_db::BlobDBOptions());
      }
#endif  // !ROCKSDB_LITE
      DestroyDB(FLAGS_db, options);
      if (!FLAGS_wal_dir.empty()) {
        FLAGS_env->DeleteDir(FLAGS_wal_dir);
      }

      if (FLAGS_num_multi_db > 1) {
        FLAGS_env->CreateDir(FLAGS_db);
        if (!FLAGS_wal_dir.empty()) {
          FLAGS_env->CreateDir(FLAGS_wal_dir);
        }
      }
    }

    listener_.reset(new ErrorHandlerListener());
    if (user_timestamp_size_ > 0) {
      mock_app_clock_.reset(new TimestampEmulator());
    }
  }
  ~Benchmark() {
    db_.DeleteDBs();
    for (const DBWithColumnFamilies& dbwcf : multi_dbs_) {
      delete dbwcf.db;
    }
    if (cache_.get() != nullptr) {
      // Clear cache reference first
      open_options_.write_buffer_manager.reset();
      // this will leak, but we're shutting down so nobody cares
      cache_->DisownData();
    }
  }

  Slice AllocateKey(std::unique_ptr<const char[]>* key_guard) {
    char* data = new char[key_size_];
    const char* const_data = data;
    key_guard->reset(const_data);
    return Slice(key_guard->get(), key_size_);
  }
  // Generate key according to the given specification and random number.
  // The resulting key will have the following format:
  //   - If keys_per_prefix_ is positive, extra trailing bytes are either cut
  //     off or padded with '0'.
  //     The prefix value is derived from key value.
  //     ----------------------------
  //     | prefix 00000 | key 00000 |
  //     ----------------------------
  //
  //   - If keys_per_prefix_ is 0, the key is simply a binary representation of
  //     random number followed by trailing '0's
  //     ----------------------------
  //     |        key 00000         |
  //     ----------------------------
  void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) {
    if (!keys_.empty()) {
      assert(FLAGS_use_existing_keys);
      assert(keys_.size() == static_cast<size_t>(num_keys));
      assert(v < static_cast<uint64_t>(num_keys));
      *key = keys_[v];
      return;
    }
    char* start = const_cast<char*>(key->data());
    char* pos = start;
    if (keys_per_prefix_ > 0) {
      int64_t num_prefix = num_keys / keys_per_prefix_;
      int64_t prefix = v % num_prefix;
      int bytes_to_fill = std::min(prefix_size_, 8);
      if (port::kLittleEndian) {
        for (int i = 0; i < bytes_to_fill; ++i) {
          pos[i] = (prefix >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
        }
      } else {
        memcpy(pos, static_cast<void*>(&prefix), bytes_to_fill);
      }
      if (prefix_size_ > 8) {
        // fill the rest with 0s
        memset(pos + 8, '0', prefix_size_ - 8);
      }
      pos += prefix_size_;
    }

    int bytes_to_fill = std::min(key_size_ - static_cast<int>(pos - start), 8);
    if (port::kLittleEndian) {
      for (int i = 0; i < bytes_to_fill; ++i) {
        pos[i] = (v >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
      }
    } else {
      memcpy(pos, static_cast<void*>(&v), bytes_to_fill);
    }
    pos += bytes_to_fill;
    if (key_size_ > pos - start) {
      memset(pos, '0', key_size_ - (pos - start));
    }
  }
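
  // Worked example (hypothetical parameters): with key_size_ = 16,
  // prefix_size_ = 4 and keys_per_prefix_ = 10 over num_keys = 1000, a random
  // v maps to prefix = v % 100. The first 4 bytes carry the prefix, the next
  // 8 bytes carry v in big-endian byte order, and the remaining 4 bytes are
  // padded with '0', so keys sharing a prefix sort adjacently.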
  void GenerateKeyFromIntForSeek(uint64_t v, int64_t num_keys, Slice* key) {
    GenerateKeyFromInt(v, num_keys, key);
    if (FLAGS_seek_missing_prefix) {
      assert(prefix_size_ > 8);
      char* key_ptr = const_cast<char*>(key->data());
      // This relies on GenerateKeyFromInt filling paddings with '0's.
      // Putting a '1' will create a non-existing prefix.
      key_ptr[8] = '1';
    }
  }

  std::string GetPathForMultiple(std::string base_name, size_t id) {
    if (!base_name.empty()) {
#ifndef OS_WIN
      if (base_name.back() != '/') {
        base_name += '/';
      }
#else
      if (base_name.back() != '\\') {
        base_name += '\\';
      }
#endif
    }
    return base_name + std::to_string(id);
  }
  void VerifyDBFromDB(std::string& truth_db_name) {
    DBWithColumnFamilies truth_db;
    auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db);
    if (!s.ok()) {
      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
      exit(1);
    }
    ReadOptions ro;
    ro.total_order_seek = true;
    std::unique_ptr<Iterator> truth_iter(truth_db.db->NewIterator(ro));
    std::unique_ptr<Iterator> db_iter(db_.db->NewIterator(ro));
    // Verify that all the key/values in truth_db are retrievable in db with
    // ::Get
    fprintf(stderr, "Verifying db >= truth_db with ::Get...\n");
    for (truth_iter->SeekToFirst(); truth_iter->Valid(); truth_iter->Next()) {
      std::string value;
      s = db_.db->Get(ro, truth_iter->key(), &value);
      assert(s.ok());
      // TODO(myabandeh): provide debugging hints
      assert(Slice(value) == truth_iter->value());
    }
    // Verify that the db iterator does not give any extra key/value
    fprintf(stderr, "Verifying db == truth_db...\n");
    for (db_iter->SeekToFirst(), truth_iter->SeekToFirst(); db_iter->Valid();
         db_iter->Next(), truth_iter->Next()) {
      assert(truth_iter->Valid());
      assert(truth_iter->value() == db_iter->value());
    }
    // No more key should be left unchecked in truth_db
    assert(!truth_iter->Valid());
    fprintf(stderr, "...Verified\n");
  }
  void Run() {
    if (!SanityCheck()) {
      ErrorExit();
    }
    Open(&open_options_);
    PrintHeader(open_options_);
    std::stringstream benchmark_stream(FLAGS_benchmarks);
    std::string name;
    std::unique_ptr<ExpiredTimeFilter> filter;
    while (std::getline(benchmark_stream, name, ',')) {
      // Sanitize parameters
      num_ = FLAGS_num;
      reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
      writes_ = (FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes);
      deletes_ = (FLAGS_deletes < 0 ? FLAGS_num : FLAGS_deletes);
      value_size = FLAGS_value_size;
      key_size_ = FLAGS_key_size;
      entries_per_batch_ = FLAGS_batch_size;
      writes_before_delete_range_ = FLAGS_writes_before_delete_range;
      writes_per_range_tombstone_ = FLAGS_writes_per_range_tombstone;
      range_tombstone_width_ = FLAGS_range_tombstone_width;
      max_num_range_tombstones_ = FLAGS_max_num_range_tombstones;
      write_options_ = WriteOptions();
      read_random_exp_range_ = FLAGS_read_random_exp_range;
      if (FLAGS_sync) {
        write_options_.sync = true;
      }
      write_options_.disableWAL = FLAGS_disable_wal;
      write_options_.rate_limiter_priority =
          FLAGS_rate_limit_auto_wal_flush ? Env::IO_USER : Env::IO_TOTAL;
      read_options_ = ReadOptions(FLAGS_verify_checksum, true);
      read_options_.total_order_seek = FLAGS_total_order_seek;
      read_options_.prefix_same_as_start = FLAGS_prefix_same_as_start;
      read_options_.rate_limiter_priority =
          FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
      read_options_.tailing = FLAGS_use_tailing_iterator;
      read_options_.readahead_size = FLAGS_readahead_size;
      read_options_.adaptive_readahead = FLAGS_adaptive_readahead;
      read_options_.async_io = FLAGS_async_io;
      read_options_.optimize_multiget_for_io = FLAGS_optimize_multiget_for_io;
3376 void (Benchmark::*method
)(ThreadState
*) = nullptr;
3377 void (Benchmark::*post_process_method
)() = nullptr;
3379 bool fresh_db
= false;
3380 int num_threads
= FLAGS_threads
;
3384 if (!name
.empty() && *name
.rbegin() == ']') {
3385 auto it
= name
.find('[');
3386 if (it
== std::string::npos
) {
3387 fprintf(stderr
, "unknown benchmark arguments '%s'\n", name
.c_str());
3390 std::string args
= name
.substr(it
+ 1);
3391 args
.resize(args
.size() - 1);
3394 std::string bench_arg
;
3395 std::stringstream
args_stream(args
);
3396 while (std::getline(args_stream
, bench_arg
, '-')) {
3397 if (bench_arg
.empty()) {
3400 if (bench_arg
[0] == 'X') {
3401 // Repeat the benchmark n times
3402 std::string num_str
= bench_arg
.substr(1);
3403 num_repeat
= std::stoi(num_str
);
3404 } else if (bench_arg
[0] == 'W') {
3405 // Warm up the benchmark for n times
3406 std::string num_str
= bench_arg
.substr(1);
3407 num_warmup
= std::stoi(num_str
);
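      // Example of the syntax parsed above: --benchmarks="readrandom[X3-W1]"
      // runs readrandom once as warmup ('W1') and then three measured times
      // ('X3'), with combined stats reported after the repeats.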
      // Both fillseqdeterministic and filluniquerandomdeterministic
      // fill the levels except the max level with UNIQUE_RANDOM
      // and fill the max level with fillseq and filluniquerandom, respectively
      if (name == "fillseqdeterministic" ||
          name == "filluniquerandomdeterministic") {
        if (!FLAGS_disable_auto_compactions) {
          fprintf(stderr,
                  "Please disable_auto_compactions in FillDeterministic "
                  "benchmark\n");
          exit(1);
        }
        if (num_threads > 1) {
          fprintf(stderr,
                  "filldeterministic multithreaded not supported"
                  ", use 1 thread\n");
          num_threads = 1;
        }
        fresh_db = true;
        if (name == "fillseqdeterministic") {
          method = &Benchmark::WriteSeqDeterministic;
        } else {
          method = &Benchmark::WriteUniqueRandomDeterministic;
        }
      } else if (name == "fillseq") {
        fresh_db = true;
        method = &Benchmark::WriteSeq;
      } else if (name == "fillbatch") {
        fresh_db = true;
        entries_per_batch_ = 1000;
        method = &Benchmark::WriteSeq;
      } else if (name == "fillrandom") {
        fresh_db = true;
        method = &Benchmark::WriteRandom;
      } else if (name == "filluniquerandom" ||
                 name == "fillanddeleteuniquerandom") {
        fresh_db = true;
        if (num_threads > 1) {
          fprintf(stderr,
                  "filluniquerandom and fillanddeleteuniquerandom "
                  "multithreaded not supported, use 1 thread");
          num_threads = 1;
        }
        method = &Benchmark::WriteUniqueRandom;
      } else if (name == "overwrite") {
        method = &Benchmark::WriteRandom;
      } else if (name == "fillsync") {
        fresh_db = true;
        num_ /= 1000;
        write_options_.sync = true;
        method = &Benchmark::WriteRandom;
      } else if (name == "fill100K") {
        fresh_db = true;
        num_ /= 1000;
        value_size = 100 * 1000;
        method = &Benchmark::WriteRandom;
      } else if (name == "readseq") {
        method = &Benchmark::ReadSequential;
      } else if (name == "readtorowcache") {
        if (!FLAGS_use_existing_keys || !FLAGS_row_cache_size) {
          fprintf(stderr,
                  "Please set use_existing_keys to true and specify a "
                  "row cache size in readtorowcache benchmark\n");
          exit(1);
        }
        method = &Benchmark::ReadToRowCache;
      } else if (name == "readtocache") {
        method = &Benchmark::ReadSequential;
        num_threads = 1;
        reads_ = num_;
      } else if (name == "readreverse") {
        method = &Benchmark::ReadReverse;
      } else if (name == "readrandom") {
        if (FLAGS_multiread_stride) {
          fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                  entries_per_batch_);
        }
        method = &Benchmark::ReadRandom;
      } else if (name == "readrandomfast") {
        method = &Benchmark::ReadRandomFast;
      } else if (name == "multireadrandom") {
        fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                entries_per_batch_);
        method = &Benchmark::MultiReadRandom;
      } else if (name == "multireadwhilewriting") {
        fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                entries_per_batch_);
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::MultiReadWhileWriting;
      } else if (name == "approximatesizerandom") {
        fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                entries_per_batch_);
        method = &Benchmark::ApproximateSizeRandom;
      } else if (name == "mixgraph") {
        method = &Benchmark::MixGraph;
      } else if (name == "readmissing") {
        ++key_size_;
        method = &Benchmark::ReadRandom;
      } else if (name == "newiterator") {
        method = &Benchmark::IteratorCreation;
      } else if (name == "newiteratorwhilewriting") {
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::IteratorCreationWhileWriting;
      } else if (name == "seekrandom") {
        method = &Benchmark::SeekRandom;
      } else if (name == "seekrandomwhilewriting") {
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::SeekRandomWhileWriting;
      } else if (name == "seekrandomwhilemerging") {
        num_threads++;  // Add extra thread for merging
        method = &Benchmark::SeekRandomWhileMerging;
      } else if (name == "readrandomsmall") {
        reads_ /= 1000;
        method = &Benchmark::ReadRandom;
      } else if (name == "deleteseq") {
        method = &Benchmark::DeleteSeq;
      } else if (name == "deleterandom") {
        method = &Benchmark::DeleteRandom;
      } else if (name == "readwhilewriting") {
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::ReadWhileWriting;
      } else if (name == "readwhilemerging") {
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::ReadWhileMerging;
      } else if (name == "readwhilescanning") {
        num_threads++;  // Add extra thread for scanning
        method = &Benchmark::ReadWhileScanning;
      } else if (name == "readrandomwriterandom") {
        method = &Benchmark::ReadRandomWriteRandom;
      } else if (name == "readrandommergerandom") {
        if (FLAGS_merge_operator.empty()) {
          fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
                  name.c_str());
          exit(1);
        }
        method = &Benchmark::ReadRandomMergeRandom;
      } else if (name == "updaterandom") {
        method = &Benchmark::UpdateRandom;
      } else if (name == "xorupdaterandom") {
        method = &Benchmark::XORUpdateRandom;
      } else if (name == "appendrandom") {
        method = &Benchmark::AppendRandom;
      } else if (name == "mergerandom") {
        if (FLAGS_merge_operator.empty()) {
          fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
                  name.c_str());
          exit(1);
        }
        method = &Benchmark::MergeRandom;
      } else if (name == "randomwithverify") {
        method = &Benchmark::RandomWithVerify;
      } else if (name == "fillseekseq") {
        method = &Benchmark::WriteSeqSeekSeq;
      } else if (name == "compact") {
        method = &Benchmark::Compact;
      } else if (name == "compactall") {
        CompactAll();
#ifndef ROCKSDB_LITE
      } else if (name == "compact0") {
        CompactLevel(0);
      } else if (name == "compact1") {
        CompactLevel(1);
      } else if (name == "waitforcompaction") {
        WaitForCompaction();
#endif  // ROCKSDB_LITE
      } else if (name == "flush") {
        Flush();
      } else if (name == "crc32c") {
        method = &Benchmark::Crc32c;
      } else if (name == "xxhash") {
        method = &Benchmark::xxHash;
      } else if (name == "xxhash64") {
        method = &Benchmark::xxHash64;
      } else if (name == "xxh3") {
        method = &Benchmark::xxh3;
      } else if (name == "acquireload") {
        method = &Benchmark::AcquireLoad;
      } else if (name == "compress") {
        method = &Benchmark::Compress;
      } else if (name == "uncompress") {
        method = &Benchmark::Uncompress;
#ifndef ROCKSDB_LITE
      } else if (name == "randomtransaction") {
        method = &Benchmark::RandomTransaction;
        post_process_method = &Benchmark::RandomTransactionVerify;
#endif  // ROCKSDB_LITE
      } else if (name == "randomreplacekeys") {
        fresh_db = true;
        method = &Benchmark::RandomReplaceKeys;
      } else if (name == "timeseries") {
        timestamp_emulator_.reset(new TimestampEmulator());
        if (FLAGS_expire_style == "compaction_filter") {
          filter.reset(new ExpiredTimeFilter(timestamp_emulator_));
          fprintf(stdout, "Compaction filter is used to remove expired data");
          open_options_.compaction_filter = filter.get();
        }
        fresh_db = true;
        method = &Benchmark::TimeSeries;
      } else if (name == "block_cache_entry_stats") {
        // DB::Properties::kBlockCacheEntryStats
        PrintStats("rocksdb.block-cache-entry-stats");
      } else if (name == "stats") {
        PrintStats("rocksdb.stats");
      } else if (name == "resetstats") {
        ResetStats();
      } else if (name == "verify") {
        VerifyDBFromDB(FLAGS_truth_db);
      } else if (name == "levelstats") {
        PrintStats("rocksdb.levelstats");
      } else if (name == "memstats") {
        std::vector<std::string> keys{"rocksdb.num-immutable-mem-table",
                                      "rocksdb.cur-size-active-mem-table",
                                      "rocksdb.cur-size-all-mem-tables",
                                      "rocksdb.size-all-mem-tables",
                                      "rocksdb.num-entries-active-mem-table",
                                      "rocksdb.num-entries-imm-mem-tables"};
        PrintStats(keys);
      } else if (name == "sstables") {
        PrintStats("rocksdb.sstables");
      } else if (name == "stats_history") {
        PrintStatsHistory();
#ifndef ROCKSDB_LITE
      } else if (name == "replay") {
        if (num_threads > 1) {
          fprintf(stderr, "Multi-threaded replay is not yet supported\n");
          exit(1);
        }
        if (FLAGS_trace_file == "") {
          fprintf(stderr, "Please set --trace_file to be replayed from\n");
          exit(1);
        }
        method = &Benchmark::Replay;
#endif  // ROCKSDB_LITE
      } else if (name == "getmergeoperands") {
        method = &Benchmark::GetMergeOperands;
#ifndef ROCKSDB_LITE
      } else if (name == "verifychecksum") {
        method = &Benchmark::VerifyChecksum;
      } else if (name == "verifyfilechecksums") {
        method = &Benchmark::VerifyFileChecksums;
#endif  // ROCKSDB_LITE
      } else if (name == "readrandomoperands") {
        read_operands_ = true;
        method = &Benchmark::ReadRandom;
#ifndef ROCKSDB_LITE
      } else if (name == "backup") {
        method = &Benchmark::Backup;
      } else if (name == "restore") {
        method = &Benchmark::Restore;
#endif  // ROCKSDB_LITE
      } else if (!name.empty()) {  // No error message for empty name
        fprintf(stderr, "unknown benchmark '%s'\n", name.c_str());
        exit(1);
      }
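      // Benchmarks that set fresh_db above recreate the database from
      // scratch below; with --use_existing_db they are skipped instead, so
      // an existing DB is never destroyed by accident.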
      if (fresh_db) {
        if (FLAGS_use_existing_db) {
          fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
                  name.c_str());
          method = nullptr;
        } else {
          if (db_.db != nullptr) {
            db_.DeleteDBs();
            DestroyDB(FLAGS_db, open_options_);
          }
          Options options = open_options_;
          for (size_t i = 0; i < multi_dbs_.size(); i++) {
            delete multi_dbs_[i].db;
            if (!open_options_.wal_dir.empty()) {
              options.wal_dir = GetPathForMultiple(open_options_.wal_dir, i);
            }
            DestroyDB(GetPathForMultiple(FLAGS_db, i), options);
          }
          multi_dbs_.clear();
        }

        Open(&open_options_);  // use open_options for the last accessed
      }

      if (method != nullptr) {
        fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());

#ifndef ROCKSDB_LITE
        if (name == "backup") {
          std::cout << "Backup path: [" << FLAGS_backup_dir << "]"
                    << std::endl;
        } else if (name == "restore") {
          std::cout << "Backup path: [" << FLAGS_backup_dir << "]"
                    << std::endl;
          std::cout << "Restore path: [" << FLAGS_restore_dir << "]"
                    << std::endl;
        }
        // A trace_file option can be provided both for trace and replay
        // operations. But db_bench does not support tracing and replaying at
        // the same time, for now. So, start tracing only when it is not a
        // replay.
        if (FLAGS_trace_file != "" && name != "replay") {
          std::unique_ptr<TraceWriter> trace_writer;
          Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
                                        FLAGS_trace_file, &trace_writer);
          if (!s.ok()) {
            fprintf(stderr, "Encountered an error starting a trace, %s\n",
                    s.ToString().c_str());
            exit(1);
          }
          s = db_.db->StartTrace(trace_options_, std::move(trace_writer));
          if (!s.ok()) {
            fprintf(stderr, "Encountered an error starting a trace, %s\n",
                    s.ToString().c_str());
            exit(1);
          }
          fprintf(stdout, "Tracing the workload to: [%s]\n",
                  FLAGS_trace_file.c_str());
        }
        // Start block cache tracing.
        if (!FLAGS_block_cache_trace_file.empty()) {
          // Sanity checks.
          if (FLAGS_block_cache_trace_sampling_frequency <= 0) {
            fprintf(stderr,
                    "Block cache trace sampling frequency must be higher than "
                    "0.\n");
            exit(1);
          }
          if (FLAGS_block_cache_trace_max_trace_file_size_in_bytes <= 0) {
            fprintf(stderr,
                    "The maximum file size for block cache tracing must be "
                    "higher than 0.\n");
            exit(1);
          }
          block_cache_trace_options_.max_trace_file_size =
              FLAGS_block_cache_trace_max_trace_file_size_in_bytes;
          block_cache_trace_options_.sampling_frequency =
              FLAGS_block_cache_trace_sampling_frequency;
          std::unique_ptr<TraceWriter> block_cache_trace_writer;
          Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
                                        FLAGS_block_cache_trace_file,
                                        &block_cache_trace_writer);
          if (!s.ok()) {
            fprintf(stderr,
                    "Encountered an error when creating trace writer, %s\n",
                    s.ToString().c_str());
            exit(1);
          }
          s = db_.db->StartBlockCacheTrace(
              block_cache_trace_options_, std::move(block_cache_trace_writer));
          if (!s.ok()) {
            fprintf(
                stderr,
                "Encountered an error when starting block cache tracing, %s\n",
                s.ToString().c_str());
            exit(1);
          }
          fprintf(stdout, "Tracing block cache accesses to: [%s]\n",
                  FLAGS_block_cache_trace_file.c_str());
        }
#endif  // ROCKSDB_LITE
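        // Typical two-step use (illustrative paths): record a workload with
        //   db_bench --benchmarks=readrandom --trace_file=/tmp/trace
        // and replay it later with
        //   db_bench --use_existing_db=1 --benchmarks=replay \
        //            --trace_file=/tmp/trace
        // since, per the comment above, tracing and replaying cannot happen
        // in the same run.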
        if (num_warmup > 0) {
          printf("Warming up benchmark by running %d times\n", num_warmup);
        }

        for (int i = 0; i < num_warmup; i++) {
          RunBenchmark(num_threads, name, method);
        }

        if (num_repeat > 1) {
          printf("Running benchmark for %d times\n", num_repeat);
        }

        CombinedStats combined_stats;
        for (int i = 0; i < num_repeat; i++) {
          Stats stats = RunBenchmark(num_threads, name, method);
          combined_stats.AddStats(stats);
          if (FLAGS_confidence_interval_only) {
            combined_stats.ReportWithConfidenceIntervals(name);
          } else {
            combined_stats.Report(name);
          }
        }
        if (num_repeat > 1) {
          combined_stats.ReportFinal(name);
        }
      }
      if (post_process_method != nullptr) {
        (this->*post_process_method)();
      }
    }

    if (secondary_update_thread_) {
      secondary_update_stopped_.store(1, std::memory_order_relaxed);
      secondary_update_thread_->join();
      secondary_update_thread_.reset();
    }

#ifndef ROCKSDB_LITE
    if (name != "replay" && FLAGS_trace_file != "") {
      Status s = db_.db->EndTrace();
      if (!s.ok()) {
        fprintf(stderr, "Encountered an error ending the trace, %s\n",
                s.ToString().c_str());
      }
    }
    if (!FLAGS_block_cache_trace_file.empty()) {
      Status s = db_.db->EndBlockCacheTrace();
      if (!s.ok()) {
        fprintf(stderr,
                "Encountered an error ending the block cache tracing, %s\n",
                s.ToString().c_str());
      }
    }
#endif  // ROCKSDB_LITE

    if (FLAGS_statistics) {
      fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
    }
    if (FLAGS_simcache_size >= 0) {
      fprintf(
          stdout, "SIMULATOR CACHE STATISTICS:\n%s\n",
          static_cast_with_check<SimCache>(cache_.get())->ToString().c_str());
    }

#ifndef ROCKSDB_LITE
    if (FLAGS_use_secondary_db) {
      fprintf(stdout, "Secondary instance updated %" PRIu64 " times.\n",
              secondary_db_updates_);
    }
#endif  // ROCKSDB_LITE
  }
  std::shared_ptr<TimestampEmulator> timestamp_emulator_;
  std::unique_ptr<port::Thread> secondary_update_thread_;
  std::atomic<int> secondary_update_stopped_{0};
#ifndef ROCKSDB_LITE
  uint64_t secondary_db_updates_ = 0;
#endif  // ROCKSDB_LITE

  struct ThreadArg {
    Benchmark* bm;
    SharedState* shared;
    ThreadState* thread;
    void (Benchmark::*method)(ThreadState*);
  };
  static void ThreadBody(void* v) {
    ThreadArg* arg = reinterpret_cast<ThreadArg*>(v);
    SharedState* shared = arg->shared;
    ThreadState* thread = arg->thread;
    {
      MutexLock l(&shared->mu);
      shared->num_initialized++;
      if (shared->num_initialized >= shared->total) {
        shared->cv.SignalAll();
      }
      while (!shared->start) {
        shared->cv.Wait();
      }
    }

    SetPerfLevel(static_cast<PerfLevel>(shared->perf_level));
    perf_context.EnablePerLevelPerfContext();
    thread->stats.Start(thread->tid);
    (arg->bm->*(arg->method))(thread);
    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
    thread->stats.Stop();

    {
      MutexLock l(&shared->mu);
      shared->num_done++;
      if (shared->num_done >= shared->total) {
        shared->cv.SignalAll();
      }
    }
  }
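  // ThreadBody implements a two-phase rendezvous over SharedState's
  // mutex/condvar: each worker bumps num_initialized and blocks until the
  // coordinator (RunBenchmark below) observes all workers ready, flips
  // start, and finally waits for num_done to reach the thread total.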
  Stats RunBenchmark(int n, Slice name,
                     void (Benchmark::*method)(ThreadState*)) {
    SharedState shared;
    shared.total = n;
    shared.num_initialized = 0;
    shared.num_done = 0;
    shared.start = false;
    if (FLAGS_benchmark_write_rate_limit > 0) {
      shared.write_rate_limiter.reset(
          NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
    }
    if (FLAGS_benchmark_read_rate_limit > 0) {
      shared.read_rate_limiter.reset(NewGenericRateLimiter(
          FLAGS_benchmark_read_rate_limit, 100000 /* refill_period_us */,
          10 /* fairness */, RateLimiter::Mode::kReadsOnly));
    }

    std::unique_ptr<ReporterAgent> reporter_agent;
    if (FLAGS_report_interval_seconds > 0) {
      reporter_agent.reset(new ReporterAgent(FLAGS_env, FLAGS_report_file,
                                             FLAGS_report_interval_seconds));
    }

    ThreadArg* arg = new ThreadArg[n];

    for (int i = 0; i < n; i++) {
#ifdef NUMA
      if (FLAGS_enable_numa) {
        // Performs a local allocation of memory to threads in numa node.
        int n_nodes = numa_num_task_nodes();  // Number of nodes in NUMA.
        numa_exit_on_error = 1;
        int numa_node = i % n_nodes;
        bitmask* nodes = numa_allocate_nodemask();
        numa_bitmask_clearall(nodes);
        numa_bitmask_setbit(nodes, numa_node);
        // numa_bind() call binds the process to the node and these
        // properties are passed on to the thread that is created in
        // StartThread method called later in the loop.
        numa_bind(nodes);
        numa_free_nodemask(nodes);
      }
#endif
      arg[i].bm = this;
      arg[i].method = method;
      arg[i].shared = &shared;
      total_thread_count_++;
      arg[i].thread = new ThreadState(i, total_thread_count_);
      arg[i].thread->stats.SetReporterAgent(reporter_agent.get());
      arg[i].thread->shared = &shared;
      FLAGS_env->StartThread(ThreadBody, &arg[i]);
    }

    shared.mu.Lock();
    while (shared.num_initialized < n) {
      shared.cv.Wait();
    }

    shared.start = true;
    shared.cv.SignalAll();
    while (shared.num_done < n) {
      shared.cv.Wait();
    }
    shared.mu.Unlock();

    // Stats for some threads can be excluded.
    Stats merge_stats;
    for (int i = 0; i < n; i++) {
      merge_stats.Merge(arg[i].thread->stats);
    }
    merge_stats.Report(name);

    for (int i = 0; i < n; i++) {
      delete arg[i].thread;
    }
    delete[] arg;

    return merge_stats;
  }
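  // Note on the NUMA path above: with --enable_numa, worker i is bound to
  // NUMA node (i % numa_num_task_nodes()) before FLAGS_env->StartThread()
  // creates it, so e.g. an 8-thread run on a 2-node machine alternates
  // threads between the two nodes.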
  template <OperationType kOpType, typename FnType, typename... Args>
  static inline void ChecksumBenchmark(FnType fn, ThreadState* thread,
                                       Args... args) {
    const int size = FLAGS_block_size;  // use --block_size option for db_bench
    std::string labels = "(" + std::to_string(FLAGS_block_size) + " per op)";
    const char* label = labels.c_str();

    std::string data(size, 'x');
    uint64_t bytes = 0;
    uint32_t val = 0;
    while (bytes < 5000U * uint64_t{1048576}) {  // ~5GB
      val += static_cast<uint32_t>(fn(data.data(), size, args...));
      thread->stats.FinishedOps(nullptr, nullptr, 1, kOpType);
      bytes += size;
    }

    // Print so result is not dead
    fprintf(stderr, "... val=0x%x\r", static_cast<unsigned int>(val));

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(label);
  }
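  // The trailing Args parameter pack lets one template serve hash functions
  // with different signatures: XXH32/XXH64 below pass an extra /*seed*/ 0
  // argument, while crc32c::Value and XXH3_64bits take only (data, size).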
  void Crc32c(ThreadState* thread) {
    ChecksumBenchmark<kCrc>(crc32c::Value, thread);
  }

  void xxHash(ThreadState* thread) {
    ChecksumBenchmark<kHash>(XXH32, thread, /*seed*/ 0);
  }

  void xxHash64(ThreadState* thread) {
    ChecksumBenchmark<kHash>(XXH64, thread, /*seed*/ 0);
  }

  void xxh3(ThreadState* thread) {
    ChecksumBenchmark<kHash>(XXH3_64bits, thread);
  }

  void AcquireLoad(ThreadState* thread) {
    int dummy;
    std::atomic<void*> ap(&dummy);
    int count = 0;
    void* ptr = nullptr;
    thread->stats.AddMessage("(each op is 1000 loads)");
    while (count < 100000) {
      for (int i = 0; i < 1000; i++) {
        ptr = ap.load(std::memory_order_acquire);
      }
      count++;
      thread->stats.FinishedOps(nullptr, nullptr, 1, kOthers);
    }
    if (ptr == nullptr) exit(1);  // Disable unused variable warning.
  }
  void Compress(ThreadState* thread) {
    RandomGenerator gen;
    Slice input = gen.Generate(FLAGS_block_size);
    int64_t bytes = 0;
    int64_t produced = 0;
    bool ok = true;
    std::string compressed;
    CompressionOptions opts;
    CompressionContext context(FLAGS_compression_type_e);
    CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
                         FLAGS_compression_type_e,
                         FLAGS_sample_for_compression);
    // Compress 1G
    while (ok && bytes < int64_t(1) << 30) {
      compressed.clear();
      ok = CompressSlice(info, input, &compressed);
      produced += compressed.size();
      bytes += input.size();
      thread->stats.FinishedOps(nullptr, nullptr, 1, kCompress);
    }

    if (!ok) {
      thread->stats.AddMessage("(compression failure)");
    } else {
      char buf[100];
      snprintf(buf, sizeof(buf), "(output: %.1f%%)",
               (produced * 100.0) / bytes);
      thread->stats.AddMessage(buf);
      thread->stats.AddBytes(bytes);
    }
  }
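  // The "(output: X%)" message is produced compressed bytes over consumed
  // input bytes, so e.g. 25.0% means the generated data compressed 4:1.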
  void Uncompress(ThreadState* thread) {
    RandomGenerator gen;
    Slice input = gen.Generate(FLAGS_block_size);
    std::string compressed;

    CompressionContext compression_ctx(FLAGS_compression_type_e);
    CompressionOptions compression_opts;
    CompressionInfo compression_info(
        compression_opts, compression_ctx, CompressionDict::GetEmptyDict(),
        FLAGS_compression_type_e, FLAGS_sample_for_compression);
    UncompressionContext uncompression_ctx(FLAGS_compression_type_e);
    UncompressionInfo uncompression_info(uncompression_ctx,
                                         UncompressionDict::GetEmptyDict(),
                                         FLAGS_compression_type_e);

    bool ok = CompressSlice(compression_info, input, &compressed);
    int64_t bytes = 0;
    size_t uncompressed_size = 0;
    while (ok && bytes < 1024 * 1048576) {
      constexpr uint32_t compress_format_version = 2;

      CacheAllocationPtr uncompressed = UncompressData(
          uncompression_info, compressed.data(), compressed.size(),
          &uncompressed_size, compress_format_version);

      ok = uncompressed.get() != nullptr;
      bytes += input.size();
      thread->stats.FinishedOps(nullptr, nullptr, 1, kUncompress);
    }

    if (!ok) {
      thread->stats.AddMessage("(compression failure)");
    } else {
      thread->stats.AddBytes(bytes);
    }
  }
  // Returns true if the options were initialized from the specified
  // options file.
  bool InitializeOptionsFromFile(Options* opts) {
#ifndef ROCKSDB_LITE
    printf("Initializing RocksDB Options from the specified file\n");
    DBOptions db_opts;
    std::vector<ColumnFamilyDescriptor> cf_descs;
    if (FLAGS_options_file != "") {
      auto s = LoadOptionsFromFile(FLAGS_options_file, FLAGS_env, &db_opts,
                                   &cf_descs);
      db_opts.env = FLAGS_env;
      if (s.ok()) {
        *opts = Options(db_opts, cf_descs[0].options);
        return true;
      }
      fprintf(stderr, "Unable to load options file %s --- %s\n",
              FLAGS_options_file.c_str(), s.ToString().c_str());
      exit(1);
    }
#else
    (void)opts;
#endif  // ROCKSDB_LITE
    return false;
  }
  void InitializeOptionsFromFlags(Options* opts) {
    printf("Initializing RocksDB Options from command-line flags\n");
    Options& options = *opts;
    ConfigOptions config_options(options);
    config_options.ignore_unsupported_options = false;

    assert(db_.db == nullptr);

    options.env = FLAGS_env;
    options.wal_dir = FLAGS_wal_dir;
    options.dump_malloc_stats = FLAGS_dump_malloc_stats;
    options.stats_dump_period_sec =
        static_cast<unsigned int>(FLAGS_stats_dump_period_sec);
    options.stats_persist_period_sec =
        static_cast<unsigned int>(FLAGS_stats_persist_period_sec);
    options.persist_stats_to_disk = FLAGS_persist_stats_to_disk;
    options.stats_history_buffer_size =
        static_cast<size_t>(FLAGS_stats_history_buffer_size);
    options.avoid_flush_during_recovery = FLAGS_avoid_flush_during_recovery;

    options.compression_opts.level = FLAGS_compression_level;
    options.compression_opts.max_dict_bytes = FLAGS_compression_max_dict_bytes;
    options.compression_opts.zstd_max_train_bytes =
        FLAGS_compression_zstd_max_train_bytes;
    options.compression_opts.parallel_threads =
        FLAGS_compression_parallel_threads;
    options.compression_opts.max_dict_buffer_bytes =
        FLAGS_compression_max_dict_buffer_bytes;
    options.compression_opts.use_zstd_dict_trainer =
        FLAGS_compression_use_zstd_dict_trainer;

    options.max_open_files = FLAGS_open_files;
    if (FLAGS_cost_write_buffer_to_cache || FLAGS_db_write_buffer_size != 0) {
      options.write_buffer_manager.reset(
          new WriteBufferManager(FLAGS_db_write_buffer_size, cache_));
    }
    options.arena_block_size = FLAGS_arena_block_size;
    options.write_buffer_size = FLAGS_write_buffer_size;
    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
    options.min_write_buffer_number_to_merge =
        FLAGS_min_write_buffer_number_to_merge;
    options.max_write_buffer_number_to_maintain =
        FLAGS_max_write_buffer_number_to_maintain;
    options.max_write_buffer_size_to_maintain =
        FLAGS_max_write_buffer_size_to_maintain;
    options.max_background_jobs = FLAGS_max_background_jobs;
    options.max_background_compactions = FLAGS_max_background_compactions;
    options.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
    options.max_background_flushes = FLAGS_max_background_flushes;
    options.compaction_style = FLAGS_compaction_style_e;
    options.compaction_pri = FLAGS_compaction_pri_e;
    options.allow_mmap_reads = FLAGS_mmap_read;
    options.allow_mmap_writes = FLAGS_mmap_write;
    options.use_direct_reads = FLAGS_use_direct_reads;
    options.use_direct_io_for_flush_and_compaction =
        FLAGS_use_direct_io_for_flush_and_compaction;
    options.manual_wal_flush = FLAGS_manual_wal_flush;
    options.wal_compression = FLAGS_wal_compression_e;
#ifndef ROCKSDB_LITE
    options.ttl = FLAGS_fifo_compaction_ttl;
    options.compaction_options_fifo = CompactionOptionsFIFO(
        FLAGS_fifo_compaction_max_table_files_size_mb * 1024 * 1024,
        FLAGS_fifo_compaction_allow_compaction);
    options.compaction_options_fifo.age_for_warm = FLAGS_fifo_age_for_warm;
#endif  // ROCKSDB_LITE
    options.prefix_extractor = prefix_extractor_;
    if (FLAGS_use_uint64_comparator) {
      options.comparator = test::Uint64Comparator();
      if (FLAGS_key_size != 8) {
        fprintf(stderr, "Using Uint64 comparator but key size is not 8.\n");
        exit(1);
      }
    }
    if (FLAGS_use_stderr_info_logger) {
      options.info_log.reset(new StderrLogger());
    }
    options.memtable_huge_page_size = FLAGS_memtable_use_huge_page ? 2048 : 0;
    options.memtable_prefix_bloom_size_ratio = FLAGS_memtable_bloom_size_ratio;
    options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering;
    if (FLAGS_memtable_insert_with_hint_prefix_size > 0) {
      options.memtable_insert_with_hint_prefix_extractor.reset(
          NewCappedPrefixTransform(
              FLAGS_memtable_insert_with_hint_prefix_size));
    }
    options.bloom_locality = FLAGS_bloom_locality;
    options.max_file_opening_threads = FLAGS_file_opening_threads;
    options.compaction_readahead_size = FLAGS_compaction_readahead_size;
    options.log_readahead_size = FLAGS_log_readahead_size;
    options.random_access_max_buffer_size = FLAGS_random_access_max_buffer_size;
    options.writable_file_max_buffer_size = FLAGS_writable_file_max_buffer_size;
    options.use_fsync = FLAGS_use_fsync;
    options.num_levels = FLAGS_num_levels;
    options.target_file_size_base = FLAGS_target_file_size_base;
    options.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
    options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
    options.level_compaction_dynamic_level_bytes =
        FLAGS_level_compaction_dynamic_level_bytes;
    options.max_bytes_for_level_multiplier =
        FLAGS_max_bytes_for_level_multiplier;
    Status s =
        CreateMemTableRepFactory(config_options, &options.memtable_factory);
    if (!s.ok()) {
      fprintf(stderr, "Could not create memtable factory: %s\n",
              s.ToString().c_str());
      exit(1);
    } else if ((FLAGS_prefix_size == 0) &&
               (options.memtable_factory->IsInstanceOf("prefix_hash") ||
                options.memtable_factory->IsInstanceOf("hash_linkedlist"))) {
      fprintf(stderr,
              "prefix_size should be non-zero if PrefixHash or "
              "HashLinkedList memtablerep is used\n");
      exit(1);
    }
    if (FLAGS_use_plain_table) {
#ifndef ROCKSDB_LITE
      if (!options.memtable_factory->IsInstanceOf("prefix_hash") &&
          !options.memtable_factory->IsInstanceOf("hash_linkedlist")) {
        fprintf(stderr, "Warning: plain table is used with %s\n",
                options.memtable_factory->Name());
      }

      int bloom_bits_per_key = FLAGS_bloom_bits;
      if (bloom_bits_per_key < 0) {
        bloom_bits_per_key = PlainTableOptions().bloom_bits_per_key;
      }

      PlainTableOptions plain_table_options;
      plain_table_options.user_key_len = FLAGS_key_size;
      plain_table_options.bloom_bits_per_key = bloom_bits_per_key;
      plain_table_options.hash_table_ratio = 0.75;
      options.table_factory = std::shared_ptr<TableFactory>(
          NewPlainTableFactory(plain_table_options));
#else
      fprintf(stderr, "Plain table is not supported in lite mode\n");
      exit(1);
#endif  // ROCKSDB_LITE
    } else if (FLAGS_use_cuckoo_table) {
#ifndef ROCKSDB_LITE
      if (FLAGS_cuckoo_hash_ratio > 1 || FLAGS_cuckoo_hash_ratio < 0) {
        fprintf(stderr, "Invalid cuckoo_hash_ratio\n");
        exit(1);
      }

      if (!FLAGS_mmap_read) {
        fprintf(stderr, "cuckoo table format requires mmap read to operate\n");
        exit(1);
      }

      ROCKSDB_NAMESPACE::CuckooTableOptions table_options;
      table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio;
      table_options.identity_as_first_hash = FLAGS_identity_as_first_hash;
      options.table_factory =
          std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
#else
      fprintf(stderr, "Cuckoo table is not supported in lite mode\n");
      exit(1);
#endif  // ROCKSDB_LITE
    } else {
      BlockBasedTableOptions block_based_options;
      block_based_options.checksum =
          static_cast<ChecksumType>(FLAGS_checksum_type);
      if (FLAGS_use_hash_search) {
        if (FLAGS_prefix_size == 0) {
          fprintf(stderr,
                  "prefix_size not assigned when enable use_hash_search \n");
          exit(1);
        }
        block_based_options.index_type = BlockBasedTableOptions::kHashSearch;
      } else {
        block_based_options.index_type = BlockBasedTableOptions::kBinarySearch;
      }
      if (FLAGS_partition_index_and_filters || FLAGS_partition_index) {
        if (FLAGS_index_with_first_key) {
          fprintf(stderr,
                  "--index_with_first_key is not compatible with"
                  " partition index.");
        }
        if (FLAGS_use_hash_search) {
          fprintf(stderr,
                  "use_hash_search is incompatible with "
                  "partition index and is ignored");
        }
        block_based_options.index_type =
            BlockBasedTableOptions::kTwoLevelIndexSearch;
        block_based_options.metadata_block_size = FLAGS_metadata_block_size;
        if (FLAGS_partition_index_and_filters) {
          block_based_options.partition_filters = true;
        }
      } else if (FLAGS_index_with_first_key) {
        block_based_options.index_type =
            BlockBasedTableOptions::kBinarySearchWithFirstKey;
      }
      BlockBasedTableOptions::IndexShorteningMode index_shortening =
          block_based_options.index_shortening;
      switch (FLAGS_index_shortening_mode) {
        case 0:
          index_shortening =
              BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
          break;
        case 1:
          index_shortening =
              BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators;
          break;
        case 2:
          index_shortening = BlockBasedTableOptions::IndexShorteningMode::
              kShortenSeparatorsAndSuccessor;
          break;
        default:
          fprintf(stderr, "Unknown key shortening mode\n");
      }
      block_based_options.optimize_filters_for_memory =
          FLAGS_optimize_filters_for_memory;
      block_based_options.index_shortening = index_shortening;
      if (cache_ == nullptr) {
        block_based_options.no_block_cache = true;
      }
      block_based_options.cache_index_and_filter_blocks =
          FLAGS_cache_index_and_filter_blocks;
      block_based_options.pin_l0_filter_and_index_blocks_in_cache =
          FLAGS_pin_l0_filter_and_index_blocks_in_cache;
      block_based_options.pin_top_level_index_and_filter =
          FLAGS_pin_top_level_index_and_filter;
      if (FLAGS_cache_high_pri_pool_ratio > 1e-6) {  // > 0.0 + eps
        block_based_options.cache_index_and_filter_blocks_with_high_priority =
            true;
      }
      if (FLAGS_cache_high_pri_pool_ratio + FLAGS_cache_low_pri_pool_ratio >
          1.0) {
        fprintf(stderr,
                "Sum of high_pri_pool_ratio and low_pri_pool_ratio "
                "cannot exceed 1.0.\n");
        exit(1);
      }
      block_based_options.block_cache = cache_;
      block_based_options.cache_usage_options.options_overrides.insert(
          {CacheEntryRole::kCompressionDictionaryBuildingBuffer,
           {/*.charged = */ FLAGS_charge_compression_dictionary_building_buffer
                ? CacheEntryRoleOptions::Decision::kEnabled
                : CacheEntryRoleOptions::Decision::kDisabled}});
      block_based_options.cache_usage_options.options_overrides.insert(
          {CacheEntryRole::kFilterConstruction,
           {/*.charged = */ FLAGS_charge_filter_construction
                ? CacheEntryRoleOptions::Decision::kEnabled
                : CacheEntryRoleOptions::Decision::kDisabled}});
      block_based_options.cache_usage_options.options_overrides.insert(
          {CacheEntryRole::kBlockBasedTableReader,
           {/*.charged = */ FLAGS_charge_table_reader
                ? CacheEntryRoleOptions::Decision::kEnabled
                : CacheEntryRoleOptions::Decision::kDisabled}});
      block_based_options.cache_usage_options.options_overrides.insert(
          {CacheEntryRole::kFileMetadata,
           {/*.charged = */ FLAGS_charge_file_metadata
                ? CacheEntryRoleOptions::Decision::kEnabled
                : CacheEntryRoleOptions::Decision::kDisabled}});
      block_based_options.cache_usage_options.options_overrides.insert(
          {CacheEntryRole::kBlobCache,
           {/*.charged = */ FLAGS_charge_blob_cache
                ? CacheEntryRoleOptions::Decision::kEnabled
                : CacheEntryRoleOptions::Decision::kDisabled}});
      block_based_options.block_cache_compressed = compressed_cache_;
      block_based_options.block_size = FLAGS_block_size;
      block_based_options.block_restart_interval = FLAGS_block_restart_interval;
      block_based_options.index_block_restart_interval =
          FLAGS_index_block_restart_interval;
      block_based_options.format_version =
          static_cast<uint32_t>(FLAGS_format_version);
      block_based_options.read_amp_bytes_per_bit = FLAGS_read_amp_bytes_per_bit;
      block_based_options.enable_index_compression =
          FLAGS_enable_index_compression;
      block_based_options.block_align = FLAGS_block_align;
      block_based_options.whole_key_filtering = FLAGS_whole_key_filtering;
      block_based_options.max_auto_readahead_size =
          FLAGS_max_auto_readahead_size;
      block_based_options.initial_auto_readahead_size =
          FLAGS_initial_auto_readahead_size;
      block_based_options.num_file_reads_for_auto_readahead =
          FLAGS_num_file_reads_for_auto_readahead;
      BlockBasedTableOptions::PrepopulateBlockCache prepopulate_block_cache =
          block_based_options.prepopulate_block_cache;
      switch (FLAGS_prepopulate_block_cache) {
        case 0:
          prepopulate_block_cache =
              BlockBasedTableOptions::PrepopulateBlockCache::kDisable;
          break;
        case 1:
          prepopulate_block_cache =
              BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
          break;
        default:
          fprintf(stderr, "Unknown prepopulate block cache mode\n");
      }
      block_based_options.prepopulate_block_cache = prepopulate_block_cache;
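      // Note on the cache_usage_options overrides above: every CacheEntryRole
      // follows the same pattern; when the corresponding --charge_* flag is
      // set, that category of memory is charged against the block cache's
      // capacity rather than being tracked outside of it.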
      if (FLAGS_use_data_block_hash_index) {
        block_based_options.data_block_index_type =
            ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinaryAndHash;
      } else {
        block_based_options.data_block_index_type =
            ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinarySearch;
      }
      block_based_options.data_block_hash_table_util_ratio =
          FLAGS_data_block_hash_table_util_ratio;
      if (FLAGS_read_cache_path != "") {
#ifndef ROCKSDB_LITE
        Status rc_status;

        // The read cache needs to be provided with a Logger; we will put all
        // read cache logs in the read cache path in a file named rc_LOG
        rc_status = FLAGS_env->CreateDirIfMissing(FLAGS_read_cache_path);
        std::shared_ptr<Logger> read_cache_logger;
        if (rc_status.ok()) {
          rc_status = FLAGS_env->NewLogger(FLAGS_read_cache_path + "/rc_LOG",
                                           &read_cache_logger);
        }

        if (rc_status.ok()) {
          PersistentCacheConfig rc_cfg(FLAGS_env, FLAGS_read_cache_path,
                                       FLAGS_read_cache_size,
                                       read_cache_logger);
          rc_cfg.enable_direct_reads = FLAGS_read_cache_direct_read;
          rc_cfg.enable_direct_writes = FLAGS_read_cache_direct_write;
          rc_cfg.writer_qdepth = 4;
          rc_cfg.writer_dispatch_size = 4 * 1024;

          auto pcache = std::make_shared<BlockCacheTier>(rc_cfg);
          block_based_options.persistent_cache = pcache;
          rc_status = pcache->Open();
        }

        if (!rc_status.ok()) {
          fprintf(stderr, "Error initializing read cache, %s\n",
                  rc_status.ToString().c_str());
          exit(1);
        }
#else
        fprintf(stderr, "Read cache is not supported in LITE\n");
        exit(1);
#endif
      }

      if (FLAGS_use_blob_cache) {
        if (FLAGS_use_shared_block_and_blob_cache) {
          options.blob_cache = cache_;
        } else {
          if (FLAGS_blob_cache_size > 0) {
            LRUCacheOptions co;
            co.capacity = FLAGS_blob_cache_size;
            co.num_shard_bits = FLAGS_blob_cache_numshardbits;
            co.memory_allocator = GetCacheAllocator();

            options.blob_cache = NewLRUCache(co);
          } else {
            fprintf(stderr,
                    "Unable to create a standalone blob cache if blob_cache_size "
                    "<= 0.\n");
            exit(1);
          }
        }
        switch (FLAGS_prepopulate_blob_cache) {
          case 0:
            options.prepopulate_blob_cache = PrepopulateBlobCache::kDisable;
            break;
          case 1:
            options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
            break;
          default:
            fprintf(stderr, "Unknown prepopulate blob cache mode\n");
            exit(1);
        }

        fprintf(stdout,
                "Integrated BlobDB: blob cache enabled"
                ", block and blob caches shared: %d",
                FLAGS_use_shared_block_and_blob_cache);
        if (!FLAGS_use_shared_block_and_blob_cache) {
          fprintf(stdout,
                  ", blob cache size %" PRIu64
                  ", blob cache num shard bits: %d",
                  FLAGS_blob_cache_size, FLAGS_blob_cache_numshardbits);
        }
        fprintf(stdout, ", blob cache prepopulated: %d\n",
                FLAGS_prepopulate_blob_cache);
      } else {
        fprintf(stdout, "Integrated BlobDB: blob cache disabled\n");
      }

      options.table_factory.reset(
          NewBlockBasedTableFactory(block_based_options));
    }
    if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) {
      if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() !=
          static_cast<unsigned int>(FLAGS_num_levels)) {
        fprintf(stderr, "Insufficient number of fanouts specified %d\n",
                static_cast<int>(
                    FLAGS_max_bytes_for_level_multiplier_additional_v.size()));
        exit(1);
      }
      options.max_bytes_for_level_multiplier_additional =
          FLAGS_max_bytes_for_level_multiplier_additional_v;
    }
    options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
    options.level0_file_num_compaction_trigger =
        FLAGS_level0_file_num_compaction_trigger;
    options.level0_slowdown_writes_trigger =
        FLAGS_level0_slowdown_writes_trigger;
    options.compression = FLAGS_compression_type_e;
    if (FLAGS_simulate_hybrid_fs_file != "") {
      options.bottommost_temperature = Temperature::kWarm;
    }
    options.preclude_last_level_data_seconds =
        FLAGS_preclude_last_level_data_seconds;
    options.preserve_internal_time_seconds =
        FLAGS_preserve_internal_time_seconds;
    options.sample_for_compression = FLAGS_sample_for_compression;
    options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
    options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
    options.max_total_wal_size = FLAGS_max_total_wal_size;

    if (FLAGS_min_level_to_compress >= 0) {
      assert(FLAGS_min_level_to_compress <= FLAGS_num_levels);
      options.compression_per_level.resize(FLAGS_num_levels);
      for (int i = 0; i < FLAGS_min_level_to_compress; i++) {
        options.compression_per_level[i] = kNoCompression;
      }
      for (int i = FLAGS_min_level_to_compress; i < FLAGS_num_levels; i++) {
        options.compression_per_level[i] = FLAGS_compression_type_e;
      }
    }
    options.soft_pending_compaction_bytes_limit =
        FLAGS_soft_pending_compaction_bytes_limit;
    options.hard_pending_compaction_bytes_limit =
        FLAGS_hard_pending_compaction_bytes_limit;
    options.delayed_write_rate = FLAGS_delayed_write_rate;
    options.allow_concurrent_memtable_write =
        FLAGS_allow_concurrent_memtable_write;
    options.experimental_mempurge_threshold =
        FLAGS_experimental_mempurge_threshold;
    options.inplace_update_support = FLAGS_inplace_update_support;
    options.inplace_update_num_locks = FLAGS_inplace_update_num_locks;
    options.enable_write_thread_adaptive_yield =
        FLAGS_enable_write_thread_adaptive_yield;
    options.enable_pipelined_write = FLAGS_enable_pipelined_write;
    options.unordered_write = FLAGS_unordered_write;
    options.write_thread_max_yield_usec = FLAGS_write_thread_max_yield_usec;
    options.write_thread_slow_yield_usec = FLAGS_write_thread_slow_yield_usec;
    options.table_cache_numshardbits = FLAGS_table_cache_numshardbits;
    options.max_compaction_bytes = FLAGS_max_compaction_bytes;
    options.disable_auto_compactions = FLAGS_disable_auto_compactions;
    options.optimize_filters_for_hits = FLAGS_optimize_filters_for_hits;
    options.paranoid_checks = FLAGS_paranoid_checks;
    options.force_consistency_checks = FLAGS_force_consistency_checks;
    options.check_flush_compaction_key_order =
        FLAGS_check_flush_compaction_key_order;
    options.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds;
    options.ttl = FLAGS_ttl_seconds;
    // fill storage options
    options.advise_random_on_open = FLAGS_advise_random_on_open;
    options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e;
    options.use_adaptive_mutex = FLAGS_use_adaptive_mutex;
    options.bytes_per_sync = FLAGS_bytes_per_sync;
    options.wal_bytes_per_sync = FLAGS_wal_bytes_per_sync;

    // merge operator options
    if (!FLAGS_merge_operator.empty()) {
      s = MergeOperator::CreateFromString(config_options, FLAGS_merge_operator,
                                          &options.merge_operator);
      if (!s.ok()) {
        fprintf(stderr, "invalid merge operator[%s]: %s\n",
                FLAGS_merge_operator.c_str(), s.ToString().c_str());
        exit(1);
      }
    }
    options.max_successive_merges = FLAGS_max_successive_merges;
    options.report_bg_io_stats = FLAGS_report_bg_io_stats;

    // set universal style compaction configurations, if applicable
    if (FLAGS_universal_size_ratio != 0) {
      options.compaction_options_universal.size_ratio =
          FLAGS_universal_size_ratio;
    }
    if (FLAGS_universal_min_merge_width != 0) {
      options.compaction_options_universal.min_merge_width =
          FLAGS_universal_min_merge_width;
    }
    if (FLAGS_universal_max_merge_width != 0) {
      options.compaction_options_universal.max_merge_width =
          FLAGS_universal_max_merge_width;
    }
    if (FLAGS_universal_max_size_amplification_percent != 0) {
      options.compaction_options_universal.max_size_amplification_percent =
          FLAGS_universal_max_size_amplification_percent;
    }
    if (FLAGS_universal_compression_size_percent != -1) {
      options.compaction_options_universal.compression_size_percent =
          FLAGS_universal_compression_size_percent;
    }
    options.compaction_options_universal.allow_trivial_move =
        FLAGS_universal_allow_trivial_move;
    options.compaction_options_universal.incremental =
        FLAGS_universal_incremental;
    if (FLAGS_thread_status_per_interval > 0) {
      options.enable_thread_tracking = true;
    }

    if (FLAGS_user_timestamp_size > 0) {
      if (FLAGS_user_timestamp_size != 8) {
        fprintf(stderr, "Only 64 bits timestamps are supported.\n");
        exit(1);
      }
      options.comparator = test::BytewiseComparatorWithU64TsWrapper();
    }

    options.allow_data_in_errors = FLAGS_allow_data_in_errors;
    options.track_and_verify_wals_in_manifest =
        FLAGS_track_and_verify_wals_in_manifest;

    // Integrated BlobDB
    options.enable_blob_files = FLAGS_enable_blob_files;
    options.min_blob_size = FLAGS_min_blob_size;
    options.blob_file_size = FLAGS_blob_file_size;
    options.blob_compression_type =
        StringToCompressionType(FLAGS_blob_compression_type.c_str());
    options.enable_blob_garbage_collection =
        FLAGS_enable_blob_garbage_collection;
    options.blob_garbage_collection_age_cutoff =
        FLAGS_blob_garbage_collection_age_cutoff;
    options.blob_garbage_collection_force_threshold =
        FLAGS_blob_garbage_collection_force_threshold;
    options.blob_compaction_readahead_size =
        FLAGS_blob_compaction_readahead_size;
    options.blob_file_starting_level = FLAGS_blob_file_starting_level;

#ifndef ROCKSDB_LITE
    if (FLAGS_readonly && FLAGS_transaction_db) {
      fprintf(stderr, "Cannot use readonly flag with transaction_db\n");
      exit(1);
    }
    if (FLAGS_use_secondary_db &&
        (FLAGS_transaction_db || FLAGS_optimistic_transaction_db)) {
      fprintf(stderr, "Cannot use use_secondary_db flag with transaction_db\n");
      exit(1);
    }
#endif  // ROCKSDB_LITE
    options.memtable_protection_bytes_per_key =
        FLAGS_memtable_protection_bytes_per_key;
  }
  void InitializeOptionsGeneral(Options* opts) {
    // Be careful about what is set here to avoid accidentally overwriting
    // settings already configured by OPTIONS file. Only configure settings that
    // are needed for the benchmark to run, settings for shared objects that
    // were not configured already, settings that require dynamically invoking
    // APIs, and settings for the benchmark itself.
    Options& options = *opts;

    // Always set these since they are harmless when not needed and prevent
    // a guaranteed failure when they are needed.
    options.create_missing_column_families = true;
    options.create_if_missing = true;

    if (options.statistics == nullptr) {
      options.statistics = dbstats;
    }

    auto table_options =
        options.table_factory->GetOptions<BlockBasedTableOptions>();
    if (table_options != nullptr) {
      if (FLAGS_cache_size > 0) {
        // This violates this function's rules on when to set options. But we
        // have to do it because the case of unconfigured block cache in OPTIONS
        // file is indistinguishable (it is sanitized to 8MB by this point, not
        // nullptr), and our regression tests assume this will be the shared
        // block cache, even with OPTIONS file provided.
        table_options->block_cache = cache_;
      }
      if (table_options->filter_policy == nullptr) {
        if (FLAGS_bloom_bits < 0) {
          table_options->filter_policy = BlockBasedTableOptions().filter_policy;
        } else if (FLAGS_bloom_bits == 0) {
          table_options->filter_policy.reset();
        } else {
          table_options->filter_policy.reset(
              FLAGS_use_ribbon_filter ? NewRibbonFilterPolicy(FLAGS_bloom_bits)
                                      : NewBloomFilterPolicy(FLAGS_bloom_bits));
        }
      }
    }

    if (options.row_cache == nullptr) {
      if (FLAGS_row_cache_size) {
        if (FLAGS_cache_numshardbits >= 1) {
          options.row_cache =
              NewLRUCache(FLAGS_row_cache_size, FLAGS_cache_numshardbits);
        } else {
          options.row_cache = NewLRUCache(FLAGS_row_cache_size);
        }
      }
    }

    if (options.env == Env::Default()) {
      options.env = FLAGS_env;
    }
    if (FLAGS_enable_io_prio) {
      options.env->LowerThreadPoolIOPriority(Env::LOW);
      options.env->LowerThreadPoolIOPriority(Env::HIGH);
    }
    if (FLAGS_enable_cpu_prio) {
      options.env->LowerThreadPoolCPUPriority(Env::LOW);
      options.env->LowerThreadPoolCPUPriority(Env::HIGH);
    }

    if (FLAGS_sine_write_rate) {
      FLAGS_benchmark_write_rate_limit = static_cast<uint64_t>(SineRate(0));
    }

    if (options.rate_limiter == nullptr) {
      if (FLAGS_rate_limiter_bytes_per_sec > 0) {
        options.rate_limiter.reset(NewGenericRateLimiter(
            FLAGS_rate_limiter_bytes_per_sec,
            FLAGS_rate_limiter_refill_period_us, 10 /* fairness */,
            // TODO: replace this with a more general FLAG for deciding
            // RateLimiter::Mode as now we also rate-limit foreground reads e.g,
            // Get()/MultiGet()
            FLAGS_rate_limit_bg_reads ? RateLimiter::Mode::kReadsOnly
                                      : RateLimiter::Mode::kWritesOnly,
            FLAGS_rate_limiter_auto_tuned));
      }
    }

    options.listeners.emplace_back(listener_);

    if (options.file_checksum_gen_factory == nullptr) {
      if (FLAGS_file_checksum) {
        options.file_checksum_gen_factory.reset(
            new FileChecksumGenCrc32cFactory());
      }
    }

    if (FLAGS_num_multi_db <= 1) {
      OpenDb(options, FLAGS_db, &db_);
    } else {
      multi_dbs_.clear();
      multi_dbs_.resize(FLAGS_num_multi_db);
      auto wal_dir = options.wal_dir;
      for (int i = 0; i < FLAGS_num_multi_db; i++) {
        if (!wal_dir.empty()) {
          options.wal_dir = GetPathForMultiple(wal_dir, i);
        }
        OpenDb(options, GetPathForMultiple(FLAGS_db, i), &multi_dbs_[i]);
      }
      options.wal_dir = wal_dir;
    }

    // KeepFilter is a noop filter, this can be used to test compaction filter
    if (options.compaction_filter == nullptr) {
      if (FLAGS_use_keep_filter) {
        options.compaction_filter = new KeepFilter();
        fprintf(stdout, "A noop compaction filter is used\n");
      }
    }

    if (FLAGS_use_existing_keys) {
      // Only work on single database
      assert(db_.db != nullptr);
      ReadOptions read_opts;  // before read_options_ initialized
      read_opts.total_order_seek = true;
      Iterator* iter = db_.db->NewIterator(read_opts);
      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
        keys_.emplace_back(iter->key().ToString());
      }
      delete iter;
      FLAGS_num = keys_.size();
    }
  }
  void Open(Options* opts) {
    if (!InitializeOptionsFromFile(opts)) {
      InitializeOptionsFromFlags(opts);
    }

    InitializeOptionsGeneral(opts);
  }
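  // Precedence: an OPTIONS file given via --options_file wins over the
  // individual option flags; InitializeOptionsGeneral() then only fills in
  // what the benchmark itself needs (shared caches, env, listeners, the DB
  // open itself) without clobbering file-provided settings.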
  void OpenDb(Options options, const std::string& db_name,
              DBWithColumnFamilies* db) {
    uint64_t open_start = FLAGS_report_open_timing ? FLAGS_env->NowNanos() : 0;
    Status s;
    // Open with column families if necessary.
    if (FLAGS_num_column_families > 1) {
      size_t num_hot = FLAGS_num_column_families;
      if (FLAGS_num_hot_column_families > 0 &&
          FLAGS_num_hot_column_families < FLAGS_num_column_families) {
        num_hot = FLAGS_num_hot_column_families;
      } else {
        FLAGS_num_hot_column_families = FLAGS_num_column_families;
      }
      std::vector<ColumnFamilyDescriptor> column_families;
      for (size_t i = 0; i < num_hot; i++) {
        column_families.push_back(ColumnFamilyDescriptor(
            ColumnFamilyName(i), ColumnFamilyOptions(options)));
      }
      std::vector<int> cfh_idx_to_prob;
      if (!FLAGS_column_family_distribution.empty()) {
        std::stringstream cf_prob_stream(FLAGS_column_family_distribution);
        std::string cf_prob;
        int sum = 0;
        while (std::getline(cf_prob_stream, cf_prob, ',')) {
          cfh_idx_to_prob.push_back(std::stoi(cf_prob));
          sum += cfh_idx_to_prob.back();
        }
        if (sum != 100) {
          fprintf(stderr, "column_family_distribution items must sum to 100\n");
          exit(1);
        }
        if (cfh_idx_to_prob.size() != num_hot) {
          fprintf(stderr,
                  "got %" ROCKSDB_PRIszt
                  " column_family_distribution items; expected "
                  "%" ROCKSDB_PRIszt "\n",
                  cfh_idx_to_prob.size(), num_hot);
          exit(1);
        }
      }
#ifndef ROCKSDB_LITE
      if (FLAGS_readonly) {
        s = DB::OpenForReadOnly(options, db_name, column_families, &db->cfh,
                                &db->db);
      } else if (FLAGS_optimistic_transaction_db) {
        s = OptimisticTransactionDB::Open(options, db_name, column_families,
                                          &db->cfh, &db->opt_txn_db);
        if (s.ok()) {
          db->db = db->opt_txn_db->GetBaseDB();
        }
      } else if (FLAGS_transaction_db) {
        TransactionDB* ptr = nullptr;
        TransactionDBOptions txn_db_options;
        if (options.unordered_write) {
          options.two_write_queues = true;
          txn_db_options.skip_concurrency_control = true;
          txn_db_options.write_policy = WRITE_PREPARED;
        }
        s = TransactionDB::Open(options, txn_db_options, db_name,
                                column_families, &db->cfh, &ptr);
        if (s.ok()) {
          db->db = ptr;
        }
      } else {
        s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
      }
#else
      s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
#endif  // ROCKSDB_LITE
      db->cfh.resize(FLAGS_num_column_families);
      db->num_created = num_hot;
      db->num_hot = num_hot;
      db->cfh_idx_to_prob = std::move(cfh_idx_to_prob);
#ifndef ROCKSDB_LITE
    } else if (FLAGS_readonly) {
      s = DB::OpenForReadOnly(options, db_name, &db->db);
    } else if (FLAGS_optimistic_transaction_db) {
      s = OptimisticTransactionDB::Open(options, db_name, &db->opt_txn_db);
      if (s.ok()) {
        db->db = db->opt_txn_db->GetBaseDB();
      }
    } else if (FLAGS_transaction_db) {
      TransactionDB* ptr = nullptr;
      TransactionDBOptions txn_db_options;
      if (options.unordered_write) {
        options.two_write_queues = true;
        txn_db_options.skip_concurrency_control = true;
        txn_db_options.write_policy = WRITE_PREPARED;
      }
      s = CreateLoggerFromOptions(db_name, options, &options.info_log);
      if (s.ok()) {
        s = TransactionDB::Open(options, txn_db_options, db_name, &ptr);
      }
      if (s.ok()) {
        db->db = ptr;
      }
    } else if (FLAGS_use_blob_db) {
      // Stacked BlobDB
      blob_db::BlobDBOptions blob_db_options;
      blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
      blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff;
      blob_db_options.is_fifo = FLAGS_blob_db_is_fifo;
      blob_db_options.max_db_size = FLAGS_blob_db_max_db_size;
      blob_db_options.ttl_range_secs = FLAGS_blob_db_ttl_range_secs;
      blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size;
      blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
      blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
      blob_db_options.compression = FLAGS_blob_db_compression_type_e;
      blob_db::BlobDB* ptr = nullptr;
      s = blob_db::BlobDB::Open(options, blob_db_options, db_name, &ptr);
      if (s.ok()) {
        db->db = ptr;
      }
    } else if (FLAGS_use_secondary_db) {
      if (FLAGS_secondary_path.empty()) {
        std::string default_secondary_path;
        FLAGS_env->GetTestDirectory(&default_secondary_path);
        default_secondary_path += "/dbbench_secondary";
        FLAGS_secondary_path = default_secondary_path;
      }
      s = DB::OpenAsSecondary(options, db_name, FLAGS_secondary_path, &db->db);
      if (s.ok() && FLAGS_secondary_update_interval > 0) {
        secondary_update_thread_.reset(new port::Thread(
            [this](int interval, DBWithColumnFamilies* _db) {
              while (0 == secondary_update_stopped_.load(
                              std::memory_order_relaxed)) {
                Status secondary_update_status =
                    _db->db->TryCatchUpWithPrimary();
                if (!secondary_update_status.ok()) {
                  fprintf(stderr, "Failed to catch up with primary: %s\n",
                          secondary_update_status.ToString().c_str());
                  break;
                }
                ++secondary_db_updates_;
                FLAGS_env->SleepForMicroseconds(interval * 1000000);
              }
            },
            FLAGS_secondary_update_interval, db));
      }
#endif  // ROCKSDB_LITE
    } else {
      s = DB::Open(options, db_name, &db->db);
    }
    if (FLAGS_report_open_timing) {
      std::cout << "OpenDb: "
                << (FLAGS_env->NowNanos() - open_start) / 1000000.0
                << " milliseconds\n";
    }
    if (!s.ok()) {
      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
      exit(1);
    }
  }
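  // With --use_secondary_db and --secondary_update_interval=N, the thread
  // created above calls TryCatchUpWithPrimary() every N seconds until Run()
  // signals secondary_update_stopped_; secondary_db_updates_ counts the
  // successful catch-ups reported at the end of Run().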
  enum WriteMode { RANDOM, SEQUENTIAL, UNIQUE_RANDOM };

  void WriteSeqDeterministic(ThreadState* thread) {
    DoDeterministicCompact(thread, open_options_.compaction_style, SEQUENTIAL);
  }

  void WriteUniqueRandomDeterministic(ThreadState* thread) {
    DoDeterministicCompact(thread, open_options_.compaction_style,
                           UNIQUE_RANDOM);
  }

  void WriteSeq(ThreadState* thread) { DoWrite(thread, SEQUENTIAL); }

  void WriteRandom(ThreadState* thread) { DoWrite(thread, RANDOM); }

  void WriteUniqueRandom(ThreadState* thread) {
    DoWrite(thread, UNIQUE_RANDOM);
  }
  class KeyGenerator {
   public:
    KeyGenerator(Random64* rand, WriteMode mode, uint64_t num,
                 uint64_t /*num_per_set*/ = 64 * 1024)
        : rand_(rand), mode_(mode), num_(num), next_(0) {
      if (mode_ == UNIQUE_RANDOM) {
        // NOTE: if memory consumption of this approach becomes a concern,
        // we can either break it into pieces and only random shuffle a section
        // each time. Alternatively, use a bit map implementation
        // (https://reviews.facebook.net/differential/diff/54627/)
        values_.resize(num_);
        for (uint64_t i = 0; i < num_; ++i) {
          values_[i] = i;
        }
        RandomShuffle(values_.begin(), values_.end(),
                      static_cast<uint32_t>(seed_base));
      }
    }

    uint64_t Next() {
      switch (mode_) {
        case SEQUENTIAL:
          return next_++;
        case RANDOM:
          return rand_->Next() % num_;
        case UNIQUE_RANDOM:
          assert(next_ < num_);
          return values_[next_++];
      }
      assert(false);
      return std::numeric_limits<uint64_t>::max();
    }

    // Only available for UNIQUE_RANDOM mode.
    uint64_t Fetch(uint64_t index) {
      assert(mode_ == UNIQUE_RANDOM);
      assert(index < values_.size());
      return values_[index];
    }

   private:
    Random64* rand_;
    WriteMode mode_;
    const uint64_t num_;
    uint64_t next_;
    std::vector<uint64_t> values_;
  };
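  // Memory note: UNIQUE_RANDOM materializes the whole permutation as
  // uint64_t values (8 bytes per key), so e.g. --num=100000000 costs roughly
  // 800 MB for values_ alone; that is what the NOTE in the constructor is
  // about.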
  DB* SelectDB(ThreadState* thread) { return SelectDBWithCfh(thread)->db; }

  DBWithColumnFamilies* SelectDBWithCfh(ThreadState* thread) {
    return SelectDBWithCfh(thread->rand.Next());
  }

  DBWithColumnFamilies* SelectDBWithCfh(uint64_t rand_int) {
    if (db_.db != nullptr) {
      return &db_;
    } else {
      return &multi_dbs_[rand_int % multi_dbs_.size()];
    }
  }

  double SineRate(double x) {
    return FLAGS_sine_a * sin((FLAGS_sine_b * x) + FLAGS_sine_c) + FLAGS_sine_d;
  }
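  // SineRate(x) = sine_a * sin(sine_b * x + sine_c) + sine_d, i.e. the
  // sine-driven write-rate limit oscillates between sine_d - sine_a and
  // sine_d + sine_a; with --sine_a=0 it degenerates to the constant rate
  // sine_d.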
  void DoWrite(ThreadState* thread, WriteMode write_mode) {
    const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0;
    const int64_t num_ops = writes_ == 0 ? num_ : writes_;

    size_t num_key_gens = 1;
    if (db_.db == nullptr) {
      num_key_gens = multi_dbs_.size();
    }
    std::vector<std::unique_ptr<KeyGenerator>> key_gens(num_key_gens);
    int64_t max_ops = num_ops * num_key_gens;
    int64_t ops_per_stage = max_ops;
    if (FLAGS_num_column_families > 1 && FLAGS_num_hot_column_families > 0) {
      ops_per_stage = (max_ops - 1) / (FLAGS_num_column_families /
                                       FLAGS_num_hot_column_families) +
                      1;
    }

    Duration duration(test_duration, max_ops, ops_per_stage);
    const uint64_t num_per_key_gen = num_ + max_num_range_tombstones_;
    for (size_t i = 0; i < num_key_gens; i++) {
      key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode,
                                         num_per_key_gen, ops_per_stage));
    }

    if (num_ != FLAGS_num) {
      char msg[100];
      snprintf(msg, sizeof(msg), "(%" PRIu64 " ops)", num_);
      thread->stats.AddMessage(msg);
    }

    RandomGenerator gen;
    WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
                     FLAGS_write_batch_protection_bytes_per_key,
                     user_timestamp_size_);
    Status s;
    int64_t bytes = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<const char[]> begin_key_guard;
    Slice begin_key = AllocateKey(&begin_key_guard);
    std::unique_ptr<const char[]> end_key_guard;
    Slice end_key = AllocateKey(&end_key_guard);
    double p = 0.0;
    uint64_t num_overwrites = 0, num_unique_keys = 0, num_selective_deletes = 0;
    // If the user set the overwrite_probability flag,
    // check that the value is in [0.0, 1.0].
    if (FLAGS_overwrite_probability > 0.0) {
      p = FLAGS_overwrite_probability > 1.0 ? 1.0 : FLAGS_overwrite_probability;
      // If overwrite is set by the user and UNIQUE_RANDOM mode is on,
      // the overwrite_window_size must be > 0.
      if (write_mode == UNIQUE_RANDOM && FLAGS_overwrite_window_size == 0) {
        fprintf(stderr,
                "Overwrite_window_size must be strictly greater than 0.\n");
        ErrorExit();
      }
    }

    // Default_random_engine provides slightly
    // improved throughput over mt19937.
    std::default_random_engine overwrite_gen{
        static_cast<unsigned int>(seed_base)};
    std::bernoulli_distribution overwrite_decider(p);

    // Inserted key window is filled with the last N
    // keys previously inserted into the DB (with
    // N=FLAGS_overwrite_window_size).
    // We use a deque struct because:
    // - random access is O(1)
    // - insertion/removal at beginning/end is also O(1).
    std::deque<int64_t> inserted_key_window;
    Random64 reservoir_id_gen(seed_base);

    // --- Variables used in disposable/persistent keys simulation:
    // The following variables are used when
    // disposable_entries_batch_size is >0. We simulate a workload
    // where the following sequence is repeated multiple times:
    // "A set of keys S1 is inserted ('disposable entries'), then after
    // some delay another set of keys S2 is inserted ('persistent entries')
    // and the first set of keys S1 is deleted. S2 artificially represents
    // the insertion of hypothetical results from some undefined computation
    // done on the first set of keys S1. The next sequence can start as soon
    // as the last disposable entry in the set S1 of this sequence is
    // inserted, if the delay is non negligible."
    bool skip_for_loop = false, is_disposable_entry = true;
    std::vector<uint64_t> disposable_entries_index(num_key_gens, 0);
    std::vector<uint64_t> persistent_ent_and_del_index(num_key_gens, 0);
    const uint64_t kNumDispAndPersEntries =
        FLAGS_disposable_entries_batch_size +
        FLAGS_persistent_entries_batch_size;
    if (kNumDispAndPersEntries > 0) {
      if ((write_mode != UNIQUE_RANDOM) || (writes_per_range_tombstone_ > 0) ||
          (p > 0.0)) {
        fprintf(
            stderr,
            "Disposable/persistent deletes are not compatible with overwrites "
            "and DeleteRanges; and are only supported in filluniquerandom.\n");
        ErrorExit();
      }
      if (FLAGS_disposable_entries_value_size < 0 ||
          FLAGS_persistent_entries_value_size < 0) {
        fprintf(
            stderr,
            "disposable_entries_value_size and persistent_entries_value_size "
            "have to be positive.\n");
        ErrorExit();
      }
    }
    Random rnd_disposable_entry(static_cast<uint32_t>(seed_base));
    std::string random_value;
    // Queue that stores the scheduled timestamps of disposable entry deletes,
    // along with the starting index of the disposable entry keys to delete.
    std::vector<std::queue<std::pair<uint64_t, uint64_t>>> disposable_entries_q(
        num_key_gens);
    // --- End of variables used in disposable/persistent keys simulation.

    std::vector<std::unique_ptr<const char[]>> expanded_key_guards;
    std::vector<Slice> expanded_keys;
    if (FLAGS_expand_range_tombstones) {
      expanded_key_guards.resize(range_tombstone_width_);
      for (auto& expanded_key_guard : expanded_key_guards) {
        expanded_keys.emplace_back(AllocateKey(&expanded_key_guard));
      }
    }

    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

    int64_t stage = 0;
    int64_t num_written = 0;
    int64_t next_seq_db_at = num_ops;
    size_t id = 0;
    int64_t num_range_deletions = 0;

    while ((num_per_key_gen != 0) && !duration.Done(entries_per_batch_)) {
      if (duration.GetStage() != stage) {
        stage = duration.GetStage();
        if (db_.db != nullptr) {
          db_.CreateNewCf(open_options_, stage);
        } else {
          for (auto& db : multi_dbs_) {
            db.CreateNewCf(open_options_, stage);
          }
        }
      }

      if (write_mode != SEQUENTIAL) {
        id = thread->rand.Next() % num_key_gens;
      } else {
        // When doing a sequential load with multiple databases, load them in
        // order rather than all at the same time to avoid:
        // 1) long delays between flushing memtables
        // 2) flushing memtables for all of them at the same point in time
        // 3) not putting the same number of keys in each database
        if (num_written >= next_seq_db_at) {
          next_seq_db_at += num_ops;
          id++;
          if (id >= num_key_gens) {
            fprintf(stderr, "Logic error. Filled all databases\n");
            ErrorExit();
          }
        }
      }
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(id);

      batch.Clear();
      int64_t batch_bytes = 0;

      for (int64_t j = 0; j < entries_per_batch_; j++) {
        int64_t rand_num = 0;
        if ((write_mode == UNIQUE_RANDOM) && (p > 0.0)) {
          if ((inserted_key_window.size() > 0) &&
              overwrite_decider(overwrite_gen)) {
            num_overwrites++;
            rand_num = inserted_key_window[reservoir_id_gen.Next() %
                                           inserted_key_window.size()];
          } else {
            num_unique_keys++;
            rand_num = key_gens[id]->Next();
            if (inserted_key_window.size() < FLAGS_overwrite_window_size) {
              inserted_key_window.push_back(rand_num);
            } else {
              inserted_key_window.pop_front();
              inserted_key_window.push_back(rand_num);
            }
          }
        } else if (kNumDispAndPersEntries > 0) {
          // Check if the queue is non-empty and if we need to insert
          // 'persistent' KV entries (KV entries that are never deleted)
          // and delete disposable entries previously inserted.
          if (!disposable_entries_q[id].empty() &&
              (disposable_entries_q[id].front().first <
               FLAGS_env->NowMicros())) {
            // If we need to perform a "merge op" pattern,
            // we first write all the persistent KV entries not targeted
            // by deletes, and then we write the disposable entries deletes.
            if (persistent_ent_and_del_index[id] <
                FLAGS_persistent_entries_batch_size) {
              // Generate key to insert.
              rand_num =
                  key_gens[id]->Fetch(disposable_entries_q[id].front().second +
                                      FLAGS_disposable_entries_batch_size +
                                      persistent_ent_and_del_index[id]);
              persistent_ent_and_del_index[id]++;
              is_disposable_entry = false;
              skip_for_loop = false;
            } else if (persistent_ent_and_del_index[id] <
                       kNumDispAndPersEntries) {
              // Find the key of the entry to delete.
              rand_num =
                  key_gens[id]->Fetch(disposable_entries_q[id].front().second +
                                      (persistent_ent_and_del_index[id] -
                                       FLAGS_persistent_entries_batch_size));
              persistent_ent_and_del_index[id]++;
              GenerateKeyFromInt(rand_num, FLAGS_num, &key);
              // For the delete operation, everything happens here and we
              // skip the rest of the for-loop, which is designed for
              // inserts.
              if (FLAGS_num_column_families <= 1) {
                batch.Delete(key);
              } else {
                // We use same rand_num as seed for key and column family so
                // that we can deterministically find the cfh corresponding to
                // a particular key while reading the key.
                batch.Delete(db_with_cfh->GetCfh(rand_num), key);
              }
              // A delete only includes Key+Timestamp (no value).
              batch_bytes += key_size_ + user_timestamp_size_;
              bytes += key_size_ + user_timestamp_size_;
              num_selective_deletes++;
              // Skip rest of the for-loop (j=0, j<entries_per_batch_,j++).
              skip_for_loop = true;
            } else {
              assert(false);  // should never reach this point.
            }
            // If disposable_entries_q needs to be updated (ie: when a
            // selective insert+delete was successfully completed), pop the
            // job out of the queue.
            if (!disposable_entries_q[id].empty() &&
                (disposable_entries_q[id].front().first <
                 FLAGS_env->NowMicros()) &&
                persistent_ent_and_del_index[id] == kNumDispAndPersEntries) {
              disposable_entries_q[id].pop();
              persistent_ent_and_del_index[id] = 0;
            }

            // If we are deleting disposable entries, skip the rest of the
            // for-loop since there are no key-value inserts at this moment
            // in time.
            if (skip_for_loop) {
              continue;
            }
          } else {
            // If no job is in the queue, then we keep inserting disposable KV
            // entries that will be deleted later by a series of deletes.
            rand_num = key_gens[id]->Fetch(disposable_entries_index[id]);
            disposable_entries_index[id]++;
            is_disposable_entry = true;
            if ((disposable_entries_index[id] %
                 FLAGS_disposable_entries_batch_size) == 0) {
              // Skip the persistent KV entries inserts for now
              disposable_entries_index[id] +=
                  FLAGS_persistent_entries_batch_size;
            }
          }
        } else {
          rand_num = key_gens[id]->Next();
        }
        GenerateKeyFromInt(rand_num, FLAGS_num, &key);
        Slice val;
        if (kNumDispAndPersEntries > 0) {
          random_value = rnd_disposable_entry.RandomString(
              is_disposable_entry ? FLAGS_disposable_entries_value_size
                                  : FLAGS_persistent_entries_value_size);
          val = Slice(random_value);
          num_unique_keys++;
        } else {
          val = gen.Generate();
        }
        if (use_blob_db_) {
#ifndef ROCKSDB_LITE
          // Stacked BlobDB
          blob_db::BlobDB* blobdb =
              static_cast<blob_db::BlobDB*>(db_with_cfh->db);
          if (FLAGS_blob_db_max_ttl_range > 0) {
            int ttl = rand() % FLAGS_blob_db_max_ttl_range;
            s = blobdb->PutWithTTL(write_options_, key, val, ttl);
          } else {
            s = blobdb->Put(write_options_, key, val);
          }
#endif  // ROCKSDB_LITE
        } else if (FLAGS_num_column_families <= 1) {
          batch.Put(key, val);
        } else {
          // We use same rand_num as seed for key and column family so that we
          // can deterministically find the cfh corresponding to a particular
          // key while reading the key.
          batch.Put(db_with_cfh->GetCfh(rand_num), key, val);
        }
        batch_bytes += val.size() + key_size_ + user_timestamp_size_;
        bytes += val.size() + key_size_ + user_timestamp_size_;
        num_written++;

        // If all disposable entries have been inserted, then we need to
        // add in the job queue a call for 'persistent entry insertions +
        // disposable entry deletions'.
        if (kNumDispAndPersEntries > 0 && is_disposable_entry &&
            ((disposable_entries_index[id] % kNumDispAndPersEntries) == 0)) {
          // Queue contains [timestamp, starting_idx],
          // timestamp = current_time + delay (minimum absolute time when to
          // start inserting the selective deletes) starting_idx = index in
          // the keygen of the rand_num to generate the key of the first KV
          // entry to delete (= key of the first selective delete).
          disposable_entries_q[id].push(std::make_pair(
              FLAGS_env->NowMicros() +
                  FLAGS_disposable_entries_delete_delay /* timestamp */,
              disposable_entries_index[id] - kNumDispAndPersEntries
              /*starting idx*/));
        }
        if (writes_per_range_tombstone_ > 0 &&
            num_written > writes_before_delete_range_ &&
            (num_written - writes_before_delete_range_) /
                    writes_per_range_tombstone_ <=
                max_num_range_tombstones_ &&
            (num_written - writes_before_delete_range_) %
                    writes_per_range_tombstone_ ==
                0) {
          num_range_deletions++;
          int64_t begin_num = key_gens[id]->Next();
          if (FLAGS_expand_range_tombstones) {
            for (int64_t offset = 0; offset < range_tombstone_width_;
                 ++offset) {
              GenerateKeyFromInt(begin_num + offset, FLAGS_num,
                                 &expanded_keys[offset]);
              if (use_blob_db_) {
#ifndef ROCKSDB_LITE
                // Stacked BlobDB
                s = db_with_cfh->db->Delete(write_options_,
                                            expanded_keys[offset]);
#endif  // ROCKSDB_LITE
              } else if (FLAGS_num_column_families <= 1) {
                batch.Delete(expanded_keys[offset]);
              } else {
                batch.Delete(db_with_cfh->GetCfh(rand_num),
                             expanded_keys[offset]);
              }
            }
          } else {
            GenerateKeyFromInt(begin_num, FLAGS_num, &begin_key);
            GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num,
                               &end_key);
            if (use_blob_db_) {
#ifndef ROCKSDB_LITE
              // Stacked BlobDB
              s = db_with_cfh->db->DeleteRange(
                  write_options_, db_with_cfh->db->DefaultColumnFamily(),
                  begin_key, end_key);
#endif  // ROCKSDB_LITE
            } else if (FLAGS_num_column_families <= 1) {
              batch.DeleteRange(begin_key, end_key);
            } else {
              batch.DeleteRange(db_with_cfh->GetCfh(rand_num), begin_key,
                                end_key);
            }
          }
        }
      }
      if (thread->shared->write_rate_limiter.get() != nullptr) {
        thread->shared->write_rate_limiter->Request(
            batch_bytes, Env::IO_HIGH, nullptr /* stats */,
            RateLimiter::OpType::kWrite);
        // Set time at which last op finished to Now() to hide latency and
        // sleep from rate limiter. Also, do the check once per batch, not
        // once per write.
        thread->stats.ResetLastOpTime();
      }
      if (user_timestamp_size_ > 0) {
        Slice user_ts = mock_app_clock_->Allocate(ts_guard.get());
        s = batch.UpdateTimestamps(
            user_ts, [this](uint32_t) { return user_timestamp_size_; });
        if (!s.ok()) {
          fprintf(stderr, "assign timestamp to write batch: %s\n",
                  s.ToString().c_str());
          ErrorExit();
        }
      }
      if (!use_blob_db_) {
        // Not stacked BlobDB
        s = db_with_cfh->db->Write(write_options_, &batch);
      }
      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db,
                                entries_per_batch_, kWrite);
      if (FLAGS_sine_write_rate) {
        uint64_t now = FLAGS_env->NowMicros();

        uint64_t usecs_since_last;
        if (now > thread->stats.GetSineInterval()) {
          usecs_since_last = now - thread->stats.GetSineInterval();
        } else {
          usecs_since_last = 0;
        }

        if (usecs_since_last >
            (FLAGS_sine_write_rate_interval_milliseconds * uint64_t{1000})) {
          double usecs_since_start =
              static_cast<double>(now - thread->stats.GetStart());
          thread->stats.ResetSineInterval();
          uint64_t write_rate =
              static_cast<uint64_t>(SineRate(usecs_since_start / 1000000.0));
          thread->shared->write_rate_limiter.reset(
              NewGenericRateLimiter(write_rate));
        }
      }
      if (!s.ok()) {
        s = listener_->WaitForRecovery(600000000) ? Status::OK() : s;
      }
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        ErrorExit();
      }
    }
    if ((write_mode == UNIQUE_RANDOM) && (p > 0.0)) {
      fprintf(stdout,
              "Number of unique keys inserted: %" PRIu64
              ".\nNumber of overwrites: %" PRIu64 "\n",
              num_unique_keys, num_overwrites);
    } else if (kNumDispAndPersEntries > 0) {
      fprintf(stdout,
              "Number of unique keys inserted (disposable+persistent): %" PRIu64
              ".\nNumber of 'disposable entry delete': %" PRIu64 "\n",
              num_written, num_selective_deletes);
    }
    if (num_range_deletions > 0) {
      std::cout << "Number of range deletions: " << num_range_deletions
                << std::endl;
    }
    thread->stats.AddBytes(bytes);
  }
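  // Timeline sketch of the disposable/persistent simulation above, assuming
  // (for illustration) disposable_entries_batch_size=3,
  // persistent_entries_batch_size=2 and a delete delay of D microseconds:
  //   t0:         insert disposable keys k0,k1,k2; enqueue a job for t0+D
  //   >= t0+D:    insert persistent keys k3,k4, then delete k0,k1,k2
  //   afterwards: the next sequence starts at the following key-gen indices.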
  Status DoDeterministicCompact(ThreadState* thread,
                                CompactionStyle compaction_style,
                                WriteMode write_mode) {
#ifndef ROCKSDB_LITE
    ColumnFamilyMetaData meta;
    std::vector<DB*> db_list;
    if (db_.db != nullptr) {
      db_list.push_back(db_.db);
    } else {
      for (auto& db : multi_dbs_) {
        db_list.push_back(db.db);
      }
    }
    std::vector<Options> options_list;
    for (auto db : db_list) {
      options_list.push_back(db->GetOptions());
      if (compaction_style != kCompactionStyleFIFO) {
        db->SetOptions({{"disable_auto_compactions", "1"},
                        {"level0_slowdown_writes_trigger", "400000000"},
                        {"level0_stop_writes_trigger", "400000000"}});
      } else {
        db->SetOptions({{"disable_auto_compactions", "1"}});
      }
    }

    assert(!db_list.empty());
    auto num_db = db_list.size();
    size_t num_levels = static_cast<size_t>(open_options_.num_levels);
    size_t output_level = open_options_.num_levels - 1;
    std::vector<std::vector<std::vector<SstFileMetaData>>> sorted_runs(num_db);
    std::vector<size_t> num_files_at_level0(num_db, 0);
    if (compaction_style == kCompactionStyleLevel) {
      if (num_levels == 0) {
        return Status::InvalidArgument("num_levels should be larger than 1");
      }
      bool should_stop = false;
      while (!should_stop) {
        if (sorted_runs[0].empty()) {
          DoWrite(thread, write_mode);
        } else {
          DoWrite(thread, UNIQUE_RANDOM);
        }
        for (size_t i = 0; i < num_db; i++) {
          auto db = db_list[i];
          db->Flush(FlushOptions());
          db->GetColumnFamilyMetaData(&meta);
          if (num_files_at_level0[i] == meta.levels[0].files.size() ||
              writes_ == 0) {
            should_stop = true;
            continue;
          }
          sorted_runs[i].emplace_back(
              meta.levels[0].files.begin(),
              meta.levels[0].files.end() - num_files_at_level0[i]);
          num_files_at_level0[i] = meta.levels[0].files.size();
          if (sorted_runs[i].back().size() == 1) {
            should_stop = true;
            continue;
          }
          if (sorted_runs[i].size() == output_level) {
            auto& L1 = sorted_runs[i].back();
            L1.erase(L1.begin(), L1.begin() + L1.size() / 3);
            should_stop = true;
            continue;
          }
        }
        writes_ /=
            static_cast<int64_t>(open_options_.max_bytes_for_level_multiplier);
      }
      for (size_t i = 0; i < num_db; i++) {
        if (sorted_runs[i].size() < num_levels - 1) {
          fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
                  num_levels);
          exit(1);
        }
      }
      for (size_t i = 0; i < num_db; i++) {
        auto db = db_list[i];
        auto compactionOptions = CompactionOptions();
        compactionOptions.compression = FLAGS_compression_type_e;
        auto options = db->GetOptions();
        MutableCFOptions mutable_cf_options(options);
        for (size_t j = 0; j < sorted_runs[i].size(); j++) {
          compactionOptions.output_file_size_limit = MaxFileSizeForLevel(
              mutable_cf_options, static_cast<int>(output_level),
              compaction_style);
          std::cout << sorted_runs[i][j].size() << std::endl;
          db->CompactFiles(
              compactionOptions,
              {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name},
              static_cast<int>(output_level - j) /*level*/);
        }
      }
    } else if (compaction_style == kCompactionStyleUniversal) {
      auto ratio = open_options_.compaction_options_universal.size_ratio;
      bool should_stop = false;
      while (!should_stop) {
        if (sorted_runs[0].empty()) {
          DoWrite(thread, write_mode);
        } else {
          DoWrite(thread, UNIQUE_RANDOM);
        }
        for (size_t i = 0; i < num_db; i++) {
          auto db = db_list[i];
          db->Flush(FlushOptions());
          db->GetColumnFamilyMetaData(&meta);
          if (num_files_at_level0[i] == meta.levels[0].files.size() ||
              writes_ == 0) {
            should_stop = true;
            continue;
          }
          sorted_runs[i].emplace_back(
              meta.levels[0].files.begin(),
              meta.levels[0].files.end() - num_files_at_level0[i]);
          num_files_at_level0[i] = meta.levels[0].files.size();
          if (sorted_runs[i].back().size() == 1) {
            should_stop = true;
            continue;
          }
          num_files_at_level0[i] = meta.levels[0].files.size();
        }
        writes_ = static_cast<int64_t>(writes_ * static_cast<double>(100) /
                                       (ratio + 200));
      }
      for (size_t i = 0; i < num_db; i++) {
        if (sorted_runs[i].size() < num_levels) {
          fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
                  num_levels);
          exit(1);
        }
      }
      for (size_t i = 0; i < num_db; i++) {
        auto db = db_list[i];
        auto compactionOptions = CompactionOptions();
        compactionOptions.compression = FLAGS_compression_type_e;
        auto options = db->GetOptions();
        MutableCFOptions mutable_cf_options(options);
        for (size_t j = 0; j < sorted_runs[i].size(); j++) {
          compactionOptions.output_file_size_limit = MaxFileSizeForLevel(
              mutable_cf_options, static_cast<int>(output_level),
              compaction_style);
          db->CompactFiles(
              compactionOptions,
              {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name},
              (output_level > j ? static_cast<int>(output_level - j)
                                : 0) /*level*/);
        }
      }
    } else if (compaction_style == kCompactionStyleFIFO) {
      if (num_levels != 1) {
        return Status::InvalidArgument(
            "num_levels should be 1 for FIFO compaction");
      }
      if (FLAGS_num_multi_db != 0) {
        return Status::InvalidArgument("Doesn't support multiDB");
      }
      auto db = db_list[0];
      std::vector<std::string> file_names;
      while (true) {
        if (sorted_runs[0].empty()) {
          DoWrite(thread, write_mode);
        } else {
          DoWrite(thread, UNIQUE_RANDOM);
        }
        db->Flush(FlushOptions());
        db->GetColumnFamilyMetaData(&meta);
        auto total_size = meta.levels[0].size;
        if (total_size >=
            db->GetOptions().compaction_options_fifo.max_table_files_size) {
          for (auto file_meta : meta.levels[0].files) {
            file_names.emplace_back(file_meta.name);
          }
          break;
        }
      }
      // TODO(shuzhang1989): Investigate why CompactFiles not working
      // auto compactionOptions = CompactionOptions();
      // db->CompactFiles(compactionOptions, file_names, 0);
      auto compactionOptions = CompactRangeOptions();
      db->CompactRange(compactionOptions, nullptr, nullptr);
    } else {
      fprintf(stdout,
              "%-12s : skipped (-compaction_style=kCompactionStyleNone)\n",
              "filldeterministic");
      return Status::InvalidArgument("None compaction is not supported");
    }

    // Verify seqno and key range
    // Note: the seqno get changed at the max level by implementation
    // optimization, so skip the check of the max level.
    int level;
    for (size_t k = 0; k < num_db; k++) {
      auto db = db_list[k];
      db->GetColumnFamilyMetaData(&meta);
      // verify the number of sorted runs
      if (compaction_style == kCompactionStyleLevel) {
        assert(num_levels - 1 == sorted_runs[k].size());
      } else if (compaction_style == kCompactionStyleUniversal) {
        assert(meta.levels[0].files.size() + num_levels - 1 ==
               sorted_runs[k].size());
      } else if (compaction_style == kCompactionStyleFIFO) {
        // TODO(gzh): FIFO compaction
        db->GetColumnFamilyMetaData(&meta);
        auto total_size = meta.levels[0].size;
        assert(total_size <=
               db->GetOptions().compaction_options_fifo.max_table_files_size);
        break;
      }

      // verify smallest/largest seqno and key range of each sorted run
      auto max_level = num_levels - 1;
      for (size_t i = 0; i < sorted_runs[k].size(); i++) {
        level = static_cast<int>(max_level - i);
        SequenceNumber sorted_run_smallest_seqno = kMaxSequenceNumber;
        SequenceNumber sorted_run_largest_seqno = 0;
        std::string sorted_run_smallest_key, sorted_run_largest_key;
        bool first_key = true;
        for (auto fileMeta : sorted_runs[k][i]) {
          sorted_run_smallest_seqno =
              std::min(sorted_run_smallest_seqno, fileMeta.smallest_seqno);
          sorted_run_largest_seqno =
              std::max(sorted_run_largest_seqno, fileMeta.largest_seqno);
          if (first_key ||
              db->DefaultColumnFamily()->GetComparator()->Compare(
                  fileMeta.smallestkey, sorted_run_smallest_key) < 0) {
            sorted_run_smallest_key = fileMeta.smallestkey;
          }
          if (first_key ||
              db->DefaultColumnFamily()->GetComparator()->Compare(
                  fileMeta.largestkey, sorted_run_largest_key) > 0) {
            sorted_run_largest_key = fileMeta.largestkey;
          }
          first_key = false;
        }
        if (compaction_style == kCompactionStyleLevel ||
            (compaction_style == kCompactionStyleUniversal && level > 0)) {
          SequenceNumber level_smallest_seqno = kMaxSequenceNumber;
          SequenceNumber level_largest_seqno = 0;
          for (auto fileMeta : meta.levels[level].files) {
            level_smallest_seqno =
                std::min(level_smallest_seqno, fileMeta.smallest_seqno);
            level_largest_seqno =
                std::max(level_largest_seqno, fileMeta.largest_seqno);
          }
          assert(sorted_run_smallest_key ==
                 meta.levels[level].files.front().smallestkey);
          assert(sorted_run_largest_key ==
                 meta.levels[level].files.back().largestkey);
          if (level != static_cast<int>(max_level)) {
            // compaction at max_level would change sequence number
            assert(sorted_run_smallest_seqno == level_smallest_seqno);
            assert(sorted_run_largest_seqno == level_largest_seqno);
          }
        } else if (compaction_style == kCompactionStyleUniversal) {
          // level <= 0 means sorted runs on level 0
          auto level0_file =
              meta.levels[0].files[sorted_runs[k].size() - 1 - i];
          assert(sorted_run_smallest_key == level0_file.smallestkey);
          assert(sorted_run_largest_key == level0_file.largestkey);
          if (level != static_cast<int>(max_level)) {
            assert(sorted_run_smallest_seqno == level0_file.smallest_seqno);
            assert(sorted_run_largest_seqno == level0_file.largest_seqno);
          }
        }
      }
    }
    // print the size of each sorted_run
    for (size_t k = 0; k < num_db; k++) {
      auto db = db_list[k];
      fprintf(stdout,
              "---------------------- DB %" ROCKSDB_PRIszt
              " LSM ---------------------\n",
              k);
      db->GetColumnFamilyMetaData(&meta);
      for (auto& levelMeta : meta.levels) {
        if (levelMeta.files.empty()) {
          continue;
        }
        if (levelMeta.level == 0) {
          for (auto& fileMeta : levelMeta.files) {
            fprintf(stdout, "Level[%d]: %s(size: %" PRIi64 " bytes)\n",
                    levelMeta.level, fileMeta.name.c_str(), fileMeta.size);
          }
        } else {
          fprintf(stdout, "Level[%d]: %s - %s(total size: %" PRIi64 " bytes)\n",
                  levelMeta.level, levelMeta.files.front().name.c_str(),
                  levelMeta.files.back().name.c_str(), levelMeta.size);
        }
      }
    }
    // reset auto compaction
    for (size_t i = 0; i < num_db; i++) {
      db_list[i]->SetOptions(
          {{"disable_auto_compactions",
            std::to_string(options_list[i].disable_auto_compactions)},
           {"level0_slowdown_writes_trigger",
            std::to_string(options_list[i].level0_slowdown_writes_trigger)},
           {"level0_stop_writes_trigger",
            std::to_string(options_list[i].level0_stop_writes_trigger)}});
    }

    return Status::OK();
#else
    (void)thread;
    (void)compaction_style;
    (void)write_mode;
    fprintf(stderr, "Rocksdb Lite doesn't support filldeterministic\n");
    return Status::NotSupported(
        "Rocksdb Lite doesn't support filldeterministic");
#endif  // ROCKSDB_LITE
  }
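  // Net effect (illustrative): with level-style compaction and num_levels=N,
  // each DoWrite() pass becomes one sorted run, writes_ shrinks by
  // max_bytes_for_level_multiplier per round, and the runs are manually
  // compacted into L(N-1)..L1, so the tree ends up with one sorted run per
  // non-L0 level, which the verification block above then asserts.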
  void ReadSequential(ThreadState* thread) {
    if (db_.db != nullptr) {
      ReadSequential(thread, db_.db);
    } else {
      for (const auto& db_with_cfh : multi_dbs_) {
        ReadSequential(thread, db_with_cfh.db);
      }
    }
  }

  void ReadSequential(ThreadState* thread, DB* db) {
    ReadOptions options = read_options_;
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
      options.timestamp = &ts;
    }

    options.adaptive_readahead = FLAGS_adaptive_readahead;
    options.async_io = FLAGS_async_io;

    Iterator* iter = db->NewIterator(options);
    int64_t i = 0;
    int64_t bytes = 0;
    for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
      bytes += iter->key().size() + iter->value().size();
      thread->stats.FinishedOps(nullptr, db, 1, kRead);
      ++i;

      if (thread->shared->read_rate_limiter.get() != nullptr &&
          i % 1024 == 1023) {
        thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
                                                   nullptr /* stats */,
                                                   RateLimiter::OpType::kRead);
      }
    }

    delete iter;
    thread->stats.AddBytes(bytes);
  }
  void ReadToRowCache(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t bytes = 0;
    int64_t key_rand = 0;
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    PinnableSlice pinnable_val;

    while (key_rand < FLAGS_num) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      // We use same key_rand as seed for key and column family so that we can
      // deterministically find the cfh corresponding to a particular key, as
      // it is done in DoWrite method.
      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
      key_rand++;
      read++;
      Status s;
      if (FLAGS_num_column_families > 1) {
        s = db_with_cfh->db->Get(read_options_, db_with_cfh->GetCfh(key_rand),
                                 key, &pinnable_val);
      } else {
        pinnable_val.Reset();
        s = db_with_cfh->db->Get(read_options_,
                                 db_with_cfh->db->DefaultColumnFamily(), key,
                                 &pinnable_val);
      }
      if (s.ok()) {
        found++;
        bytes += key.size() + pinnable_val.size();
      } else if (!s.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
        abort();
      }

      if (thread->shared->read_rate_limiter.get() != nullptr &&
          read % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
             read);

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
  void ReadReverse(ThreadState* thread) {
    if (db_.db != nullptr) {
      ReadReverse(thread, db_.db);
    } else {
      for (const auto& db_with_cfh : multi_dbs_) {
        ReadReverse(thread, db_with_cfh.db);
      }
    }
  }

  void ReadReverse(ThreadState* thread, DB* db) {
    Iterator* iter = db->NewIterator(read_options_);
    int64_t i = 0;
    int64_t bytes = 0;
    for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
      bytes += iter->key().size() + iter->value().size();
      thread->stats.FinishedOps(nullptr, db, 1, kRead);
      ++i;
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          i % 1024 == 1023) {
        thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
                                                   nullptr /* stats */,
                                                   RateLimiter::OpType::kRead);
      }
    }
    delete iter;
    thread->stats.AddBytes(bytes);
  }
  void ReadRandomFast(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t nonexist = 0;
    ReadOptions options = read_options_;
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::string value;
    Slice ts;
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    DB* db = SelectDBWithCfh(thread)->db;

    int64_t pot = 1;
    while (pot < FLAGS_num) {
      pot <<= 1;
    }

    Duration duration(FLAGS_duration, reads_);
    do {
      for (int i = 0; i < 100; ++i) {
        int64_t key_rand = thread->rand.Next() & (pot - 1);
        GenerateKeyFromInt(key_rand, FLAGS_num, &key);
        ++read;
        std::string ts_ret;
        std::string* ts_ptr = nullptr;
        if (user_timestamp_size_ > 0) {
          ts = mock_app_clock_->GetTimestampForRead(thread->rand,
                                                    ts_guard.get());
          options.timestamp = &ts;
          ts_ptr = &ts_ret;
        }
        auto status = db->Get(options, key, &value, ts_ptr);
        if (status.ok()) {
          ++found;
        } else if (!status.IsNotFound()) {
          fprintf(stderr, "Get returned an error: %s\n",
                  status.ToString().c_str());
          abort();
        }
        if (key_rand >= FLAGS_num) {
          ++nonexist;
        }
      }
      if (thread->shared->read_rate_limiter.get() != nullptr) {
        thread->shared->read_rate_limiter->Request(
            100, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(nullptr, db, 100, kRead);
    } while (!duration.Done(100));

    char msg[100];
    snprintf(msg, sizeof(msg),
             "(%" PRIu64 " of %" PRIu64
             " found, "
             "issued %" PRIu64 " non-exist keys)\n",
             found, read, nonexist);

    thread->stats.AddMessage(msg);
  }
  int64_t GetRandomKey(Random64* rand) {
    uint64_t rand_int = rand->Next();
    int64_t key_rand;
    if (read_random_exp_range_ == 0) {
      key_rand = rand_int % FLAGS_num;
    } else {
      const uint64_t kBigInt = static_cast<uint64_t>(1U) << 62;
      long double order = -static_cast<long double>(rand_int % kBigInt) /
                          static_cast<long double>(kBigInt) *
                          read_random_exp_range_;
      long double exp_ran = std::exp(order);
      uint64_t rand_num =
          static_cast<int64_t>(exp_ran * static_cast<long double>(FLAGS_num));
      // Map to a different number to avoid locality.
      const uint64_t kBigPrime = 0x5bd1e995;
      // Overflow is like %(2^64). Will have little impact of results.
      key_rand = static_cast<int64_t>((rand_num * kBigPrime) % FLAGS_num);
    }
    return key_rand;
  }
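  // For illustration: `order` is uniform in (-read_random_exp_range_, 0], so
  // exp(order) concentrates near 1.0 and decays toward exp(-range). With
  // read_random_exp_range_=2, half the draws satisfy exp(order) > 1/e, which
  // skews rand_num toward the top of [0, FLAGS_num) before the kBigPrime
  // multiplication scatters that locality across the whole key space.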
  void ReadRandom(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t bytes = 0;
    int num_keys = 0;
    int64_t key_rand = 0;
    ReadOptions options = read_options_;
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    PinnableSlice pinnable_val;
    std::vector<PinnableSlice> pinnable_vals;
    if (read_operands_) {
      // Start off with a small-ish value that'll be increased later if
      // `GetMergeOperands()` tells us it is not large enough.
      pinnable_vals.resize(8);
    }
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(1)) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      // We use same key_rand as seed for key and column family so that we can
      // deterministically find the cfh corresponding to a particular key, as
      // it is done in DoWrite method.
      if (entries_per_batch_ > 1 && FLAGS_multiread_stride) {
        if (++num_keys == entries_per_batch_) {
          num_keys = 0;
          key_rand = GetRandomKey(&thread->rand);
          if ((key_rand + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
              FLAGS_num) {
            key_rand = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
          }
        } else {
          key_rand += FLAGS_multiread_stride;
        }
      } else {
        key_rand = GetRandomKey(&thread->rand);
      }
      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
      read++;
      std::string ts_ret;
      std::string* ts_ptr = nullptr;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
        options.timestamp = &ts;
        ts_ptr = &ts_ret;
      }
      Status s;
      pinnable_val.Reset();
      for (size_t i = 0; i < pinnable_vals.size(); ++i) {
        pinnable_vals[i].Reset();
      }
      ColumnFamilyHandle* cfh;
      if (FLAGS_num_column_families > 1) {
        cfh = db_with_cfh->GetCfh(key_rand);
      } else {
        cfh = db_with_cfh->db->DefaultColumnFamily();
      }
      if (read_operands_) {
        GetMergeOperandsOptions get_merge_operands_options;
        get_merge_operands_options.expected_max_number_of_operands =
            static_cast<int>(pinnable_vals.size());
        int number_of_operands;
        s = db_with_cfh->db->GetMergeOperands(
            options, cfh, key, pinnable_vals.data(),
            &get_merge_operands_options, &number_of_operands);
        if (s.IsIncomplete()) {
          // Should only happen a few times when we encounter a key that had
          // more merge operands than any key seen so far. Production use case
          // would typically retry in such event to get all the operands so do
          // that here.
          pinnable_vals.resize(number_of_operands);
          get_merge_operands_options.expected_max_number_of_operands =
              static_cast<int>(pinnable_vals.size());
          s = db_with_cfh->db->GetMergeOperands(
              options, cfh, key, pinnable_vals.data(),
              &get_merge_operands_options, &number_of_operands);
        }
      } else {
        s = db_with_cfh->db->Get(options, cfh, key, &pinnable_val, ts_ptr);
      }

      if (s.ok()) {
        found++;
        bytes += key.size() + pinnable_val.size() + user_timestamp_size_;
        for (size_t i = 0; i < pinnable_vals.size(); ++i) {
          bytes += pinnable_vals[i].size();
          pinnable_vals[i].Reset();
        }
      } else if (!s.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
        abort();
      }

      if (thread->shared->read_rate_limiter.get() != nullptr &&
          read % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
             read);

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
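  // The resize-and-retry above is the intended GetMergeOperands() pattern:
  // when the operand buffer is too small the call returns
  // Status::Incomplete() and reports the actual operand count, so a single
  // resize followed by one retry is sufficient.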
  // Calls MultiGet over a list of keys from a random distribution.
  // Returns the total number of keys found.
  void MultiReadRandom(ThreadState* thread) {
    int64_t read = 0;
    int64_t bytes = 0;
    int64_t num_multireads = 0;
    int64_t found = 0;
    ReadOptions options = read_options_;
    std::vector<Slice> keys;
    std::vector<std::unique_ptr<const char[]>> key_guards;
    std::vector<std::string> values(entries_per_batch_);
    PinnableSlice* pin_values = new PinnableSlice[entries_per_batch_];
    std::unique_ptr<PinnableSlice[]> pin_values_guard(pin_values);
    std::vector<Status> stat_list(entries_per_batch_);
    while (static_cast<int64_t>(keys.size()) < entries_per_batch_) {
      key_guards.push_back(std::unique_ptr<const char[]>());
      keys.push_back(AllocateKey(&key_guards.back()));
    }

    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(entries_per_batch_)) {
      DB* db = SelectDB(thread);
      if (FLAGS_multiread_stride) {
        int64_t key = GetRandomKey(&thread->rand);
        if ((key + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
            static_cast<int64_t>(FLAGS_num)) {
          key = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
        }
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          GenerateKeyFromInt(key, FLAGS_num, &keys[i]);
          key += FLAGS_multiread_stride;
        }
      } else {
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          GenerateKeyFromInt(GetRandomKey(&thread->rand), FLAGS_num, &keys[i]);
        }
      }
      Slice ts;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
        options.timestamp = &ts;
      }
      if (!FLAGS_multiread_batched) {
        std::vector<Status> statuses = db->MultiGet(options, keys, &values);
        assert(static_cast<int64_t>(statuses.size()) == entries_per_batch_);

        read += entries_per_batch_;
        num_multireads++;
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          if (statuses[i].ok()) {
            bytes += keys[i].size() + values[i].size() + user_timestamp_size_;
            ++found;
          } else if (!statuses[i].IsNotFound()) {
            fprintf(stderr, "MultiGet returned an error: %s\n",
                    statuses[i].ToString().c_str());
            abort();
          }
        }
      } else {
        db->MultiGet(options, db->DefaultColumnFamily(), keys.size(),
                     keys.data(), pin_values, stat_list.data());

        read += entries_per_batch_;
        num_multireads++;
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          if (stat_list[i].ok()) {
            bytes +=
                keys[i].size() + pin_values[i].size() + user_timestamp_size_;
            ++found;
          } else if (!stat_list[i].IsNotFound()) {
            fprintf(stderr, "MultiGet returned an error: %s\n",
                    stat_list[i].ToString().c_str());
            abort();
          }
          stat_list[i] = Status::OK();
          pin_values[i].Reset();
        }
      }
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          num_multireads % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256 * entries_per_batch_, Env::IO_HIGH, nullptr /* stats */,
            RateLimiter::OpType::kRead);
      }
      thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kRead);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found,
             read);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
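  // Two MultiGet() flavors are exercised above: the vector-based overload
  // that returns one Status per key, and the batched overload that fills
  // caller-owned PinnableSlice/Status arrays, avoiding per-call vector
  // allocations on the hot path.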
  // Calls ApproximateSize over random key ranges.
  void ApproximateSizeRandom(ThreadState* thread) {
    int64_t size_sum = 0;
    int64_t num_sizes = 0;
    const size_t batch_size = entries_per_batch_;
    std::vector<Range> ranges;
    std::vector<Slice> lkeys;
    std::vector<std::unique_ptr<const char[]>> lkey_guards;
    std::vector<Slice> rkeys;
    std::vector<std::unique_ptr<const char[]>> rkey_guards;
    std::vector<uint64_t> sizes;
    while (ranges.size() < batch_size) {
      // Ugly without C++17 return from emplace_back
      lkey_guards.emplace_back();
      rkey_guards.emplace_back();
      lkeys.emplace_back(AllocateKey(&lkey_guards.back()));
      rkeys.emplace_back(AllocateKey(&rkey_guards.back()));
      ranges.emplace_back(lkeys.back(), rkeys.back());
      sizes.push_back(0);
    }

    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      for (size_t i = 0; i < batch_size; ++i) {
        int64_t lkey = GetRandomKey(&thread->rand);
        int64_t rkey = GetRandomKey(&thread->rand);
        if (lkey > rkey) {
          std::swap(lkey, rkey);
        }
        GenerateKeyFromInt(lkey, FLAGS_num, &lkeys[i]);
        GenerateKeyFromInt(rkey, FLAGS_num, &rkeys[i]);
      }
      db->GetApproximateSizes(&ranges[0], static_cast<int>(entries_per_batch_),
                              sizes.data());
      num_sizes += entries_per_batch_;
      for (int64_t size : sizes) {
        size_sum += size;
      }
      thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kOthers);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(Avg approx size=%g)",
             static_cast<double>(size_sum) / static_cast<double>(num_sizes));
    thread->stats.AddMessage(msg);
  }
  // The inverse function of Pareto distribution
  int64_t ParetoCdfInversion(double u, double theta, double k, double sigma) {
    double ret;
    if (k == 0.0) {
      ret = theta - sigma * std::log(u);
    } else {
      ret = theta + sigma * (std::pow(u, -1 * k) - 1) / k;
    }
    return static_cast<int64_t>(ceil(ret));
  }

  // The inverse function of power distribution (y=ax^b)
  int64_t PowerCdfInversion(double u, double a, double b) {
    double ret;
    ret = std::pow((u / a), (1 / b));
    return static_cast<int64_t>(ceil(ret));
  }

  // Add the noise to the QPS
  double AddNoise(double origin, double noise_ratio) {
    if (noise_ratio < 0.0 || noise_ratio > 1.0) {
      return origin;
    }
    int band_int = static_cast<int>(FLAGS_sine_a);
    double delta = (rand() % band_int - band_int / 2) * noise_ratio;
    if (origin + delta < 0) {
      return origin;
    } else {
      return (origin + delta);
    }
  }
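  // Worked example (values are illustrative): ParetoCdfInversion(u=0.5,
  // theta=0, k=0.5, sigma=100) = 0 + 100*(0.5^-0.5 - 1)/0.5 ~= 82.8, which
  // is rounded up to 83. With k=0 the same call degenerates to the
  // exponential inverse theta - sigma*ln(u) ~= 69.3 -> 70.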
  // Decide the ratio of different query types
  // 0 Get, 1 Put, 2 Seek, 3 SeekForPrev, 4 Delete, 5 SingleDelete, 6 merge
  class QueryDecider {
   public:
    std::vector<int> type_;
    std::vector<double> ratio_;
    int range_;

    QueryDecider() {}
    ~QueryDecider() {}

    Status Initiate(std::vector<double> ratio_input) {
      int range_max = 1000;
      double sum = 0.0;
      range_ = 0;
      for (auto& ratio : ratio_input) {
        sum += ratio;
      }
      for (auto& ratio : ratio_input) {
        range_ += static_cast<int>(ceil(range_max * (ratio / sum)));
        type_.push_back(range_);
        ratio_.push_back(ratio / sum);
      }
      return Status::OK();
    }

    int GetType(int64_t rand_num) {
      if (rand_num < 0) {
        rand_num = rand_num * (-1);
      }
      assert(range_ != 0);
      int pos = static_cast<int>(rand_num % range_);
      for (int i = 0; i < static_cast<int>(type_.size()); i++) {
        if (pos < type_[i]) {
          return i;
        }
      }
      return 0;
    }
  };
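  // For illustration: Initiate({0.8, 0.15, 0.05}) computes sum=1.0 and the
  // cumulative buckets type_={800, 950, 1000} out of range_max=1000;
  // GetType() then maps positions 0..799 to Get, 800..949 to Put and
  // 950..999 to Seek.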
  // KeyrangeUnit is the struct of a keyrange. It is used in a keyrange vector
  // to transfer a random value to one keyrange based on the hotness.
  struct KeyrangeUnit {
    int64_t keyrange_start;
    int64_t keyrange_access;
    int64_t keyrange_keys;
  };

  // From our observations, the prefix hotness (key-range hotness) follows
  // the two-term-exponential distribution: f(x) = a*exp(b*x) + c*exp(d*x).
  // However, we cannot directly use the inverse function to decide a
  // key-range from a random distribution. To achieve it, we create a list of
  // KeyrangeUnit, each KeyrangeUnit occupies a range of integers whose size is
  // decided based on the hotness of the key-range. When a random value is
  // generated based on uniform distribution, we map it to the KeyrangeUnit Vec
  // and one KeyrangeUnit is selected. The probability of a KeyrangeUnit being
  // selected is the same as the hotness of this KeyrangeUnit. After that, the
  // key can be randomly allocated to the key-range of this KeyrangeUnit, or we
  // can use the power distribution (y=ax^b) to generate the offset of
  // the key in the selected key-range. In this way, we generate the keyID
  // based on the hotness of the prefix and also the key hotness distribution.
  class GenerateTwoTermExpKeys {
   public:
    // Avoid uninitialized warning-as-error in some compilers
    int64_t keyrange_rand_max_ = 0;
    int64_t keyrange_size_ = 0;
    int64_t keyrange_num_ = 0;
    std::vector<KeyrangeUnit> keyrange_set_;

    // Initiate the KeyrangeUnit vector and calculate the size of each
    // KeyrangeUnit.
    Status InitiateExpDistribution(int64_t total_keys, double prefix_a,
                                   double prefix_b, double prefix_c,
                                   double prefix_d) {
      int64_t amplify = 0;
      int64_t keyrange_start = 0;
      if (FLAGS_keyrange_num <= 0) {
        keyrange_num_ = 1;
      } else {
        keyrange_num_ = FLAGS_keyrange_num;
      }
      keyrange_size_ = total_keys / keyrange_num_;

      // Calculate the key-range shares size based on the input parameters
      for (int64_t pfx = keyrange_num_; pfx >= 1; pfx--) {
        // Step 1. Calculate the probability that this key range will be
        // accessed in a query. It is based on the two-term exponential
        // distribution
        double keyrange_p = prefix_a * std::exp(prefix_b * pfx) +
                            prefix_c * std::exp(prefix_d * pfx);
        if (keyrange_p < std::pow(10.0, -16.0)) {
          keyrange_p = 0.0;
        }
        // Step 2. Calculate the amplify
        // In order to allocate a query to a key-range based on the random
        // number generated for this query, we need to extend the probability
        // of each key range from [0,1] to [0, amplify]. Amplify is calculated
        // by 1/(smallest key-range probability). In this way, we ensure that
        // all key-ranges are assigned with an Integer that >=0
        if (amplify == 0 && keyrange_p > 0) {
          amplify = static_cast<int64_t>(std::floor(1 / keyrange_p)) + 1;
        }

        // Step 3. For each key-range, we calculate its position in the
        // [0, amplify] range, including the start, the size (keyrange_access)
        KeyrangeUnit p_unit;
        p_unit.keyrange_start = keyrange_start;
        if (0.0 >= keyrange_p) {
          p_unit.keyrange_access = 0;
        } else {
          p_unit.keyrange_access =
              static_cast<int64_t>(std::floor(amplify * keyrange_p));
        }
        p_unit.keyrange_keys = keyrange_size_;
        keyrange_set_.push_back(p_unit);
        keyrange_start += p_unit.keyrange_access;
      }
      keyrange_rand_max_ = keyrange_start;

      // Step 4. Shuffle the key-ranges randomly
      // Since the access probability is calculated from small to large,
      // if we do not re-allocate them, hot key-ranges are always at the end
      // and cold key-ranges are at the beginning of the key space. Therefore,
      // the key-ranges are shuffled and the rand seed is only decided by the
      // key-range hotness distribution. With the same distribution parameters
      // the shuffle results are the same.
      Random64 rand_loca(keyrange_rand_max_);
      for (int64_t i = 0; i < FLAGS_keyrange_num; i++) {
        int64_t pos = rand_loca.Next() % FLAGS_keyrange_num;
        assert(i >= 0 && i < static_cast<int64_t>(keyrange_set_.size()) &&
               pos >= 0 && pos < static_cast<int64_t>(keyrange_set_.size()));
        std::swap(keyrange_set_[i], keyrange_set_[pos]);
      }

      // Step 5. Recalculate the prefix start position after shuffling
      int64_t offset = 0;
      for (auto& p_unit : keyrange_set_) {
        p_unit.keyrange_start = offset;
        offset += p_unit.keyrange_access;
      }

      return Status::OK();
    }

    // Generate the Key ID according to the input ini_rand and key distribution
    int64_t DistGetKeyID(int64_t ini_rand, double key_dist_a,
                         double key_dist_b) {
      int64_t keyrange_rand = ini_rand % keyrange_rand_max_;

      // Calculate and select one key-range that contains the new key
      int64_t start = 0, end = static_cast<int64_t>(keyrange_set_.size());
      while (start + 1 < end) {
        int64_t mid = start + (end - start) / 2;
        assert(mid >= 0 && mid < static_cast<int64_t>(keyrange_set_.size()));
        if (keyrange_rand < keyrange_set_[mid].keyrange_start) {
          end = mid;
        } else {
          start = mid;
        }
      }
      int64_t keyrange_id = start;

      // Select one key in the key-range and compose the keyID
      int64_t key_offset = 0, key_seed;
      if (key_dist_a == 0.0 || key_dist_b == 0.0) {
        key_offset = ini_rand % keyrange_size_;
      } else {
        double u =
            static_cast<double>(ini_rand % keyrange_size_) / keyrange_size_;
        key_seed = static_cast<int64_t>(
            ceil(std::pow((u / key_dist_a), (1 / key_dist_b))));
        Random64 rand_key(key_seed);
        key_offset = rand_key.Next() % keyrange_size_;
      }
      return keyrange_size_ * keyrange_id + key_offset;
    }
  };
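  // For illustration: with three key-ranges whose access shares come out as
  // 1, 3 and 6 (keyrange_rand_max_=10) and keyrange_size_=100, a uniform
  // draw over [0,10) selects the ranges with probability 0.1, 0.3 and 0.6
  // respectively, and DistGetKeyID() returns
  // keyrange_size_*keyrange_id + key_offset within the chosen range.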
  // The social graph workload mixed with Get, Put, Iterator queries.
  // The value size and iterator length follow Pareto distribution.
  // The overall key access follows a power distribution. If the user models
  // the workload based on different key-ranges (or different prefixes), they
  // can use the two-term-exponential distribution to fit the workload. The
  // user needs to decide the ratio between Get, Put, Iterator queries before
  // starting the benchmark.
  void MixGraph(ThreadState* thread) {
    int64_t gets = 0;
    int64_t puts = 0;
    int64_t get_found = 0;
    int64_t seek = 0;
    int64_t seek_found = 0;
    int64_t bytes = 0;
    double total_scan_length = 0;
    double total_val_size = 0;
    const int64_t default_value_max = 1 * 1024 * 1024;
    int64_t value_max = default_value_max;
    int64_t scan_len_max = FLAGS_mix_max_scan_len;
    double write_rate = 1000000.0;
    double read_rate = 1000000.0;
    bool use_prefix_modeling = false;
    bool use_random_modeling = false;
    GenerateTwoTermExpKeys gen_exp;
    std::vector<double> ratio{FLAGS_mix_get_ratio, FLAGS_mix_put_ratio,
                              FLAGS_mix_seek_ratio};
    char value_buffer[default_value_max];
    QueryDecider query;
    RandomGenerator gen;
    Status s;
    if (value_max > FLAGS_mix_max_value_size) {
      value_max = FLAGS_mix_max_value_size;
    }

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    PinnableSlice pinnable_val;
    query.Initiate(ratio);

    // the limit of qps initiation
    if (FLAGS_sine_mix_rate) {
      thread->shared->read_rate_limiter.reset(
          NewGenericRateLimiter(static_cast<int64_t>(read_rate)));
      thread->shared->write_rate_limiter.reset(
          NewGenericRateLimiter(static_cast<int64_t>(write_rate)));
    }

    // Decide if user wants to use prefix based key generation
    if (FLAGS_keyrange_dist_a != 0.0 || FLAGS_keyrange_dist_b != 0.0 ||
        FLAGS_keyrange_dist_c != 0.0 || FLAGS_keyrange_dist_d != 0.0) {
      use_prefix_modeling = true;
      gen_exp.InitiateExpDistribution(
          FLAGS_num, FLAGS_keyrange_dist_a, FLAGS_keyrange_dist_b,
          FLAGS_keyrange_dist_c, FLAGS_keyrange_dist_d);
    }
    if (FLAGS_key_dist_a == 0 || FLAGS_key_dist_b == 0) {
      use_random_modeling = true;
    }

    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(1)) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      int64_t ini_rand, rand_v, key_rand, key_seed;
      ini_rand = GetRandomKey(&thread->rand);
      rand_v = ini_rand % FLAGS_num;
      double u = static_cast<double>(rand_v) / FLAGS_num;

      // Generate the keyID based on the key hotness and prefix hotness
      if (use_random_modeling) {
        key_rand = ini_rand;
      } else if (use_prefix_modeling) {
        key_rand =
            gen_exp.DistGetKeyID(ini_rand, FLAGS_key_dist_a, FLAGS_key_dist_b);
      } else {
        key_seed = PowerCdfInversion(u, FLAGS_key_dist_a, FLAGS_key_dist_b);
        Random64 rand(key_seed);
        key_rand = static_cast<int64_t>(rand.Next()) % FLAGS_num;
      }
      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
      int query_type = query.GetType(rand_v);

      // change the qps
      uint64_t now = FLAGS_env->NowMicros();
      uint64_t usecs_since_last;
      if (now > thread->stats.GetSineInterval()) {
        usecs_since_last = now - thread->stats.GetSineInterval();
      } else {
        usecs_since_last = 0;
      }

      if (FLAGS_sine_mix_rate &&
          usecs_since_last >
              (FLAGS_sine_mix_rate_interval_milliseconds * uint64_t{1000})) {
        double usecs_since_start =
            static_cast<double>(now - thread->stats.GetStart());
        thread->stats.ResetSineInterval();
        double mix_rate_with_noise = AddNoise(
            SineRate(usecs_since_start / 1000000.0), FLAGS_sine_mix_rate_noise);
        read_rate = mix_rate_with_noise * (query.ratio_[0] + query.ratio_[2]);
        write_rate = mix_rate_with_noise * query.ratio_[1];

        if (read_rate > 0) {
          thread->shared->read_rate_limiter->SetBytesPerSecond(
              static_cast<int64_t>(read_rate));
        }
        if (write_rate > 0) {
          thread->shared->write_rate_limiter->SetBytesPerSecond(
              static_cast<int64_t>(write_rate));
        }
      }
      // Start the query
      if (query_type == 0) {
        // the Get query
        gets++;
        if (FLAGS_num_column_families > 1) {
          s = db_with_cfh->db->Get(read_options_, db_with_cfh->GetCfh(key_rand),
                                   key, &pinnable_val);
        } else {
          pinnable_val.Reset();
          s = db_with_cfh->db->Get(read_options_,
                                   db_with_cfh->db->DefaultColumnFamily(), key,
                                   &pinnable_val);
        }

        if (s.ok()) {
          get_found++;
          bytes += key.size() + pinnable_val.size();
        } else if (!s.IsNotFound()) {
          fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
          abort();
        }

        if (thread->shared->read_rate_limiter && (gets + seek) % 100 == 0) {
          thread->shared->read_rate_limiter->Request(100, Env::IO_HIGH,
                                                     nullptr /*stats*/);
        }
        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
      } else if (query_type == 1) {
        // the Put query
        puts++;
        int64_t val_size = ParetoCdfInversion(u, FLAGS_value_theta,
                                              FLAGS_value_k, FLAGS_value_sigma);
        if (val_size < 10) {
          val_size = 10;
        } else if (val_size > value_max) {
          val_size = val_size % value_max;
        }
        total_val_size += val_size;

        s = db_with_cfh->db->Put(
            write_options_, key,
            gen.Generate(static_cast<unsigned int>(val_size)));
        if (!s.ok()) {
          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
          ErrorExit();
        }

        if (thread->shared->write_rate_limiter && puts % 100 == 0) {
          thread->shared->write_rate_limiter->Request(100, Env::IO_HIGH,
                                                      nullptr /*stats*/);
        }
        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kWrite);
      } else if (query_type == 2) {
        // Seek query
        if (db_with_cfh->db != nullptr) {
          Iterator* single_iter = nullptr;
          single_iter = db_with_cfh->db->NewIterator(read_options_);
          if (single_iter != nullptr) {
            single_iter->Seek(key);
            seek++;
            if (single_iter->Valid() && single_iter->key().compare(key) == 0) {
              seek_found++;
            }
            int64_t scan_length =
                ParetoCdfInversion(u, FLAGS_iter_theta, FLAGS_iter_k,
                                   FLAGS_iter_sigma) %
                scan_len_max;
            for (int64_t j = 0; j < scan_length && single_iter->Valid(); j++) {
              Slice value = single_iter->value();
              memcpy(value_buffer, value.data(),
                     std::min(value.size(), sizeof(value_buffer)));
              bytes += single_iter->key().size() + single_iter->value().size();
              single_iter->Next();
              assert(single_iter->status().ok());
              total_scan_length++;
            }
          }
          delete single_iter;
        }
        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kSeek);
      }
    }
    char msg[256];
    snprintf(msg, sizeof(msg),
             "( Gets:%" PRIu64 " Puts:%" PRIu64 " Seek:%" PRIu64
             ", reads %" PRIu64 " in %" PRIu64
             " found, "
             "avg size: %.1f value, %.1f scan)\n",
             gets, puts, seek, get_found + seek_found, gets + seek,
             total_val_size / puts, total_scan_length / seek);

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
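  // Illustrative invocation (ratios and distribution parameters are only
  // examples, not recommendations):
  //   ./db_bench --benchmarks=mixgraph --use_existing_db=1 \
  //       --mix_get_ratio=0.83 --mix_put_ratio=0.14 --mix_seek_ratio=0.03 \
  //       --key_dist_a=0.002312 --key_dist_b=0.3467
  // models a Get-heavy, power-law-skewed social-graph-like workload.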
  void IteratorCreation(ThreadState* thread) {
    Duration duration(FLAGS_duration, reads_);
    ReadOptions options = read_options_;
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      Slice ts;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
        options.timestamp = &ts;
      }
      Iterator* iter = db->NewIterator(options);
      delete iter;
      thread->stats.FinishedOps(nullptr, db, 1, kOthers);
    }
  }

  void IteratorCreationWhileWriting(ThreadState* thread) {
    if (thread->tid > 0) {
      IteratorCreation(thread);
    } else {
      BGWriter(thread, kWrite);
    }
  }

  void SeekRandom(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t bytes = 0;
    ReadOptions options = read_options_;
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
      options.timestamp = &ts;
    }

    std::vector<Iterator*> tailing_iters;
    if (FLAGS_use_tailing_iterator) {
      if (db_.db != nullptr) {
        tailing_iters.push_back(db_.db->NewIterator(options));
      } else {
        for (const auto& db_with_cfh : multi_dbs_) {
          tailing_iters.push_back(db_with_cfh.db->NewIterator(options));
        }
      }
    }
    options.auto_prefix_mode = FLAGS_auto_prefix_mode;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    std::unique_ptr<const char[]> upper_bound_key_guard;
    Slice upper_bound = AllocateKey(&upper_bound_key_guard);
    std::unique_ptr<const char[]> lower_bound_key_guard;
    Slice lower_bound = AllocateKey(&lower_bound_key_guard);

    Duration duration(FLAGS_duration, reads_);
    char value_buffer[256];
    while (!duration.Done(1)) {
      int64_t seek_pos = thread->rand.Next() % FLAGS_num;
      GenerateKeyFromIntForSeek(static_cast<uint64_t>(seek_pos), FLAGS_num,
                                &key);
      if (FLAGS_max_scan_distance != 0) {
        if (FLAGS_reverse_iterator) {
          GenerateKeyFromInt(
              static_cast<uint64_t>(std::max(
                  static_cast<int64_t>(0), seek_pos - FLAGS_max_scan_distance)),
              FLAGS_num, &lower_bound);
          options.iterate_lower_bound = &lower_bound;
        } else {
          auto min_num =
              std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance);
          GenerateKeyFromInt(static_cast<uint64_t>(min_num), FLAGS_num,
                             &upper_bound);
          options.iterate_upper_bound = &upper_bound;
        }
      } else if (FLAGS_auto_prefix_mode && prefix_extractor_ &&
                 !FLAGS_reverse_iterator) {
        // Set upper bound to next prefix
        auto mutable_upper_bound = const_cast<char*>(upper_bound.data());
        std::memcpy(mutable_upper_bound, key.data(), prefix_size_);
        mutable_upper_bound[prefix_size_ - 1]++;
        upper_bound = Slice(upper_bound.data(), prefix_size_);
        options.iterate_upper_bound = &upper_bound;
      }

      // Pick an Iterator to use
      uint64_t db_idx_to_use =
          (db_.db == nullptr)
              ? (uint64_t{thread->rand.Next()} % multi_dbs_.size())
              : 0;
      std::unique_ptr<Iterator> single_iter;
      Iterator* iter_to_use;
      if (FLAGS_use_tailing_iterator) {
        iter_to_use = tailing_iters[db_idx_to_use];
      } else {
        if (db_.db != nullptr) {
          single_iter.reset(db_.db->NewIterator(options));
        } else {
          single_iter.reset(multi_dbs_[db_idx_to_use].db->NewIterator(options));
        }
        iter_to_use = single_iter.get();
      }

      iter_to_use->Seek(key);
      read++;
      if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) {
        found++;
      }

      for (int j = 0; j < FLAGS_seek_nexts && iter_to_use->Valid(); ++j) {
        // Copy out iterator's value to make sure we read it.
        Slice value = iter_to_use->value();
        memcpy(value_buffer, value.data(),
               std::min(value.size(), sizeof(value_buffer)));
        bytes += iter_to_use->key().size() + iter_to_use->value().size();

        if (!FLAGS_reverse_iterator) {
          iter_to_use->Next();
        } else {
          iter_to_use->Prev();
        }
        assert(iter_to_use->status().ok());
      }

      if (thread->shared->read_rate_limiter.get() != nullptr &&
          read % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
    }
    for (auto iter : tailing_iters) {
      delete iter;
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
             read);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
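
  // Illustrative invocation (flag values are only examples, not defaults):
  //   ./db_bench --benchmarks=seekrandom --use_existing_db=1 \
  //     --seek_nexts=10 --max_scan_distance=100 --reverse_iterator=0
  // Each iteration seeks to a random key and then advances up to
  // --seek_nexts entries, constrained by the iterate_lower_bound /
  // iterate_upper_bound configured above when --max_scan_distance is set.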

  void SeekRandomWhileWriting(ThreadState* thread) {
    if (thread->tid > 0) {
      SeekRandom(thread);
    } else {
      BGWriter(thread, kWrite);
    }
  }

  void SeekRandomWhileMerging(ThreadState* thread) {
    if (thread->tid > 0) {
      SeekRandom(thread);
    } else {
      BGWriter(thread, kMerge);
    }
  }

  void DoDelete(ThreadState* thread, bool seq) {
    WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
                     FLAGS_write_batch_protection_bytes_per_key,
                     user_timestamp_size_);
    Duration duration(seq ? 0 : FLAGS_duration, deletes_);
    int64_t i = 0;
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

    while (!duration.Done(entries_per_batch_)) {
      DB* db = SelectDB(thread);
      batch.Clear();
      for (int64_t j = 0; j < entries_per_batch_; ++j) {
        const int64_t k = seq ? i + j : (thread->rand.Next() % FLAGS_num);
        GenerateKeyFromInt(k, FLAGS_num, &key);
        batch.Delete(key);
      }
      Status s;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->Allocate(ts_guard.get());
        s = batch.UpdateTimestamps(
            ts, [this](uint32_t) { return user_timestamp_size_; });
        if (!s.ok()) {
          fprintf(stderr, "assign timestamp: %s\n", s.ToString().c_str());
          exit(1);
        }
      }
      s = db->Write(write_options_, &batch);
      thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kDelete);
      if (!s.ok()) {
        fprintf(stderr, "del error: %s\n", s.ToString().c_str());
        exit(1);
      }
      i += entries_per_batch_;
    }
  }

  void DeleteSeq(ThreadState* thread) { DoDelete(thread, true); }

  void DeleteRandom(ThreadState* thread) { DoDelete(thread, false); }
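
  // Illustrative invocation (values are only examples): fill sequentially,
  // then delete the same key range in batches of --batch_size keys:
  //   ./db_bench --benchmarks=fillseq,deleteseq --num=1000000 --batch_size=100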

  void ReadWhileWriting(ThreadState* thread) {
    if (thread->tid > 0) {
      ReadRandom(thread);
    } else {
      BGWriter(thread, kWrite);
    }
  }

  void MultiReadWhileWriting(ThreadState* thread) {
    if (thread->tid > 0) {
      MultiReadRandom(thread);
    } else {
      BGWriter(thread, kWrite);
    }
  }

  void ReadWhileMerging(ThreadState* thread) {
    if (thread->tid > 0) {
      ReadRandom(thread);
    } else {
      BGWriter(thread, kMerge);
    }
  }

  void BGWriter(ThreadState* thread, enum OperationType write_merge) {
    // Special thread that keeps writing until other threads are done.
    RandomGenerator gen;
    int64_t bytes = 0;

    std::unique_ptr<RateLimiter> write_rate_limiter;
    if (FLAGS_benchmark_write_rate_limit > 0) {
      write_rate_limiter.reset(
          NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
    }

    // Don't merge stats from this thread with the readers.
    thread->stats.SetExcludeFromMerge();

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    std::unique_ptr<const char[]> begin_key_guard;
    Slice begin_key = AllocateKey(&begin_key_guard);
    std::unique_ptr<const char[]> end_key_guard;
    Slice end_key = AllocateKey(&end_key_guard);
    uint64_t num_range_deletions = 0;
    std::vector<std::unique_ptr<const char[]>> expanded_key_guards;
    std::vector<Slice> expanded_keys;
    if (FLAGS_expand_range_tombstones) {
      expanded_key_guards.resize(range_tombstone_width_);
      for (auto& expanded_key_guard : expanded_key_guards) {
        expanded_keys.emplace_back(AllocateKey(&expanded_key_guard));
      }
    }
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    uint32_t written = 0;
    bool hint_printed = false;

    while (true) {
      DB* db = SelectDB(thread);
      {
        MutexLock l(&thread->shared->mu);
        if (FLAGS_finish_after_writes && written == writes_) {
          fprintf(stderr, "Exiting the writer after %u writes...\n", written);
          break;
        }
        if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
          // Other threads have finished
          if (FLAGS_finish_after_writes) {
            // Wait for the writes to be finished
            if (!hint_printed) {
              fprintf(stderr, "Reads are finished. Have %d more writes to do\n",
                      static_cast<int>(writes_) - written);
              hint_printed = true;
            }
          } else {
            // Finish the write immediately
            break;
          }
        }
      }

      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      Status s;

      Slice val = gen.Generate();
      Slice ts;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->Allocate(ts_guard.get());
      }
      if (write_merge == kWrite) {
        if (user_timestamp_size_ == 0) {
          s = db->Put(write_options_, key, val);
        } else {
          s = db->Put(write_options_, key, ts, val);
        }
      } else {
        s = db->Merge(write_options_, key, val);
      }
      // Restore write_options_
      written++;

      if (!s.ok()) {
        fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes += key.size() + val.size() + user_timestamp_size_;
      thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);

      if (FLAGS_benchmark_write_rate_limit > 0) {
        write_rate_limiter->Request(key.size() + val.size(), Env::IO_HIGH,
                                    nullptr /* stats */,
                                    RateLimiter::OpType::kWrite);
      }

      if (writes_per_range_tombstone_ > 0 &&
          written > writes_before_delete_range_ &&
          (written - writes_before_delete_range_) /
                  writes_per_range_tombstone_ <=
              max_num_range_tombstones_ &&
          (written - writes_before_delete_range_) %
                  writes_per_range_tombstone_ ==
              0) {
        num_range_deletions++;
        int64_t begin_num = thread->rand.Next() % FLAGS_num;
        if (FLAGS_expand_range_tombstones) {
          for (int64_t offset = 0; offset < range_tombstone_width_; ++offset) {
            GenerateKeyFromInt(begin_num + offset, FLAGS_num,
                               &expanded_keys[offset]);
            if (!db->Delete(write_options_, expanded_keys[offset]).ok()) {
              fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
              exit(1);
            }
          }
        } else {
          GenerateKeyFromInt(begin_num, FLAGS_num, &begin_key);
          GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num,
                             &end_key);
          if (!db->DeleteRange(write_options_, db->DefaultColumnFamily(),
                               begin_key, end_key)
                   .ok()) {
            fprintf(stderr, "deleterange error: %s\n", s.ToString().c_str());
            exit(1);
          }
        }
        thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
        // TODO: DeleteRange is not included in calculation of bytes/rate
      }
    }
    if (num_range_deletions > 0) {
      std::cout << "Number of range deletions: " << num_range_deletions
                << std::endl;
    }
    thread->stats.AddBytes(bytes);
  }
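
  // Worked example of the range-tombstone cadence above (numbers are
  // illustrative): with writes_before_delete_range_ = 100,
  // writes_per_range_tombstone_ = 50 and max_num_range_tombstones_ = 3, a
  // DeleteRange (or its expanded point deletes) is issued at written = 150,
  // 200 and 250: there (written - 100) % 50 == 0 while (written - 100) / 50
  // is still <= 3.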

  void ReadWhileScanning(ThreadState* thread) {
    if (thread->tid > 0) {
      ReadRandom(thread);
    } else {
      BGScan(thread);
    }
  }

  void BGScan(ThreadState* thread) {
    if (FLAGS_num_multi_db > 0) {
      fprintf(stderr, "Not supporting multiple DBs.\n");
      abort();
    }
    assert(db_.db != nullptr);
    ReadOptions read_options = read_options_;
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
      read_options.timestamp = &ts;
    }
    Iterator* iter = db_.db->NewIterator(read_options);

    fprintf(stderr, "num reads to do %" PRIu64 "\n", reads_);
    Duration duration(FLAGS_duration, reads_);
    uint64_t num_seek_to_first = 0;
    uint64_t num_next = 0;
    while (!duration.Done(1)) {
      if (!iter->Valid()) {
        iter->SeekToFirst();
        num_seek_to_first++;
      } else if (!iter->status().ok()) {
        fprintf(stderr, "Iterator error: %s\n",
                iter->status().ToString().c_str());
        abort();
      } else {
        iter->Next();
        num_next++;
      }
      thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
    }
    delete iter;
  }
7094 // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
7095 // in DB atomically i.e in a single batch. Also refer GetMany.
7096 Status
PutMany(DB
* db
, const WriteOptions
& writeoptions
, const Slice
& key
,
7097 const Slice
& value
) {
7098 std::string suffixes
[3] = {"2", "1", "0"};
7099 std::string keys
[3];
7101 WriteBatch
batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
7102 FLAGS_write_batch_protection_bytes_per_key
,
7103 user_timestamp_size_
);
7105 for (int i
= 0; i
< 3; i
++) {
7106 keys
[i
] = key
.ToString() + suffixes
[i
];
7107 batch
.Put(keys
[i
], value
);
7110 std::unique_ptr
<char[]> ts_guard
;
7111 if (user_timestamp_size_
> 0) {
7112 ts_guard
.reset(new char[user_timestamp_size_
]);
7113 Slice ts
= mock_app_clock_
->Allocate(ts_guard
.get());
7114 s
= batch
.UpdateTimestamps(
7115 ts
, [this](uint32_t) { return user_timestamp_size_
; });
7117 fprintf(stderr
, "assign timestamp to batch: %s\n",
7118 s
.ToString().c_str());
7123 s
= db
->Write(writeoptions
, &batch
);

  // Given a key K, this deletes the keys K+"0", K+"1" and K+"2"
  // in DB atomically, i.e. in a single batch. Also refer to GetMany.
  Status DeleteMany(DB* db, const WriteOptions& writeoptions,
                    const Slice& key) {
    std::string suffixes[3] = {"1", "2", "0"};
    std::string keys[3];

    WriteBatch batch(0, 0, FLAGS_write_batch_protection_bytes_per_key,
                     user_timestamp_size_);
    Status s;
    for (int i = 0; i < 3; i++) {
      keys[i] = key.ToString() + suffixes[i];
      batch.Delete(keys[i]);
    }

    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      Slice ts = mock_app_clock_->Allocate(ts_guard.get());
      s = batch.UpdateTimestamps(
          ts, [this](uint32_t) { return user_timestamp_size_; });
      if (!s.ok()) {
        fprintf(stderr, "assign timestamp to batch: %s\n",
                s.ToString().c_str());
        exit(1);
      }
    }

    s = db->Write(writeoptions, &batch);
    return s;
  }

  // Given a key K and value V, this gets values for K+"0", K+"1" and K+"2"
  // in the same snapshot, and verifies that all the values are identical.
  // ASSUMES that PutMany was used to put (K, V) into the DB.
  Status GetMany(DB* db, const Slice& key, std::string* value) {
    std::string suffixes[3] = {"0", "1", "2"};
    std::string keys[3];
    Slice key_slices[3];
    std::string values[3];
    ReadOptions readoptionscopy = read_options_;

    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->Allocate(ts_guard.get());
      readoptionscopy.timestamp = &ts;
    }

    readoptionscopy.snapshot = db->GetSnapshot();
    Status s;
    for (int i = 0; i < 3; i++) {
      keys[i] = key.ToString() + suffixes[i];
      key_slices[i] = keys[i];
      s = db->Get(readoptionscopy, key_slices[i], value);
      if (!s.ok() && !s.IsNotFound()) {
        fprintf(stderr, "get error: %s\n", s.ToString().c_str());
        values[i] = "";
        // we continue after error rather than exiting so that we can
        // find more errors if any
      } else if (s.IsNotFound()) {
        values[i] = "";
      } else {
        values[i] = *value;
      }
    }
    db->ReleaseSnapshot(readoptionscopy.snapshot);

    if ((values[0] != values[1]) || (values[1] != values[2])) {
      fprintf(stderr, "inconsistent values for key %s: %s, %s, %s\n",
              key.ToString().c_str(), values[0].c_str(), values[1].c_str(),
              values[2].c_str());
      // we continue after error rather than exiting so that we can
      // find more errors if any
    }

    return s;
  }
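
  // Key layout used by PutMany/DeleteMany/GetMany, for reference: a logical
  // key "k" expands to the three physical keys "k0", "k1" and "k2", always
  // written or deleted in a single WriteBatch. A snapshot read through
  // GetMany must therefore observe either all three values (with identical
  // contents) or none of them.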

  // Differs from readrandomwriterandom in the following ways:
  // (a) Uses GetMany/PutMany to read/write key values. Refer to those funcs.
  // (b) Does deletes as well (per FLAGS_deletepercent)
  // (c) In order to achieve high % of 'found' during lookups, and to do
  //     multiple writes (including puts and deletes) it uses up to
  //     FLAGS_numdistinct distinct keys instead of FLAGS_num distinct keys.
  // (d) Does not have a MultiGet option.
  void RandomWithVerify(ThreadState* thread) {
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int get_weight = 0;
    int put_weight = 0;
    int delete_weight = 0;
    int64_t gets_done = 0;
    int64_t puts_done = 0;
    int64_t deletes_done = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    // the number of iterations is the larger of read_ or write_
    for (int64_t i = 0; i < readwrites_; i++) {
      DB* db = SelectDB(thread);
      if (get_weight == 0 && put_weight == 0 && delete_weight == 0) {
        // one batch completed, reinitialize for next batch
        get_weight = FLAGS_readwritepercent;
        delete_weight = FLAGS_deletepercent;
        put_weight = 100 - get_weight - delete_weight;
      }
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_numdistinct,
                         FLAGS_numdistinct, &key);
      if (get_weight > 0) {
        // do all the gets first
        Status s = GetMany(db, key, &value);
        if (!s.ok() && !s.IsNotFound()) {
          fprintf(stderr, "getmany error: %s\n", s.ToString().c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          found++;
        }
        get_weight--;
        gets_done++;
        thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
      } else if (put_weight > 0) {
        // then do all the corresponding number of puts
        // for all the gets we have done earlier
        Status s = PutMany(db, write_options_, key, gen.Generate());
        if (!s.ok()) {
          fprintf(stderr, "putmany error: %s\n", s.ToString().c_str());
          exit(1);
        }
        put_weight--;
        puts_done++;
        thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
      } else if (delete_weight > 0) {
        Status s = DeleteMany(db, write_options_, key);
        if (!s.ok()) {
          fprintf(stderr, "deletemany error: %s\n", s.ToString().c_str());
          exit(1);
        }
        delete_weight--;
        deletes_done++;
        thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
      }
    }
    char msg[128];
    snprintf(msg, sizeof(msg),
             "( get:%" PRIu64 " put:%" PRIu64 " del:%" PRIu64 " total:%" PRIu64
             " found:%" PRIu64 ")",
             gets_done, puts_done, deletes_done, readwrites_, found);
    thread->stats.AddMessage(msg);
  }
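
  // Illustrative invocation (percentages are only examples):
  //   ./db_bench --benchmarks=randomwithverify --numdistinct=1000 \
  //     --readwritepercent=60 --deletepercent=10
  // Per batch of 100 operations this performs roughly 60 GetMany, 30 PutMany
  // and 10 DeleteMany calls, matching the weights computed above.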

  // This is different from ReadWhileWriting because it does not use
  // an extra thread.
  void ReadRandomWriteRandom(ThreadState* thread) {
    ReadOptions options = read_options_;
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int get_weight = 0;
    int put_weight = 0;
    int64_t reads_done = 0;
    int64_t writes_done = 0;
    Duration duration(FLAGS_duration, readwrites_);

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

    // the number of iterations is the larger of read_ or write_
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      if (get_weight == 0 && put_weight == 0) {
        // one batch completed, reinitialize for next batch
        get_weight = FLAGS_readwritepercent;
        put_weight = 100 - get_weight;
      }
      if (get_weight > 0) {
        // do all the gets first
        Slice ts;
        if (user_timestamp_size_ > 0) {
          ts = mock_app_clock_->GetTimestampForRead(thread->rand,
                                                    ts_guard.get());
          options.timestamp = &ts;
        }
        Status s = db->Get(options, key, &value);
        if (!s.ok() && !s.IsNotFound()) {
          fprintf(stderr, "get error: %s\n", s.ToString().c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          found++;
        }
        get_weight--;
        reads_done++;
        thread->stats.FinishedOps(nullptr, db, 1, kRead);
      } else if (put_weight > 0) {
        // then do all the corresponding number of puts
        // for all the gets we have done earlier
        Status s;
        if (user_timestamp_size_ > 0) {
          Slice ts = mock_app_clock_->Allocate(ts_guard.get());
          s = db->Put(write_options_, key, ts, gen.Generate());
        } else {
          s = db->Put(write_options_, key, gen.Generate());
        }
        if (!s.ok()) {
          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
          exit(1);
        }
        put_weight--;
        writes_done++;
        thread->stats.FinishedOps(nullptr, db, 1, kWrite);
      }
    }
    char msg[100];
    snprintf(msg, sizeof(msg),
             "( reads:%" PRIu64 " writes:%" PRIu64 " total:%" PRIu64
             " found:%" PRIu64 ")",
             reads_done, writes_done, readwrites_, found);
    thread->stats.AddMessage(msg);
  }

  // Read-modify-write for random keys
  void UpdateRandom(ThreadState* thread) {
    ReadOptions options = read_options_;
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int64_t bytes = 0;
    Duration duration(FLAGS_duration, readwrites_);

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    // the number of iterations is the larger of read_ or write_
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      Slice ts;
      if (user_timestamp_size_ > 0) {
        // Read with newest timestamp because we are doing rmw.
        ts = mock_app_clock_->Allocate(ts_guard.get());
        options.timestamp = &ts;
      }

      auto status = db->Get(options, key, &value);
      if (status.ok()) {
        ++found;
        bytes += key.size() + value.size() + user_timestamp_size_;
      } else if (!status.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n",
                status.ToString().c_str());
        abort();
      }

      if (thread->shared->write_rate_limiter) {
        thread->shared->write_rate_limiter->Request(
            key.size() + value.size(), Env::IO_HIGH, nullptr /*stats*/,
            RateLimiter::OpType::kWrite);
      }

      Slice val = gen.Generate();
      Status s;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->Allocate(ts_guard.get());
        s = db->Put(write_options_, key, ts, val);
      } else {
        s = db->Put(write_options_, key, val);
      }
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes += key.size() + val.size() + user_timestamp_size_;
      thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
    }
    char msg[100];
    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
             readwrites_, found);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }

  // Read-XOR-write for random keys. Xors the existing value with a randomly
  // generated value, and stores the result. Assuming A is the array of bytes
  // representing the existing value, we generate an array B of the same size,
  // then compute C = A^B as C[i]=A[i]^B[i], and store C
  void XORUpdateRandom(ThreadState* thread) {
    ReadOptions options = read_options_;
    RandomGenerator gen;
    std::string existing_value;
    int64_t found = 0;
    Duration duration(FLAGS_duration, readwrites_);

    BytesXOROperator xor_operator;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    // the number of iterations is the larger of read_ or write_
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      Slice ts;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->Allocate(ts_guard.get());
        options.timestamp = &ts;
      }

      auto status = db->Get(options, key, &existing_value);
      if (status.ok()) {
        ++found;
      } else if (!status.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n",
                status.ToString().c_str());
        exit(1);
      }

      Slice value =
          gen.Generate(static_cast<unsigned int>(existing_value.size()));
      std::string new_value;

      if (status.ok()) {
        Slice existing_value_slice = Slice(existing_value);
        xor_operator.XOR(&existing_value_slice, value, &new_value);
      } else {
        xor_operator.XOR(nullptr, value, &new_value);
      }

      Status s;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->Allocate(ts_guard.get());
        s = db->Put(write_options_, key, ts, Slice(new_value));
      } else {
        s = db->Put(write_options_, key, Slice(new_value));
      }
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
      thread->stats.FinishedOps(nullptr, db, 1);
    }
    char msg[100];
    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
             readwrites_, found);
    thread->stats.AddMessage(msg);
  }
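
  // Worked example of the XOR update above: existing value A = {0x0F, 0xF0}
  // and generated B = {0x35, 0x5A} store C = {0x3A, 0xAA}, since
  // 0x0F ^ 0x35 == 0x3A and 0xF0 ^ 0x5A == 0xAA. A missing key behaves as if
  // A were all zeros, so C == B.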

  // Read-modify-write for random keys.
  // Each operation causes the key to grow by value_size (simulating an
  // append). Generally used for benchmarking against merges of similar type
  void AppendRandom(ThreadState* thread) {
    ReadOptions options = read_options_;
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int64_t bytes = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    // The number of iterations is the larger of read_ or write_
    Duration duration(FLAGS_duration, readwrites_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      Slice ts;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->Allocate(ts_guard.get());
        options.timestamp = &ts;
      }

      auto status = db->Get(options, key, &value);
      if (status.ok()) {
        ++found;
        bytes += key.size() + value.size() + user_timestamp_size_;
      } else if (!status.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n",
                status.ToString().c_str());
        abort();
      } else {
        // If not existing, then just assume an empty string of data
        value.clear();
      }

      // Update the value (by appending data)
      Slice operand = gen.Generate();
      if (value.size() > 0) {
        // Use a delimiter to match the semantics for StringAppendOperator
        value.append(1, ',');
      }
      value.append(operand.data(), operand.size());

      Status s;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->Allocate(ts_guard.get());
        s = db->Put(write_options_, key, ts, value);
      } else {
        // Write back to the database
        s = db->Put(write_options_, key, value);
      }
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes += key.size() + value.size() + user_timestamp_size_;
      thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
             readwrites_, found);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }

  // Read-modify-write for random keys (using MergeOperator)
  // The merge operator to use should be defined by FLAGS_merge_operator
  // Adjust FLAGS_value_size so that the keys are reasonable for this operator
  // Assumes that the merge operator is non-null (i.e.: is well-defined)
  //
  // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8
  // to simulate random additions over 64-bit integers using merge.
  //
  // The number of merges on the same key can be controlled by adjusting
  // FLAGS_merge_keys.
  void MergeRandom(ThreadState* thread) {
    RandomGenerator gen;
    int64_t bytes = 0;
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    // The number of iterations is the larger of read_ or write_
    Duration duration(FLAGS_duration, readwrites_);
    while (!duration.Done(1)) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      int64_t key_rand = thread->rand.Next() % merge_keys_;
      GenerateKeyFromInt(key_rand, merge_keys_, &key);

      Status s;
      Slice val = gen.Generate();
      if (FLAGS_num_column_families > 1) {
        s = db_with_cfh->db->Merge(write_options_,
                                   db_with_cfh->GetCfh(key_rand), key, val);
      } else {
        s = db_with_cfh->db->Merge(
            write_options_, db_with_cfh->db->DefaultColumnFamily(), key, val);
      }

      if (!s.ok()) {
        fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes += key.size() + val.size();
      thread->stats.FinishedOps(nullptr, db_with_cfh->db, 1, kMerge);
    }

    // Print some statistics
    char msg[100];
    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 ")", readwrites_);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
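
  // Illustrative invocation (values are only examples), simulating random
  // 64-bit additions via merge as the comment above describes:
  //   ./db_bench --benchmarks=mergerandom --merge_operator=uint64add \
  //     --value_size=8 --merge_keys=10000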

  // Read and merge random keys. The number of reads and merges is controlled
  // by adjusting FLAGS_num and FLAGS_mergereadpercent. The number of distinct
  // keys (and thus also the number of reads and merges on the same key) can be
  // adjusted with FLAGS_merge_keys.
  //
  // As with MergeRandom, the merge operator to use should be defined by
  // FLAGS_merge_operator.
  void ReadRandomMergeRandom(ThreadState* thread) {
    RandomGenerator gen;
    std::string value;
    int64_t num_hits = 0;
    int64_t num_gets = 0;
    int64_t num_merges = 0;
    size_t max_length = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    // the number of iterations is the larger of read_ or write_
    Duration duration(FLAGS_duration, readwrites_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);

      bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent;

      if (do_merge) {
        Status s = db->Merge(write_options_, key, gen.Generate());
        if (!s.ok()) {
          fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
          exit(1);
        }
        num_merges++;
        thread->stats.FinishedOps(nullptr, db, 1, kMerge);
      } else {
        Status s = db->Get(read_options_, key, &value);
        if (value.length() > max_length) max_length = value.length();

        if (!s.ok() && !s.IsNotFound()) {
          fprintf(stderr, "get error: %s\n", s.ToString().c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          num_hits++;
        }
        num_gets++;
        thread->stats.FinishedOps(nullptr, db, 1, kRead);
      }
    }

    char msg[100];
    snprintf(msg, sizeof(msg),
             "(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64
             " hits:%" PRIu64 " maxlength:%" ROCKSDB_PRIszt ")",
             num_gets, num_merges, readwrites_, num_hits, max_length);
    thread->stats.AddMessage(msg);
  }

  void WriteSeqSeekSeq(ThreadState* thread) {
    writes_ = FLAGS_num;
    DoWrite(thread, SEQUENTIAL);
    // exclude writes from the ops/sec calculation
    thread->stats.Start(thread->tid);

    DB* db = SelectDB(thread);
    ReadOptions read_opts = read_options_;
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
      read_opts.timestamp = &ts;
    }
    std::unique_ptr<Iterator> iter(db->NewIterator(read_opts));

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    for (int64_t i = 0; i < FLAGS_num; ++i) {
      GenerateKeyFromInt(i, FLAGS_num, &key);
      iter->Seek(key);
      assert(iter->Valid() && iter->key() == key);
      thread->stats.FinishedOps(nullptr, db, 1, kSeek);

      for (int j = 0; j < FLAGS_seek_nexts && i + 1 < FLAGS_num; ++j) {
        if (!FLAGS_reverse_iterator) {
          iter->Next();
        } else {
          iter->Prev();
        }
        GenerateKeyFromInt(++i, FLAGS_num, &key);
        assert(iter->Valid() && iter->key() == key);
        thread->stats.FinishedOps(nullptr, db, 1, kSeek);
      }

      iter->Seek(key);
      assert(iter->Valid() && iter->key() == key);
      thread->stats.FinishedOps(nullptr, db, 1, kSeek);
    }
  }

  bool binary_search(std::vector<int>& data, int start, int end, int key) {
    if (data.empty()) return false;
    if (start > end) return false;
    int mid = start + (end - start) / 2;
    if (mid > static_cast<int>(data.size()) - 1) return false;
    if (data[mid] == key) {
      return true;
    } else if (data[mid] > key) {
      return binary_search(data, start, mid - 1, key);
    } else {
      return binary_search(data, mid + 1, end, key);
    }
  }
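
  // For reference, binary_search assumes data is sorted ascending and is
  // called with end == data.size() - 1. For data = {1, 3, 5}:
  //   binary_search(data, 0, 2, 3) -> true  (mid = 1 hits the key)
  //   binary_search(data, 0, 2, 4) -> false (recursion bottoms out)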

  // Does a bunch of merge operations for a key (key1) where the merge operand
  // is a sorted list. Next, a performance comparison is done between doing a
  // Get for key1 followed by searching for another key (key2) in the large
  // sorted list vs calling GetMergeOperands for key1 and then searching for
  // key2 in all the sorted sub-lists. The latter case is expected to be a lot
  // faster.
  void GetMergeOperands(ThreadState* thread) {
    DB* db = SelectDB(thread);
    const int kTotalValues = 100000;
    const int kListSize = 100;
    std::string key = "my_key";
    std::string value;

    for (int i = 1; i < kTotalValues; i++) {
      if (i % kListSize == 0) {
        // Remove trailing ','
        value.pop_back();
        db->Merge(WriteOptions(), key, value);
        value.clear();
      } else {
        value.append(std::to_string(i)).append(",");
      }
    }

    SortList s;
    std::vector<int> data;
    // This value can be experimented with and it will demonstrate the
    // perf difference between doing a Get and searching for lookup_key in the
    // resultant large sorted list vs doing GetMergeOperands and searching
    // for lookup_key within this resultant sorted sub-lists.
    int lookup_key = 1;

    // Get API call
    std::cout << "--- Get API call --- \n";
    PinnableSlice p_slice;
    uint64_t st = FLAGS_env->NowNanos();
    db->Get(ReadOptions(), db->DefaultColumnFamily(), key, &p_slice);
    s.MakeVector(data, p_slice);
    bool found =
        binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
    std::cout << "Found key? " << std::to_string(found) << "\n";
    uint64_t sp = FLAGS_env->NowNanos();
    std::cout << "Get: " << (sp - st) / 1000000000.0 << " seconds\n";
    std::string* dat_ = p_slice.GetSelf();
    std::cout << "Sample data from Get API call: " << dat_->substr(0, 10)
              << "\n";
    data.clear();

    // GetMergeOperands API call
    std::cout << "--- GetMergeOperands API --- \n";
    std::vector<PinnableSlice> a_slice((kTotalValues / kListSize) + 1);
    st = FLAGS_env->NowNanos();
    int number_of_operands = 0;
    GetMergeOperandsOptions get_merge_operands_options;
    get_merge_operands_options.expected_max_number_of_operands =
        (kTotalValues / 100) + 1;
    db->GetMergeOperands(ReadOptions(), db->DefaultColumnFamily(), key,
                         a_slice.data(), &get_merge_operands_options,
                         &number_of_operands);
    for (PinnableSlice& psl : a_slice) {
      s.MakeVector(data, psl);
      found =
          binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
      data.clear();
      if (found) {
        break;
      }
    }
    std::cout << "Found key? " << std::to_string(found) << "\n";
    sp = FLAGS_env->NowNanos();
    std::cout << "Get Merge operands: " << (sp - st) / 1000000000.0
              << " seconds \n";
    int to_print = 0;
    std::cout << "Sample data from GetMergeOperands API call: ";
    for (PinnableSlice& psl : a_slice) {
      std::cout << "List: " << to_print << " : " << *psl.GetSelf() << "\n";
      if (to_print++ > 2) break;
    }
  }
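
  // Sizing note for the call above: the loop writes kTotalValues / kListSize
  // (100000 / 100 = 1000) merge operands under "my_key", so
  // expected_max_number_of_operands is set to one more than that. If the
  // hint were too small, GetMergeOperands is expected to report a non-OK
  // status rather than silently truncate (per the GetMergeOperands API
  // contract).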
#ifndef ROCKSDB_LITE
  void VerifyChecksum(ThreadState* thread) {
    DB* db = SelectDB(thread);
    ReadOptions ro;
    ro.adaptive_readahead = FLAGS_adaptive_readahead;
    ro.async_io = FLAGS_async_io;
    ro.rate_limiter_priority =
        FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
    ro.readahead_size = FLAGS_readahead_size;
    Status s = db->VerifyChecksum(ro);
    if (!s.ok()) {
      fprintf(stderr, "VerifyChecksum() failed: %s\n", s.ToString().c_str());
    }
  }

  void VerifyFileChecksums(ThreadState* thread) {
    DB* db = SelectDB(thread);
    ReadOptions ro;
    ro.adaptive_readahead = FLAGS_adaptive_readahead;
    ro.async_io = FLAGS_async_io;
    ro.rate_limiter_priority =
        FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
    ro.readahead_size = FLAGS_readahead_size;
    Status s = db->VerifyFileChecksums(ro);
    if (!s.ok()) {
      fprintf(stderr, "VerifyFileChecksums() failed: %s\n",
              s.ToString().c_str());
    }
  }

  // This benchmark stress tests Transactions. For a given --duration (or
  // total number of --writes), a Transaction will perform a read-modify-write
  // to increment the value of a key in each of N (--transaction-sets) sets of
  // keys (where each set has --num keys). If --threads is set, this will be
  // done in parallel.
  //
  // To test transactions, use --transaction_db=true. Not setting this
  // parameter will run the same benchmark without transactions.
  //
  // RandomTransactionVerify() will then validate the correctness of the
  // results by checking if the sum of all keys in each set is the same.
  void RandomTransaction(ThreadState* thread) {
    Duration duration(FLAGS_duration, readwrites_);
    uint16_t num_prefix_ranges = static_cast<uint16_t>(FLAGS_transaction_sets);
    uint64_t transactions_done = 0;

    if (num_prefix_ranges == 0 || num_prefix_ranges > 9999) {
      fprintf(stderr, "invalid value for transaction_sets\n");
      exit(1);
    }

    TransactionOptions txn_options;
    txn_options.lock_timeout = FLAGS_transaction_lock_timeout;
    txn_options.set_snapshot = FLAGS_transaction_set_snapshot;

    RandomTransactionInserter inserter(&thread->rand, write_options_,
                                       read_options_, FLAGS_num,
                                       num_prefix_ranges);

    if (FLAGS_num_multi_db > 1) {
      fprintf(stderr,
              "Cannot run RandomTransaction benchmark with "
              "FLAGS_multi_db > 1.");
      exit(1);
    }

    while (!duration.Done(1)) {
      bool success;

      // RandomTransactionInserter will attempt to insert a key for each
      // # of FLAGS_transaction_sets
      if (FLAGS_optimistic_transaction_db) {
        success = inserter.OptimisticTransactionDBInsert(db_.opt_txn_db);
      } else if (FLAGS_transaction_db) {
        TransactionDB* txn_db = reinterpret_cast<TransactionDB*>(db_.db);
        success = inserter.TransactionDBInsert(txn_db, txn_options);
      } else {
        success = inserter.DBInsert(db_.db);
      }

      if (!success) {
        fprintf(stderr, "Unexpected error: %s\n",
                inserter.GetLastStatus().ToString().c_str());
        exit(1);
      }

      thread->stats.FinishedOps(nullptr, db_.db, 1, kOthers);
      transactions_done++;
    }

    char msg[100];
    if (FLAGS_optimistic_transaction_db || FLAGS_transaction_db) {
      snprintf(msg, sizeof(msg),
               "( transactions:%" PRIu64 " aborts:%" PRIu64 ")",
               transactions_done, inserter.GetFailureCount());
    } else {
      snprintf(msg, sizeof(msg), "( batches:%" PRIu64 " )", transactions_done);
    }
    thread->stats.AddMessage(msg);
    thread->stats.AddBytes(static_cast<int64_t>(inserter.GetBytesInserted()));
  }

  // Verifies consistency of data after RandomTransaction() has been run.
  // Since each iteration of RandomTransaction() incremented a key in each set
  // by the same value, the sum of the keys in each set should be the same.
  void RandomTransactionVerify() {
    if (!FLAGS_transaction_db && !FLAGS_optimistic_transaction_db) {
      // transactions not used, nothing to verify.
      return;
    }

    Status s = RandomTransactionInserter::Verify(
        db_.db, static_cast<uint16_t>(FLAGS_transaction_sets));

    if (s.ok()) {
      fprintf(stdout, "RandomTransactionVerify Success.\n");
    } else {
      fprintf(stdout, "RandomTransactionVerify FAILED!!\n");
    }
  }
#endif  // ROCKSDB_LITE
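
  // Illustrative invocation (values are only examples):
  //   ./db_bench --benchmarks=randomtransaction,randomtransactionverify \
  //     --transaction_db=1 --transaction_sets=4 --threads=8 --num=10000
  // Each committed transaction increments one key in each of the 4 sets;
  // the verify step then checks that all 4 per-set sums match.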

  // Writes and deletes random keys without overwriting keys.
  //
  // This benchmark is intended to partially replicate the behavior of MyRocks
  // secondary indices: All data is stored in keys and updates happen by
  // deleting the old version of the key and inserting the new version.
  void RandomReplaceKeys(ThreadState* thread) {
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    std::vector<uint32_t> counters(FLAGS_numdistinct, 0);
    size_t max_counter = 50;
    RandomGenerator gen;

    Status s;
    DB* db = SelectDB(thread);
    for (int64_t i = 0; i < FLAGS_numdistinct; i++) {
      GenerateKeyFromInt(i * max_counter, FLAGS_num, &key);
      if (user_timestamp_size_ > 0) {
        Slice ts = mock_app_clock_->Allocate(ts_guard.get());
        s = db->Put(write_options_, key, ts, gen.Generate());
      } else {
        s = db->Put(write_options_, key, gen.Generate());
      }
      if (!s.ok()) {
        fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
        exit(1);
      }
    }

    db->GetSnapshot();

    std::default_random_engine generator;
    std::normal_distribution<double> distribution(FLAGS_numdistinct / 2.0,
                                                  FLAGS_stddev);
    Duration duration(FLAGS_duration, FLAGS_num);
    while (!duration.Done(1)) {
      int64_t rnd_id = static_cast<int64_t>(distribution(generator));
      int64_t key_id = std::max(std::min(FLAGS_numdistinct - 1, rnd_id),
                                static_cast<int64_t>(0));
      GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
                         &key);
      if (user_timestamp_size_ > 0) {
        Slice ts = mock_app_clock_->Allocate(ts_guard.get());
        s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key, ts)
                                     : db->Delete(write_options_, key, ts);
      } else {
        s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key)
                                     : db->Delete(write_options_, key);
      }
      if (s.ok()) {
        counters[key_id] = (counters[key_id] + 1) % max_counter;
        GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
                           &key);
        if (user_timestamp_size_ > 0) {
          Slice ts = mock_app_clock_->Allocate(ts_guard.get());
          s = db->Put(write_options_, key, ts, Slice());
        } else {
          s = db->Put(write_options_, key, Slice());
        }
      }

      if (!s.ok()) {
        fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
        exit(1);
      }

      thread->stats.FinishedOps(nullptr, db, 1, kOthers);
    }

    char msg[200];
    snprintf(msg, sizeof(msg),
             "use single deletes: %d, "
             "standard deviation: %lf\n",
             FLAGS_use_single_deletes, FLAGS_stddev);
    thread->stats.AddMessage(msg);
  }

  void TimeSeriesReadOrDelete(ThreadState* thread, bool do_deletion) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t bytes = 0;

    Iterator* iter = nullptr;
    // Only work on single database
    assert(db_.db != nullptr);
    iter = db_.db->NewIterator(read_options_);

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    char value_buffer[256];
    while (true) {
      {
        MutexLock l(&thread->shared->mu);
        if (thread->shared->num_done >= 1) {
          // Write thread has finished
          break;
        }
      }
      if (!FLAGS_use_tailing_iterator) {
        delete iter;
        iter = db_.db->NewIterator(read_options_);
      }
      // Pick an Iterator to use

      int64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
      GenerateKeyFromInt(key_id, FLAGS_num, &key);
      // Reset last 8 bytes to 0
      char* start = const_cast<char*>(key.data());
      start += key.size() - 8;
      memset(start, 0, 8);
      ++read;

      bool key_found = false;
      // Seek the prefix
      for (iter->Seek(key); iter->Valid() && iter->key().starts_with(key);
           iter->Next()) {
        key_found = true;
        // Copy out iterator's value to make sure we read it.
        if (do_deletion) {
          bytes += iter->key().size();
          if (KeyExpired(timestamp_emulator_.get(), iter->key())) {
            thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
            db_.db->Delete(write_options_, iter->key());
          } else {
            break;
          }
        } else {
          bytes += iter->key().size() + iter->value().size();
          thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
          Slice value = iter->value();
          memcpy(value_buffer, value.data(),
                 std::min(value.size(), sizeof(value_buffer)));

          assert(iter->status().ok());
        }
      }
      found += key_found;

      if (thread->shared->read_rate_limiter.get() != nullptr) {
        thread->shared->read_rate_limiter->Request(
            1, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }
    }
    delete iter;

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found,
             read);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }

  void TimeSeriesWrite(ThreadState* thread) {
    // Special thread that keeps writing until other threads are done.
    RandomGenerator gen;
    int64_t bytes = 0;

    // Don't merge stats from this thread with the readers.
    thread->stats.SetExcludeFromMerge();

    std::unique_ptr<RateLimiter> write_rate_limiter;
    if (FLAGS_benchmark_write_rate_limit > 0) {
      write_rate_limiter.reset(
          NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
    }

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    Duration duration(FLAGS_duration, writes_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);

      uint64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
      // Write key id
      GenerateKeyFromInt(key_id, FLAGS_num, &key);
      // Write timestamp

      char* start = const_cast<char*>(key.data());
      char* pos = start + 8;
      int bytes_to_fill =
          std::min(key_size_ - static_cast<int>(pos - start), 8);
      uint64_t timestamp_value = timestamp_emulator_->Get();
      if (port::kLittleEndian) {
        for (int i = 0; i < bytes_to_fill; ++i) {
          pos[i] = (timestamp_value >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
        }
      } else {
        memcpy(pos, static_cast<void*>(&timestamp_value), bytes_to_fill);
      }

      timestamp_emulator_->Inc();

      Status s;
      Slice val = gen.Generate();
      s = db->Put(write_options_, key, val);

      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes = key.size() + val.size();
      thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
      thread->stats.AddBytes(bytes);

      if (FLAGS_benchmark_write_rate_limit > 0) {
        write_rate_limiter->Request(key.size() + val.size(), Env::IO_HIGH,
                                    nullptr /* stats */,
                                    RateLimiter::OpType::kWrite);
      }
    }
  }
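
  // Worked example of the key encoding above (values are illustrative): with
  // key_size_ = 16, pos points 8 bytes into the key and bytes_to_fill = 8, so
  // timestamp_value = 0x0102030405060708 is written big-endian as
  // pos[0..7] = 01 02 03 04 05 06 07 08. Storing the most significant byte
  // first keeps all entries for one key id ordered by timestamp under the
  // default bytewise comparator.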

  void TimeSeries(ThreadState* thread) {
    if (thread->tid > 0) {
      bool do_deletion = FLAGS_expire_style == "delete" &&
                         thread->tid <= FLAGS_num_deletion_threads;
      TimeSeriesReadOrDelete(thread, do_deletion);
    } else {
      TimeSeriesWrite(thread);
      thread->stats.Stop();
      thread->stats.Report("timeseries write");
    }
  }

  void Compact(ThreadState* thread) {
    DB* db = SelectDB(thread);
    CompactRangeOptions cro;
    cro.bottommost_level_compaction =
        BottommostLevelCompaction::kForceOptimized;
    db->CompactRange(cro, nullptr, nullptr);
  }

  void CompactAll() {
    if (db_.db != nullptr) {
      db_.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
    }
    for (const auto& db_with_cfh : multi_dbs_) {
      db_with_cfh.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
    }
  }

#ifndef ROCKSDB_LITE
  void WaitForCompactionHelper(DBWithColumnFamilies& db) {
    // This is an imperfect way of waiting for compaction. The loop and sleep
    // is done because a thread that finishes a compaction job should get a
    // chance to pickup a new compaction job.

    std::vector<std::string> keys = {DB::Properties::kMemTableFlushPending,
                                     DB::Properties::kNumRunningFlushes,
                                     DB::Properties::kCompactionPending,
                                     DB::Properties::kNumRunningCompactions};

    fprintf(stdout, "waitforcompaction(%s): started\n",
            db.db->GetName().c_str());

    while (true) {
      bool retry = false;

      for (const auto& k : keys) {
        uint64_t v;
        if (!db.db->GetIntProperty(k, &v)) {
          fprintf(stderr, "waitforcompaction(%s): GetIntProperty(%s) failed\n",
                  db.db->GetName().c_str(), k.c_str());
          exit(1);
        } else if (v > 0) {
          fprintf(stdout,
                  "waitforcompaction(%s): active(%s). Sleep 10 seconds\n",
                  db.db->GetName().c_str(), k.c_str());
          FLAGS_env->SleepForMicroseconds(10 * 1000000);
          retry = true;
          break;
        }
      }

      if (!retry) {
        fprintf(stdout, "waitforcompaction(%s): finished\n",
                db.db->GetName().c_str());
        break;
      }
    }
  }

  void WaitForCompaction() {
    // Give background threads a chance to wake
    FLAGS_env->SleepForMicroseconds(5 * 1000000);

    // I am skeptical that this check is race free. I hope that checking twice
    // reduces the chance.
    if (db_.db != nullptr) {
      WaitForCompactionHelper(db_);
      WaitForCompactionHelper(db_);
    } else {
      for (auto& db_with_cfh : multi_dbs_) {
        WaitForCompactionHelper(db_with_cfh);
        WaitForCompactionHelper(db_with_cfh);
      }
    }
  }

  bool CompactLevelHelper(DBWithColumnFamilies& db_with_cfh, int from_level) {
    std::vector<LiveFileMetaData> files;
    db_with_cfh.db->GetLiveFilesMetaData(&files);

    assert(from_level == 0 || from_level == 1);

    int real_from_level = from_level;
    if (real_from_level > 0) {
      // With dynamic leveled compaction the first level with data beyond L0
      // might not be L1.
      real_from_level = std::numeric_limits<int>::max();

      for (auto& f : files) {
        if (f.level > 0 && f.level < real_from_level) real_from_level = f.level;
      }

      if (real_from_level == std::numeric_limits<int>::max()) {
        fprintf(stdout, "compact%d found 0 files to compact\n", from_level);
        return true;
      }
    }

    // The goal is to compact from from_level to the level that follows it,
    // and with dynamic leveled compaction the next level might not be
    // real_from_level+1
    int next_level = std::numeric_limits<int>::max();

    std::vector<std::string> files_to_compact;
    for (auto& f : files) {
      if (f.level == real_from_level)
        files_to_compact.push_back(f.name);
      else if (f.level > real_from_level && f.level < next_level)
        next_level = f.level;
    }

    if (files_to_compact.empty()) {
      fprintf(stdout, "compact%d found 0 files to compact\n", from_level);
      return true;
    } else if (next_level == std::numeric_limits<int>::max()) {
      // There is no data beyond real_from_level. So we are done.
      fprintf(stdout, "compact%d found no data beyond L%d\n", from_level,
              real_from_level);
      return true;
    }

    fprintf(stdout, "compact%d found %d files to compact from L%d to L%d\n",
            from_level, static_cast<int>(files_to_compact.size()),
            real_from_level, next_level);

    ROCKSDB_NAMESPACE::CompactionOptions options;
    // Lets RocksDB use the configured compression for this level
    options.compression = ROCKSDB_NAMESPACE::kDisableCompressionOption;

    ROCKSDB_NAMESPACE::ColumnFamilyDescriptor cfDesc;
    db_with_cfh.db->DefaultColumnFamily()->GetDescriptor(&cfDesc);
    options.output_file_size_limit = cfDesc.options.target_file_size_base;

    Status status =
        db_with_cfh.db->CompactFiles(options, files_to_compact, next_level);
    if (!status.ok()) {
      // This can fail for valid reasons including the operation was aborted
      // or a filename is invalid because background compaction removed it.
      // Having read the current cases for which an error is raised I prefer
      // not to figure out whether an exception should be thrown here.
      fprintf(stderr, "compact%d CompactFiles failed: %s\n", from_level,
              status.ToString().c_str());
      return false;
    }
    return true;
  }
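
  // Worked example of the level selection above (file layout is
  // illustrative): with live files on L0, L4 and L5 under dynamic leveled
  // compaction, CompactLevelHelper(db, 1) resolves real_from_level = 4 (the
  // first non-empty level below L0) and next_level = 5, then submits every
  // L4 file to CompactFiles with L5 as the output level.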

  void CompactLevel(int from_level) {
    if (db_.db != nullptr) {
      while (!CompactLevelHelper(db_, from_level)) WaitForCompaction();
    }
    for (auto& db_with_cfh : multi_dbs_) {
      while (!CompactLevelHelper(db_with_cfh, from_level)) WaitForCompaction();
    }
  }
#endif  // ROCKSDB_LITE

  void Flush() {
    FlushOptions flush_opt;
    flush_opt.wait = true;

    if (db_.db != nullptr) {
      Status s;
      if (FLAGS_num_column_families > 1) {
        s = db_.db->Flush(flush_opt, db_.cfh);
      } else {
        s = db_.db->Flush(flush_opt, db_.db->DefaultColumnFamily());
      }

      if (!s.ok()) {
        fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str());
        exit(1);
      }
    } else {
      for (const auto& db_with_cfh : multi_dbs_) {
        Status s;
        if (FLAGS_num_column_families > 1) {
          s = db_with_cfh.db->Flush(flush_opt, db_with_cfh.cfh);
        } else {
          s = db_with_cfh.db->Flush(flush_opt,
                                    db_with_cfh.db->DefaultColumnFamily());
        }

        if (!s.ok()) {
          fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str());
          exit(1);
        }
      }
    }
    fprintf(stdout, "flush memtable\n");
  }

  void ResetStats() {
    if (db_.db != nullptr) {
      db_.db->ResetStats();
    }
    for (const auto& db_with_cfh : multi_dbs_) {
      db_with_cfh.db->ResetStats();
    }
  }

  void PrintStatsHistory() {
    if (db_.db != nullptr) {
      PrintStatsHistoryImpl(db_.db, false);
    }
    for (const auto& db_with_cfh : multi_dbs_) {
      PrintStatsHistoryImpl(db_with_cfh.db, true);
    }
  }

  void PrintStatsHistoryImpl(DB* db, bool print_header) {
    if (print_header) {
      fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
    }

    std::unique_ptr<StatsHistoryIterator> shi;
    Status s =
        db->GetStatsHistory(0, std::numeric_limits<uint64_t>::max(), &shi);
    if (!s.ok()) {
      fprintf(stdout, "%s\n", s.ToString().c_str());
      return;
    }
    assert(shi);
    while (shi->Valid()) {
      uint64_t stats_time = shi->GetStatsTime();
      fprintf(stdout, "------ %s ------\n",
              TimeToHumanString(static_cast<int>(stats_time)).c_str());
      for (auto& entry : shi->GetStatsMap()) {
        fprintf(stdout, " %" PRIu64 "  %s  %" PRIu64 "\n", stats_time,
                entry.first.c_str(), entry.second);
      }
      shi->Next();
    }
  }

  void PrintStats(const char* key) {
    if (db_.db != nullptr) {
      PrintStats(db_.db, key, false);
    }
    for (const auto& db_with_cfh : multi_dbs_) {
      PrintStats(db_with_cfh.db, key, true);
    }
  }

  void PrintStats(DB* db, const char* key, bool print_header = false) {
    if (print_header) {
      fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
    }
    std::string stats;
    if (!db->GetProperty(key, &stats)) {
      stats = "(failed)";
    }
    fprintf(stdout, "\n%s\n", stats.c_str());
  }

  void PrintStats(const std::vector<std::string>& keys) {
    if (db_.db != nullptr) {
      PrintStats(db_.db, keys);
    }
    for (const auto& db_with_cfh : multi_dbs_) {
      PrintStats(db_with_cfh.db, keys, true);
    }
  }

  void PrintStats(DB* db, const std::vector<std::string>& keys,
                  bool print_header = false) {
    if (print_header) {
      fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
    }

    for (const auto& key : keys) {
      std::string stats;
      if (!db->GetProperty(key, &stats)) {
        stats = "(failed)";
      }
      fprintf(stdout, "%s: %s\n", key.c_str(), stats.c_str());
    }
  }

#ifndef ROCKSDB_LITE

  void Replay(ThreadState* thread) {
    if (db_.db != nullptr) {
      Replay(thread, &db_);
    }
  }

  void Replay(ThreadState* /*thread*/, DBWithColumnFamilies* db_with_cfh) {
    Status s;
    std::unique_ptr<TraceReader> trace_reader;
    s = NewFileTraceReader(FLAGS_env, EnvOptions(), FLAGS_trace_file,
                           &trace_reader);
    if (!s.ok()) {
      fprintf(
          stderr,
          "Encountered an error creating a TraceReader from the trace file. "
          "Error: %s\n",
          s.ToString().c_str());
      exit(1);
    }
    std::unique_ptr<Replayer> replayer;
    s = db_with_cfh->db->NewDefaultReplayer(db_with_cfh->cfh,
                                            std::move(trace_reader), &replayer);
    if (!s.ok()) {
      fprintf(stderr,
              "Encountered an error creating a default Replayer. "
              "Error: %s\n",
              s.ToString().c_str());
      exit(1);
    }
    s = replayer->Prepare();
    if (!s.ok()) {
      fprintf(stderr, "Prepare for replay failed. Error: %s\n",
              s.ToString().c_str());
    }
    s = replayer->Replay(
        ReplayOptions(static_cast<uint32_t>(FLAGS_trace_replay_threads),
                      FLAGS_trace_replay_fast_forward),
        nullptr);
    replayer.reset();
    if (s.ok()) {
      fprintf(stdout, "Replay completed from trace_file: %s\n",
              FLAGS_trace_file.c_str());
    } else {
      fprintf(stderr, "Replay failed. Error: %s\n", s.ToString().c_str());
    }
  }

  void Backup(ThreadState* thread) {
    DB* db = SelectDB(thread);
    std::unique_ptr<BackupEngineOptions> engine_options(
        new BackupEngineOptions(FLAGS_backup_dir));
    Status s;
    BackupEngine* backup_engine;
    if (FLAGS_backup_rate_limit > 0) {
      engine_options->backup_rate_limiter.reset(NewGenericRateLimiter(
          FLAGS_backup_rate_limit, 100000 /* refill_period_us */,
          10 /* fairness */, RateLimiter::Mode::kAllIo));
    }
    // Build new backup of the entire DB
    engine_options->destroy_old_data = true;
    s = BackupEngine::Open(FLAGS_env, *engine_options, &backup_engine);
    assert(s.ok());
    s = backup_engine->CreateNewBackup(db);
    assert(s.ok());
    std::vector<BackupInfo> backup_info;
    backup_engine->GetBackupInfo(&backup_info);
    // Verify that a new backup is created
    assert(backup_info.size() == 1);
    delete backup_engine;
  }

  void Restore(ThreadState* /* thread */) {
    std::unique_ptr<BackupEngineOptions> engine_options(
        new BackupEngineOptions(FLAGS_backup_dir));
    if (FLAGS_restore_rate_limit > 0) {
      engine_options->restore_rate_limiter.reset(NewGenericRateLimiter(
          FLAGS_restore_rate_limit, 100000 /* refill_period_us */,
          10 /* fairness */, RateLimiter::Mode::kAllIo));
    }
    BackupEngineReadOnly* backup_engine;
    Status s =
        BackupEngineReadOnly::Open(FLAGS_env, *engine_options, &backup_engine);
    assert(s.ok());
    s = backup_engine->RestoreDBFromLatestBackup(FLAGS_restore_dir,
                                                 FLAGS_restore_dir);
    assert(s.ok());
    delete backup_engine;
  }

#endif  // ROCKSDB_LITE
};

int db_bench_tool(int argc, char** argv) {
  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
  ConfigOptions config_options;
  static bool initialized = false;
  if (!initialized) {
    SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
                    " [OPTIONS]...");
    SetVersionString(GetRocksVersionAsString(true));
    initialized = true;
  }
  ParseCommandLineFlags(&argc, &argv, true);
  FLAGS_compaction_style_e =
      (ROCKSDB_NAMESPACE::CompactionStyle)FLAGS_compaction_style;
#ifndef ROCKSDB_LITE
  if (FLAGS_statistics && !FLAGS_statistics_string.empty()) {
    fprintf(stderr,
            "Cannot provide both --statistics and --statistics_string.\n");
    exit(1);
  }
  if (!FLAGS_statistics_string.empty()) {
    Status s = Statistics::CreateFromString(config_options,
                                            FLAGS_statistics_string, &dbstats);
    if (dbstats == nullptr) {
      fprintf(stderr,
              "No Statistics registered matching string: %s status=%s\n",
              FLAGS_statistics_string.c_str(), s.ToString().c_str());
      exit(1);
    }
  }
#endif  // ROCKSDB_LITE
  if (FLAGS_statistics) {
    dbstats = ROCKSDB_NAMESPACE::CreateDBStatistics();
  }
  if (dbstats) {
    dbstats->set_stats_level(static_cast<StatsLevel>(FLAGS_stats_level));
  }
  FLAGS_compaction_pri_e =
      (ROCKSDB_NAMESPACE::CompactionPri)FLAGS_compaction_pri;

  std::vector<std::string> fanout = ROCKSDB_NAMESPACE::StringSplit(
      FLAGS_max_bytes_for_level_multiplier_additional, ',');
  for (size_t j = 0; j < fanout.size(); j++) {
    FLAGS_max_bytes_for_level_multiplier_additional_v.push_back(
        std::stoi(fanout[j]));
  }

  FLAGS_compression_type_e =
      StringToCompressionType(FLAGS_compression_type.c_str());

  FLAGS_wal_compression_e =
      StringToCompressionType(FLAGS_wal_compression.c_str());

  FLAGS_compressed_secondary_cache_compression_type_e = StringToCompressionType(
      FLAGS_compressed_secondary_cache_compression_type.c_str());

#ifndef ROCKSDB_LITE
  // Stacked BlobDB
  FLAGS_blob_db_compression_type_e =
      StringToCompressionType(FLAGS_blob_db_compression_type.c_str());

  int env_opts = !FLAGS_env_uri.empty() + !FLAGS_fs_uri.empty();
  if (env_opts > 1) {
    fprintf(stderr, "Error: --env_uri and --fs_uri are mutually exclusive\n");
    exit(1);
  }

  if (env_opts == 1) {
    Status s = Env::CreateFromUri(config_options, FLAGS_env_uri, FLAGS_fs_uri,
                                  &FLAGS_env, &env_guard);
    if (!s.ok()) {
      fprintf(stderr, "Failed creating env: %s\n", s.ToString().c_str());
      exit(1);
    }
  } else if (FLAGS_simulate_hdd || FLAGS_simulate_hybrid_fs_file != "") {
    //**TODO: Make the simulate fs something that can be loaded
    // from the ObjectRegistry...
    static std::shared_ptr<ROCKSDB_NAMESPACE::Env> composite_env =
        NewCompositeEnv(std::make_shared<SimulatedHybridFileSystem>(
            FileSystem::Default(), FLAGS_simulate_hybrid_fs_file,
            /*throughput_multiplier=*/
            int{FLAGS_simulate_hybrid_hdd_multipliers},
            /*is_full_fs_warm=*/FLAGS_simulate_hdd));
    FLAGS_env = composite_env.get();
  }

  // Let -readonly imply -use_existing_db
  FLAGS_use_existing_db |= FLAGS_readonly;
#endif  // ROCKSDB_LITE

  if (FLAGS_build_info) {
    std::string build_info;
    std::cout << GetRocksBuildInfoAsString(build_info, true) << std::endl;
    // Similar to --version, nothing else will be done when this flag is set
    exit(0);
  }

  if (!FLAGS_seed) {
    uint64_t now = FLAGS_env->GetSystemClock()->NowMicros();
    seed_base = static_cast<int64_t>(now);
    fprintf(stdout, "Set seed to %" PRIu64 " because --seed was 0\n",
            seed_base);
  } else {
    seed_base = FLAGS_seed;
  }

  if (FLAGS_use_existing_keys && !FLAGS_use_existing_db) {
    fprintf(stderr,
            "`-use_existing_db` must be true for `-use_existing_keys` to be "
            "settable\n");
    exit(1);
  }

  if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NONE"))
    FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NONE;
  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NORMAL"))
    FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NORMAL;
  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "SEQUENTIAL"))
    FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::SEQUENTIAL;
  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "WILLNEED"))
    FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::WILLNEED;
  else {
    fprintf(stdout, "Unknown compaction fadvice:%s\n",
            FLAGS_compaction_fadvice.c_str());
    exit(1);
  }

  FLAGS_value_size_distribution_type_e =
      StringToDistributionType(FLAGS_value_size_distribution_type.c_str());

  // Note options sanitization may increase thread pool sizes according to
  // max_background_flushes/max_background_compactions/max_background_jobs
  FLAGS_env->SetBackgroundThreads(FLAGS_num_high_pri_threads,
                                  ROCKSDB_NAMESPACE::Env::Priority::HIGH);
  FLAGS_env->SetBackgroundThreads(FLAGS_num_bottom_pri_threads,
                                  ROCKSDB_NAMESPACE::Env::Priority::BOTTOM);
  FLAGS_env->SetBackgroundThreads(FLAGS_num_low_pri_threads,
                                  ROCKSDB_NAMESPACE::Env::Priority::LOW);

  // Choose a location for the test database if none given with --db=<path>
  if (FLAGS_db.empty()) {
    std::string default_db_path;
    FLAGS_env->GetTestDirectory(&default_db_path);
    default_db_path += "/dbbench";
    FLAGS_db = default_db_path;
  }

  if (FLAGS_backup_dir.empty()) {
    FLAGS_backup_dir = FLAGS_db + "/backup";
  }

  if (FLAGS_restore_dir.empty()) {
    FLAGS_restore_dir = FLAGS_db + "/restore";
  }

  if (FLAGS_stats_interval_seconds > 0) {
    // When both are set then FLAGS_stats_interval determines the frequency
    // at which the timer is checked for FLAGS_stats_interval_seconds
    FLAGS_stats_interval = 1000;
  }

  if (FLAGS_seek_missing_prefix && FLAGS_prefix_size <= 8) {
    fprintf(stderr, "prefix_size > 8 required by --seek_missing_prefix\n");
    exit(1);
  }

  ROCKSDB_NAMESPACE::Benchmark benchmark;
  benchmark.Run();

#ifndef ROCKSDB_LITE
  if (FLAGS_print_malloc_stats) {
    std::string stats_string;
    ROCKSDB_NAMESPACE::DumpMallocStats(&stats_string);
    fprintf(stdout, "Malloc stats:\n%s\n", stats_string.c_str());
  }
#endif  // ROCKSDB_LITE

  return 0;
}
}  // namespace ROCKSDB_NAMESPACE