]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/tools/db_bench_tool.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / tools / db_bench_tool.cc
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10 #ifdef GFLAGS
11 #ifdef NUMA
12 #include <numa.h>
13 #endif
14 #ifndef OS_WIN
15 #include <unistd.h>
16 #endif
17 #include <fcntl.h>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <sys/types.h>
21 #ifdef __APPLE__
22 #include <mach/host_info.h>
23 #include <mach/mach_host.h>
24 #include <sys/sysctl.h>
25 #endif
26 #ifdef __FreeBSD__
27 #include <sys/sysctl.h>
28 #endif
29 #include <atomic>
30 #include <cinttypes>
31 #include <condition_variable>
32 #include <cstddef>
33 #include <iostream>
34 #include <memory>
35 #include <mutex>
36 #include <queue>
37 #include <thread>
38 #include <unordered_map>
39
40 #include "db/db_impl/db_impl.h"
41 #include "db/malloc_stats.h"
42 #include "db/version_set.h"
43 #include "monitoring/histogram.h"
44 #include "monitoring/statistics.h"
45 #include "options/cf_options.h"
46 #include "port/port.h"
47 #include "port/stack_trace.h"
48 #include "rocksdb/cache.h"
49 #include "rocksdb/convenience.h"
50 #include "rocksdb/db.h"
51 #include "rocksdb/env.h"
52 #include "rocksdb/filter_policy.h"
53 #include "rocksdb/memtablerep.h"
54 #include "rocksdb/options.h"
55 #include "rocksdb/perf_context.h"
56 #include "rocksdb/persistent_cache.h"
57 #include "rocksdb/rate_limiter.h"
58 #include "rocksdb/secondary_cache.h"
59 #include "rocksdb/slice.h"
60 #include "rocksdb/slice_transform.h"
61 #include "rocksdb/stats_history.h"
62 #include "rocksdb/table.h"
63 #include "rocksdb/utilities/backup_engine.h"
64 #include "rocksdb/utilities/object_registry.h"
65 #include "rocksdb/utilities/optimistic_transaction_db.h"
66 #include "rocksdb/utilities/options_type.h"
67 #include "rocksdb/utilities/options_util.h"
68 #ifndef ROCKSDB_LITE
69 #include "rocksdb/utilities/replayer.h"
70 #endif // ROCKSDB_LITE
71 #include "rocksdb/utilities/sim_cache.h"
72 #include "rocksdb/utilities/transaction.h"
73 #include "rocksdb/utilities/transaction_db.h"
74 #include "rocksdb/write_batch.h"
75 #include "test_util/testutil.h"
76 #include "test_util/transaction_test_util.h"
77 #include "tools/simulated_hybrid_file_system.h"
78 #include "util/cast_util.h"
79 #include "util/compression.h"
80 #include "util/crc32c.h"
81 #include "util/file_checksum_helper.h"
82 #include "util/gflags_compat.h"
83 #include "util/mutexlock.h"
84 #include "util/random.h"
85 #include "util/stderr_logger.h"
86 #include "util/string_util.h"
87 #include "util/xxhash.h"
88 #include "utilities/blob_db/blob_db.h"
89 #include "utilities/counted_fs.h"
90 #include "utilities/merge_operators.h"
91 #include "utilities/merge_operators/bytesxor.h"
92 #include "utilities/merge_operators/sortlist.h"
93 #include "utilities/persistent_cache/block_cache_tier.h"
94
95 #ifdef MEMKIND
96 #include "memory/memkind_kmem_allocator.h"
97 #endif
98
99 #ifdef OS_WIN
100 #include <io.h> // open/close
101 #endif
102
103 using GFLAGS_NAMESPACE::ParseCommandLineFlags;
104 using GFLAGS_NAMESPACE::RegisterFlagValidator;
105 using GFLAGS_NAMESPACE::SetUsageMessage;
106 using GFLAGS_NAMESPACE::SetVersionString;
107
108 #ifdef ROCKSDB_LITE
109 #define IF_ROCKSDB_LITE(Then, Else) Then
110 #else
111 #define IF_ROCKSDB_LITE(Then, Else) Else
112 #endif
113
114 DEFINE_string(
115 benchmarks,
116 "fillseq,"
117 "fillseqdeterministic,"
118 "fillsync,"
119 "fillrandom,"
120 "filluniquerandomdeterministic,"
121 "overwrite,"
122 "readrandom,"
123 "newiterator,"
124 "newiteratorwhilewriting,"
125 "seekrandom,"
126 "seekrandomwhilewriting,"
127 "seekrandomwhilemerging,"
128 "readseq,"
129 "readreverse,"
130 "compact,"
131 "compactall,"
132 "flush,"
133 IF_ROCKSDB_LITE("",
134 "compact0,"
135 "compact1,"
136 "waitforcompaction,"
137 )
138 "multireadrandom,"
139 "mixgraph,"
140 "readseq,"
141 "readtorowcache,"
142 "readtocache,"
143 "readreverse,"
144 "readwhilewriting,"
145 "readwhilemerging,"
146 "readwhilescanning,"
147 "readrandomwriterandom,"
148 "updaterandom,"
149 "xorupdaterandom,"
150 "approximatesizerandom,"
151 "randomwithverify,"
152 "fill100K,"
153 "crc32c,"
154 "xxhash,"
155 "xxhash64,"
156 "xxh3,"
157 "compress,"
158 "uncompress,"
159 "acquireload,"
160 "fillseekseq,"
161 "randomtransaction,"
162 "randomreplacekeys,"
163 "timeseries,"
164 "getmergeoperands,",
165 "readrandomoperands,"
166 "backup,"
167 "restore"
168
169 "Comma-separated list of operations to run in the specified"
170 " order. Available benchmarks:\n"
171 "\tfillseq -- write N values in sequential key"
172 " order in async mode\n"
173 "\tfillseqdeterministic -- write N values in the specified"
174 " key order and keep the shape of the LSM tree\n"
175 "\tfillrandom -- write N values in random key order in async"
176 " mode\n"
177 "\tfilluniquerandomdeterministic -- write N values in a random"
178 " key order and keep the shape of the LSM tree\n"
179 "\toverwrite -- overwrite N values in random key order in "
180 "async mode\n"
181 "\tfillsync -- write N/1000 values in random key order in "
182 "sync mode\n"
183 "\tfill100K -- write N/1000 100K values in random order in"
184 " async mode\n"
185 "\tdeleteseq -- delete N keys in sequential order\n"
186 "\tdeleterandom -- delete N keys in random order\n"
187 "\treadseq -- read N times sequentially\n"
188 "\treadtocache -- 1 thread reading database sequentially\n"
189 "\treadreverse -- read N times in reverse order\n"
190 "\treadrandom -- read N times in random order\n"
191 "\treadmissing -- read N missing keys in random order\n"
192 "\treadwhilewriting -- 1 writer, N threads doing random "
193 "reads\n"
194 "\treadwhilemerging -- 1 merger, N threads doing random "
195 "reads\n"
196 "\treadwhilescanning -- 1 thread doing full table scan, "
197 "N threads doing random reads\n"
198 "\treadrandomwriterandom -- N threads doing random-read, "
199 "random-write\n"
200 "\tupdaterandom -- N threads doing read-modify-write for random "
201 "keys\n"
202 "\txorupdaterandom -- N threads doing read-XOR-write for "
203 "random keys\n"
204 "\tappendrandom -- N threads doing read-modify-write with "
205 "growing values\n"
206 "\tmergerandom -- same as updaterandom/appendrandom using merge"
207 " operator. "
208 "Must be used with merge_operator\n"
209 "\treadrandommergerandom -- perform N random read-or-merge "
210 "operations. Must be used with merge_operator\n"
211 "\tnewiterator -- repeated iterator creation\n"
212 "\tseekrandom -- N random seeks, call Next seek_nexts times "
213 "per seek\n"
214 "\tseekrandomwhilewriting -- seekrandom and 1 thread doing "
215 "overwrite\n"
216 "\tseekrandomwhilemerging -- seekrandom and 1 thread doing "
217 "merge\n"
218 "\tcrc32c -- repeated crc32c of <block size> data\n"
219 "\txxhash -- repeated xxHash of <block size> data\n"
220 "\txxhash64 -- repeated xxHash64 of <block size> data\n"
221 "\txxh3 -- repeated XXH3 of <block size> data\n"
222 "\tacquireload -- load N*1000 times\n"
223 "\tfillseekseq -- write N values in sequential key, then read "
224 "them by seeking to each key\n"
225 "\trandomtransaction -- execute N random transactions and "
226 "verify correctness\n"
227 "\trandomreplacekeys -- randomly replaces N keys by deleting "
228 "the old version and putting the new version\n\n"
229 "\ttimeseries -- 1 writer generates time series data "
230 "and multiple readers doing random reads on id\n\n"
231 "Meta operations:\n"
232 "\tcompact -- Compact the entire DB; If multiple, randomly choose one\n"
233 "\tcompactall -- Compact the entire DB\n"
234 IF_ROCKSDB_LITE("",
235 "\tcompact0 -- compact L0 into L1\n"
236 "\tcompact1 -- compact L1 into L2\n"
237 "\twaitforcompaction - pause until compaction is (probably) done\n"
238 )
239 "\tflush - flush the memtable\n"
240 "\tstats -- Print DB stats\n"
241 "\tresetstats -- Reset DB stats\n"
242 "\tlevelstats -- Print the number of files and bytes per level\n"
243 "\tmemstats -- Print memtable stats\n"
244 "\tsstables -- Print sstable info\n"
245 "\theapprofile -- Dump a heap profile (if supported by this port)\n"
246 IF_ROCKSDB_LITE("",
247 "\treplay -- replay the trace file specified with trace_file\n"
248 )
249 "\tgetmergeoperands -- Insert lots of merge records which are a list of "
250 "sorted ints for a key and then compare performance of lookup for another "
251 "key by doing a Get followed by binary searching in the large sorted list "
252 "vs doing a GetMergeOperands and binary searching in the operands which "
253 "are sorted sub-lists. The MergeOperator used is sortlist.h\n"
254 "\treadrandomoperands -- read random keys using `GetMergeOperands()`. An "
255 "operation includes a rare but possible retry in case it got "
256 "`Status::Incomplete()`. This happens upon encountering more keys than "
257 "have ever been seen by the thread (or eight initially)\n"
258 "\tbackup -- Create a backup of the current DB and verify that a new backup is corrected. "
259 "Rate limit can be specified through --backup_rate_limit\n"
260 "\trestore -- Restore the DB from the latest backup available, rate limit can be specified through --restore_rate_limit\n");
261
262 DEFINE_int64(num, 1000000, "Number of key/values to place in database");
263
264 DEFINE_int64(numdistinct, 1000,
265 "Number of distinct keys to use. Used in RandomWithVerify to "
266 "read/write on fewer keys so that gets are more likely to find the"
267 " key and puts are more likely to update the same key");
268
269 DEFINE_int64(merge_keys, -1,
270 "Number of distinct keys to use for MergeRandom and "
271 "ReadRandomMergeRandom. "
272 "If negative, there will be FLAGS_num keys.");
273 DEFINE_int32(num_column_families, 1, "Number of Column Families to use.");
274
275 DEFINE_int32(
276 num_hot_column_families, 0,
277 "Number of Hot Column Families. If more than 0, only write to this "
278 "number of column families. After finishing all the writes to them, "
279 "create new set of column families and insert to them. Only used "
280 "when num_column_families > 1.");
281
282 DEFINE_string(column_family_distribution, "",
283 "Comma-separated list of percentages, where the ith element "
284 "indicates the probability of an op using the ith column family. "
285 "The number of elements must be `num_hot_column_families` if "
286 "specified; otherwise, it must be `num_column_families`. The "
287 "sum of elements must be 100. E.g., if `num_column_families=4`, "
288 "and `num_hot_column_families=0`, a valid list could be "
289 "\"10,20,30,40\".");
290
291 DEFINE_int64(reads, -1,
292 "Number of read operations to do. "
293 "If negative, do FLAGS_num reads.");
294
295 DEFINE_int64(deletes, -1,
296 "Number of delete operations to do. "
297 "If negative, do FLAGS_num deletions.");
298
299 DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality");
300
301 DEFINE_int64(seed, 0,
302 "Seed base for random number generators. "
303 "When 0 it is derived from the current time.");
304 static int64_t seed_base;
305
306 DEFINE_int32(threads, 1, "Number of concurrent threads to run.");
307
308 DEFINE_int32(duration, 0,
309 "Time in seconds for the random-ops tests to run."
310 " When 0 then num & reads determine the test duration");
311
312 DEFINE_string(value_size_distribution_type, "fixed",
313 "Value size distribution type: fixed, uniform, normal");
314
315 DEFINE_int32(value_size, 100, "Size of each value in fixed distribution");
316 static unsigned int value_size = 100;
317
318 DEFINE_int32(value_size_min, 100, "Min size of random value");
319
320 DEFINE_int32(value_size_max, 102400, "Max size of random value");
321
322 DEFINE_int32(seek_nexts, 0,
323 "How many times to call Next() after Seek() in "
324 "fillseekseq, seekrandom, seekrandomwhilewriting and "
325 "seekrandomwhilemerging");
326
327 DEFINE_bool(reverse_iterator, false,
328 "When true use Prev rather than Next for iterators that do "
329 "Seek and then Next");
330
331 DEFINE_bool(auto_prefix_mode, false, "Set auto_prefix_mode for seek benchmark");
332
333 DEFINE_int64(max_scan_distance, 0,
334 "Used to define iterate_upper_bound (or iterate_lower_bound "
335 "if FLAGS_reverse_iterator is set to true) when value is nonzero");
336
337 DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator");
338
339 DEFINE_int64(batch_size, 1, "Batch size");
340
341 static bool ValidateKeySize(const char* /*flagname*/, int32_t /*value*/) {
342 return true;
343 }
344
345 static bool ValidateUint32Range(const char* flagname, uint64_t value) {
346 if (value > std::numeric_limits<uint32_t>::max()) {
347 fprintf(stderr, "Invalid value for --%s: %lu, overflow\n", flagname,
348 (unsigned long)value);
349 return false;
350 }
351 return true;
352 }
353
354 DEFINE_int32(key_size, 16, "size of each key");
355
356 DEFINE_int32(user_timestamp_size, 0,
357 "number of bytes in a user-defined timestamp");
358
359 DEFINE_int32(num_multi_db, 0,
360 "Number of DBs used in the benchmark. 0 means single DB.");
361
362 DEFINE_double(compression_ratio, 0.5,
363 "Arrange to generate values that shrink to this fraction of "
364 "their original size after compression");
365
366 DEFINE_double(
367 overwrite_probability, 0.0,
368 "Used in 'filluniquerandom' benchmark: for each write operation, "
369 "we give a probability to perform an overwrite instead. The key used for "
370 "the overwrite is randomly chosen from the last 'overwrite_window_size' "
371 "keys previously inserted into the DB. "
372 "Valid overwrite_probability values: [0.0, 1.0].");
373
374 DEFINE_uint32(overwrite_window_size, 1,
375 "Used in 'filluniquerandom' benchmark. For each write operation,"
376 " when the overwrite_probability flag is set by the user, the "
377 "key used to perform an overwrite is randomly chosen from the "
378 "last 'overwrite_window_size' keys previously inserted into DB. "
379 "Warning: large values can affect throughput. "
380 "Valid overwrite_window_size values: [1, kMaxUint32].");
381
382 DEFINE_uint64(
383 disposable_entries_delete_delay, 0,
384 "Minimum delay in microseconds for the series of Deletes "
385 "to be issued. When 0 the insertion of the last disposable entry is "
386 "immediately followed by the issuance of the Deletes. "
387 "(only compatible with fillanddeleteuniquerandom benchmark).");
388
389 DEFINE_uint64(disposable_entries_batch_size, 0,
390 "Number of consecutively inserted disposable KV entries "
391 "that will be deleted after 'delete_delay' microseconds. "
392 "A series of Deletes is always issued once all the "
393 "disposable KV entries it targets have been inserted "
394 "into the DB. When 0 no deletes are issued and a "
395 "regular 'filluniquerandom' benchmark occurs. "
396 "(only compatible with fillanddeleteuniquerandom benchmark)");
397
398 DEFINE_int32(disposable_entries_value_size, 64,
399 "Size of the values (in bytes) of the entries targeted by "
400 "selective deletes. "
401 "(only compatible with fillanddeleteuniquerandom benchmark)");
402
403 DEFINE_uint64(
404 persistent_entries_batch_size, 0,
405 "Number of KV entries being inserted right before the deletes "
406 "targeting the disposable KV entries are issued. These "
407 "persistent keys are not targeted by the deletes, and will always "
408 "remain valid in the DB. (only compatible with "
409 "--benchmarks='fillanddeleteuniquerandom' "
410 "and used when--disposable_entries_batch_size is > 0).");
411
412 DEFINE_int32(persistent_entries_value_size, 64,
413 "Size of the values (in bytes) of the entries not targeted by "
414 "deletes. (only compatible with "
415 "--benchmarks='fillanddeleteuniquerandom' "
416 "and used when--disposable_entries_batch_size is > 0).");
417
418 DEFINE_double(read_random_exp_range, 0.0,
419 "Read random's key will be generated using distribution of "
420 "num * exp(-r) where r is uniform number from 0 to this value. "
421 "The larger the number is, the more skewed the reads are. "
422 "Only used in readrandom and multireadrandom benchmarks.");
423
424 DEFINE_bool(histogram, false, "Print histogram of operation timings");
425
426 DEFINE_bool(confidence_interval_only, false,
427 "Print 95% confidence interval upper and lower bounds only for "
428 "aggregate stats.");
429
430 DEFINE_bool(enable_numa, false,
431 "Make operations aware of NUMA architecture and bind memory "
432 "and cpus corresponding to nodes together. In NUMA, memory "
433 "in same node as CPUs are closer when compared to memory in "
434 "other nodes. Reads can be faster when the process is bound to "
435 "CPU and memory of same node. Use \"$numactl --hardware\" command "
436 "to see NUMA memory architecture.");
437
438 DEFINE_int64(db_write_buffer_size,
439 ROCKSDB_NAMESPACE::Options().db_write_buffer_size,
440 "Number of bytes to buffer in all memtables before compacting");
441
442 DEFINE_bool(cost_write_buffer_to_cache, false,
443 "The usage of memtable is costed to the block cache");
444
445 DEFINE_int64(arena_block_size, ROCKSDB_NAMESPACE::Options().arena_block_size,
446 "The size, in bytes, of one block in arena memory allocation.");
447
448 DEFINE_int64(write_buffer_size, ROCKSDB_NAMESPACE::Options().write_buffer_size,
449 "Number of bytes to buffer in memtable before compacting");
450
451 DEFINE_int32(max_write_buffer_number,
452 ROCKSDB_NAMESPACE::Options().max_write_buffer_number,
453 "The number of in-memory memtables. Each memtable is of size"
454 " write_buffer_size bytes.");
455
456 DEFINE_int32(min_write_buffer_number_to_merge,
457 ROCKSDB_NAMESPACE::Options().min_write_buffer_number_to_merge,
458 "The minimum number of write buffers that will be merged together"
459 "before writing to storage. This is cheap because it is an"
460 "in-memory merge. If this feature is not enabled, then all these"
461 "write buffers are flushed to L0 as separate files and this "
462 "increases read amplification because a get request has to check"
463 " in all of these files. Also, an in-memory merge may result in"
464 " writing less data to storage if there are duplicate records "
465 " in each of these individual write buffers.");
466
467 DEFINE_int32(max_write_buffer_number_to_maintain,
468 ROCKSDB_NAMESPACE::Options().max_write_buffer_number_to_maintain,
469 "The total maximum number of write buffers to maintain in memory "
470 "including copies of buffers that have already been flushed. "
471 "Unlike max_write_buffer_number, this parameter does not affect "
472 "flushing. This controls the minimum amount of write history "
473 "that will be available in memory for conflict checking when "
474 "Transactions are used. If this value is too low, some "
475 "transactions may fail at commit time due to not being able to "
476 "determine whether there were any write conflicts. Setting this "
477 "value to 0 will cause write buffers to be freed immediately "
478 "after they are flushed. If this value is set to -1, "
479 "'max_write_buffer_number' will be used.");
480
481 DEFINE_int64(max_write_buffer_size_to_maintain,
482 ROCKSDB_NAMESPACE::Options().max_write_buffer_size_to_maintain,
483 "The total maximum size of write buffers to maintain in memory "
484 "including copies of buffers that have already been flushed. "
485 "Unlike max_write_buffer_number, this parameter does not affect "
486 "flushing. This controls the minimum amount of write history "
487 "that will be available in memory for conflict checking when "
488 "Transactions are used. If this value is too low, some "
489 "transactions may fail at commit time due to not being able to "
490 "determine whether there were any write conflicts. Setting this "
491 "value to 0 will cause write buffers to be freed immediately "
492 "after they are flushed. If this value is set to -1, "
493 "'max_write_buffer_number' will be used.");
494
495 DEFINE_int32(max_background_jobs,
496 ROCKSDB_NAMESPACE::Options().max_background_jobs,
497 "The maximum number of concurrent background jobs that can occur "
498 "in parallel.");
499
500 DEFINE_int32(num_bottom_pri_threads, 0,
501 "The number of threads in the bottom-priority thread pool (used "
502 "by universal compaction only).");
503
504 DEFINE_int32(num_high_pri_threads, 0,
505 "The maximum number of concurrent background compactions"
506 " that can occur in parallel.");
507
508 DEFINE_int32(num_low_pri_threads, 0,
509 "The maximum number of concurrent background compactions"
510 " that can occur in parallel.");
511
512 DEFINE_int32(max_background_compactions,
513 ROCKSDB_NAMESPACE::Options().max_background_compactions,
514 "The maximum number of concurrent background compactions"
515 " that can occur in parallel.");
516
517 DEFINE_uint64(subcompactions, 1,
518 "Maximum number of subcompactions to divide L0-L1 compactions "
519 "into.");
520 static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) =
521 RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range);
522
523 DEFINE_int32(max_background_flushes,
524 ROCKSDB_NAMESPACE::Options().max_background_flushes,
525 "The maximum number of concurrent background flushes"
526 " that can occur in parallel.");
527
528 static ROCKSDB_NAMESPACE::CompactionStyle FLAGS_compaction_style_e;
529 DEFINE_int32(compaction_style,
530 (int32_t)ROCKSDB_NAMESPACE::Options().compaction_style,
531 "style of compaction: level-based, universal and fifo");
532
533 static ROCKSDB_NAMESPACE::CompactionPri FLAGS_compaction_pri_e;
534 DEFINE_int32(compaction_pri,
535 (int32_t)ROCKSDB_NAMESPACE::Options().compaction_pri,
536 "priority of files to compaction: by size or by data age");
537
538 DEFINE_int32(universal_size_ratio, 0,
539 "Percentage flexibility while comparing file size "
540 "(for universal compaction only).");
541
542 DEFINE_int32(universal_min_merge_width, 0,
543 "The minimum number of files in a single compaction run "
544 "(for universal compaction only).");
545
546 DEFINE_int32(universal_max_merge_width, 0,
547 "The max number of files to compact in universal style "
548 "compaction");
549
550 DEFINE_int32(universal_max_size_amplification_percent, 0,
551 "The max size amplification for universal style compaction");
552
553 DEFINE_int32(universal_compression_size_percent, -1,
554 "The percentage of the database to compress for universal "
555 "compaction. -1 means compress everything.");
556
557 DEFINE_bool(universal_allow_trivial_move, false,
558 "Allow trivial move in universal compaction.");
559
560 DEFINE_bool(universal_incremental, false,
561 "Enable incremental compactions in universal compaction.");
562
563 DEFINE_int64(cache_size, 8 << 20, // 8MB
564 "Number of bytes to use as a cache of uncompressed data");
565
566 DEFINE_int32(cache_numshardbits, -1,
567 "Number of shards for the block cache"
568 " is 2 ** cache_numshardbits. Negative means use default settings."
569 " This is applied only if FLAGS_cache_size is non-negative.");
570
571 DEFINE_double(cache_high_pri_pool_ratio, 0.0,
572 "Ratio of block cache reserve for high pri blocks. "
573 "If > 0.0, we also enable "
574 "cache_index_and_filter_blocks_with_high_priority.");
575
576 DEFINE_double(cache_low_pri_pool_ratio, 0.0,
577 "Ratio of block cache reserve for low pri blocks.");
578
579 DEFINE_string(cache_type, "lru_cache", "Type of block cache.");
580
581 DEFINE_bool(use_compressed_secondary_cache, false,
582 "Use the CompressedSecondaryCache as the secondary cache.");
583
584 DEFINE_int64(compressed_secondary_cache_size, 8 << 20, // 8MB
585 "Number of bytes to use as a cache of data");
586
587 DEFINE_int32(compressed_secondary_cache_numshardbits, 6,
588 "Number of shards for the block cache"
589 " is 2 ** compressed_secondary_cache_numshardbits."
590 " Negative means use default settings."
591 " This is applied only if FLAGS_cache_size is non-negative.");
592
593 DEFINE_double(compressed_secondary_cache_high_pri_pool_ratio, 0.0,
594 "Ratio of block cache reserve for high pri blocks. "
595 "If > 0.0, we also enable "
596 "cache_index_and_filter_blocks_with_high_priority.");
597
598 DEFINE_double(compressed_secondary_cache_low_pri_pool_ratio, 0.0,
599 "Ratio of block cache reserve for low pri blocks.");
600
601 DEFINE_string(compressed_secondary_cache_compression_type, "lz4",
602 "The compression algorithm to use for large "
603 "values stored in CompressedSecondaryCache.");
604 static enum ROCKSDB_NAMESPACE::CompressionType
605 FLAGS_compressed_secondary_cache_compression_type_e =
606 ROCKSDB_NAMESPACE::kLZ4Compression;
607
608 DEFINE_uint32(
609 compressed_secondary_cache_compress_format_version, 2,
610 "compress_format_version can have two values: "
611 "compress_format_version == 1 -- decompressed size is not included"
612 " in the block header."
613 "compress_format_version == 2 -- decompressed size is included"
614 " in the block header in varint32 format.");
615
616 DEFINE_int64(simcache_size, -1,
617 "Number of bytes to use as a simcache of "
618 "uncompressed data. Nagative value disables simcache.");
619
620 DEFINE_bool(cache_index_and_filter_blocks, false,
621 "Cache index/filter blocks in block cache.");
622
623 DEFINE_bool(use_cache_jemalloc_no_dump_allocator, false,
624 "Use JemallocNodumpAllocator for block/blob cache.");
625
626 DEFINE_bool(use_cache_memkind_kmem_allocator, false,
627 "Use memkind kmem allocator for block/blob cache.");
628
629 DEFINE_bool(partition_index_and_filters, false,
630 "Partition index and filter blocks.");
631
632 DEFINE_bool(partition_index, false, "Partition index blocks");
633
634 DEFINE_bool(index_with_first_key, false, "Include first key in the index");
635
636 DEFINE_bool(
637 optimize_filters_for_memory,
638 ROCKSDB_NAMESPACE::BlockBasedTableOptions().optimize_filters_for_memory,
639 "Minimize memory footprint of filters");
640
641 DEFINE_int64(
642 index_shortening_mode, 2,
643 "mode to shorten index: 0 for no shortening; 1 for only shortening "
644 "separaters; 2 for shortening shortening and successor");
645
646 DEFINE_int64(metadata_block_size,
647 ROCKSDB_NAMESPACE::BlockBasedTableOptions().metadata_block_size,
648 "Max partition size when partitioning index/filters");
649
650 // The default reduces the overhead of reading time with flash. With HDD, which
651 // offers much less throughput, however, this number better to be set to 1.
652 DEFINE_int32(ops_between_duration_checks, 1000,
653 "Check duration limit every x ops");
654
655 DEFINE_bool(pin_l0_filter_and_index_blocks_in_cache, false,
656 "Pin index/filter blocks of L0 files in block cache.");
657
658 DEFINE_bool(
659 pin_top_level_index_and_filter, false,
660 "Pin top-level index of partitioned index/filter blocks in block cache.");
661
662 DEFINE_int32(block_size,
663 static_cast<int32_t>(
664 ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_size),
665 "Number of bytes in a block.");
666
667 DEFINE_int32(format_version,
668 static_cast<int32_t>(
669 ROCKSDB_NAMESPACE::BlockBasedTableOptions().format_version),
670 "Format version of SST files.");
671
672 DEFINE_int32(block_restart_interval,
673 ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_restart_interval,
674 "Number of keys between restart points "
675 "for delta encoding of keys in data block.");
676
677 DEFINE_int32(
678 index_block_restart_interval,
679 ROCKSDB_NAMESPACE::BlockBasedTableOptions().index_block_restart_interval,
680 "Number of keys between restart points "
681 "for delta encoding of keys in index block.");
682
683 DEFINE_int32(read_amp_bytes_per_bit,
684 ROCKSDB_NAMESPACE::BlockBasedTableOptions().read_amp_bytes_per_bit,
685 "Number of bytes per bit to be used in block read-amp bitmap");
686
687 DEFINE_bool(
688 enable_index_compression,
689 ROCKSDB_NAMESPACE::BlockBasedTableOptions().enable_index_compression,
690 "Compress the index block");
691
692 DEFINE_bool(block_align,
693 ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_align,
694 "Align data blocks on page size");
695
696 DEFINE_int64(prepopulate_block_cache, 0,
697 "Pre-populate hot/warm blocks in block cache. 0 to disable and 1 "
698 "to insert during flush");
699
700 DEFINE_bool(use_data_block_hash_index, false,
701 "if use kDataBlockBinaryAndHash "
702 "instead of kDataBlockBinarySearch. "
703 "This is valid if only we use BlockTable");
704
705 DEFINE_double(data_block_hash_table_util_ratio, 0.75,
706 "util ratio for data block hash index table. "
707 "This is only valid if use_data_block_hash_index is "
708 "set to true");
709
710 DEFINE_int64(compressed_cache_size, -1,
711 "Number of bytes to use as a cache of compressed data.");
712
713 DEFINE_int64(row_cache_size, 0,
714 "Number of bytes to use as a cache of individual rows"
715 " (0 = disabled).");
716
717 DEFINE_int32(open_files, ROCKSDB_NAMESPACE::Options().max_open_files,
718 "Maximum number of files to keep open at the same time"
719 " (use default if == 0)");
720
721 DEFINE_int32(file_opening_threads,
722 ROCKSDB_NAMESPACE::Options().max_file_opening_threads,
723 "If open_files is set to -1, this option set the number of "
724 "threads that will be used to open files during DB::Open()");
725
726 DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size");
727
728 DEFINE_int32(log_readahead_size, 0, "WAL and manifest readahead size");
729
730 DEFINE_int32(random_access_max_buffer_size, 1024 * 1024,
731 "Maximum windows randomaccess buffer size");
732
733 DEFINE_int32(writable_file_max_buffer_size, 1024 * 1024,
734 "Maximum write buffer for Writable File");
735
736 DEFINE_int32(bloom_bits, -1,
737 "Bloom filter bits per key. Negative means use default."
738 "Zero disables.");
739
740 DEFINE_bool(use_ribbon_filter, false, "Use Ribbon instead of Bloom filter");
741
742 DEFINE_double(memtable_bloom_size_ratio, 0,
743 "Ratio of memtable size used for bloom filter. 0 means no bloom "
744 "filter.");
745 DEFINE_bool(memtable_whole_key_filtering, false,
746 "Try to use whole key bloom filter in memtables.");
747 DEFINE_bool(memtable_use_huge_page, false,
748 "Try to use huge page in memtables.");
749
750 DEFINE_bool(whole_key_filtering,
751 ROCKSDB_NAMESPACE::BlockBasedTableOptions().whole_key_filtering,
752 "Use whole keys (in addition to prefixes) in SST bloom filter.");
753
754 DEFINE_bool(use_existing_db, false,
755 "If true, do not destroy the existing database. If you set this "
756 "flag and also specify a benchmark that wants a fresh database, "
757 "that benchmark will fail.");
758
759 DEFINE_bool(use_existing_keys, false,
760 "If true, uses existing keys in the DB, "
761 "rather than generating new ones. This involves some startup "
762 "latency to load all keys into memory. It is supported for the "
763 "same read/overwrite benchmarks as `-use_existing_db=true`, which "
764 "must also be set for this flag to be enabled. When this flag is "
765 "set, the value for `-num` will be ignored.");
766
767 DEFINE_bool(show_table_properties, false,
768 "If true, then per-level table"
769 " properties will be printed on every stats-interval when"
770 " stats_interval is set and stats_per_interval is on.");
771
772 DEFINE_string(db, "", "Use the db with the following name.");
773
774 DEFINE_bool(progress_reports, true,
775 "If true, db_bench will report number of finished operations.");
776
777 // Read cache flags
778
779 DEFINE_string(read_cache_path, "",
780 "If not empty string, a read cache will be used in this path");
781
782 DEFINE_int64(read_cache_size, 4LL * 1024 * 1024 * 1024,
783 "Maximum size of the read cache");
784
785 DEFINE_bool(read_cache_direct_write, true,
786 "Whether to use Direct IO for writing to the read cache");
787
788 DEFINE_bool(read_cache_direct_read, true,
789 "Whether to use Direct IO for reading from read cache");
790
791 DEFINE_bool(use_keep_filter, false, "Whether to use a noop compaction filter");
792
793 static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) {
794 if (value >= 20) {
795 fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n", flagname,
796 value);
797 return false;
798 }
799 return true;
800 }
801
802 DEFINE_bool(verify_checksum, true,
803 "Verify checksum for every block read from storage");
804
805 DEFINE_int32(checksum_type,
806 ROCKSDB_NAMESPACE::BlockBasedTableOptions().checksum,
807 "ChecksumType as an int");
808
809 DEFINE_bool(statistics, false, "Database statistics");
810 DEFINE_int32(stats_level, ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers,
811 "stats level for statistics");
812 DEFINE_string(statistics_string, "", "Serialized statistics string");
813 static class std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats;
814
815 DEFINE_int64(writes, -1,
816 "Number of write operations to do. If negative, do --num reads.");
817
818 DEFINE_bool(finish_after_writes, false,
819 "Write thread terminates after all writes are finished");
820
821 DEFINE_bool(sync, false, "Sync all writes to disk");
822
823 DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
824
825 DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
826
827 DEFINE_bool(manual_wal_flush, false,
828 "If true, buffer WAL until buffer is full or a manual FlushWAL().");
829
830 DEFINE_string(wal_compression, "none",
831 "Algorithm to use for WAL compression. none to disable.");
832 static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_wal_compression_e =
833 ROCKSDB_NAMESPACE::kNoCompression;
834
835 DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL");
836
837 DEFINE_string(truth_db, "/dev/shm/truth_db/dbbench",
838 "Truth key/values used when using verify");
839
840 DEFINE_int32(num_levels, 7, "The total number of levels");
841
842 DEFINE_int64(target_file_size_base,
843 ROCKSDB_NAMESPACE::Options().target_file_size_base,
844 "Target file size at level-1");
845
846 DEFINE_int32(target_file_size_multiplier,
847 ROCKSDB_NAMESPACE::Options().target_file_size_multiplier,
848 "A multiplier to compute target level-N file size (N >= 2)");
849
850 DEFINE_uint64(max_bytes_for_level_base,
851 ROCKSDB_NAMESPACE::Options().max_bytes_for_level_base,
852 "Max bytes for level-1");
853
854 DEFINE_bool(level_compaction_dynamic_level_bytes, false,
855 "Whether level size base is dynamic");
856
857 DEFINE_double(max_bytes_for_level_multiplier, 10,
858 "A multiplier to compute max bytes for level-N (N >= 2)");
859
860 static std::vector<int> FLAGS_max_bytes_for_level_multiplier_additional_v;
861 DEFINE_string(max_bytes_for_level_multiplier_additional, "",
862 "A vector that specifies additional fanout per level");
863
864 DEFINE_int32(level0_stop_writes_trigger,
865 ROCKSDB_NAMESPACE::Options().level0_stop_writes_trigger,
866 "Number of files in level-0 that will trigger put stop.");
867
868 DEFINE_int32(level0_slowdown_writes_trigger,
869 ROCKSDB_NAMESPACE::Options().level0_slowdown_writes_trigger,
870 "Number of files in level-0 that will slow down writes.");
871
872 DEFINE_int32(level0_file_num_compaction_trigger,
873 ROCKSDB_NAMESPACE::Options().level0_file_num_compaction_trigger,
874 "Number of files in level-0 when compactions start.");
875
876 DEFINE_uint64(periodic_compaction_seconds,
877 ROCKSDB_NAMESPACE::Options().periodic_compaction_seconds,
878 "Files older than this will be picked up for compaction and"
879 " rewritten to the same level");
880
881 DEFINE_uint64(ttl_seconds, ROCKSDB_NAMESPACE::Options().ttl, "Set options.ttl");
882
883 static bool ValidateInt32Percent(const char* flagname, int32_t value) {
884 if (value <= 0 || value >= 100) {
885 fprintf(stderr, "Invalid value for --%s: %d, 0< pct <100 \n", flagname,
886 value);
887 return false;
888 }
889 return true;
890 }
891 DEFINE_int32(readwritepercent, 90,
892 "Ratio of reads to reads/writes (expressed as percentage) for "
893 "the ReadRandomWriteRandom workload. The default value 90 means "
894 "90% operations out of all reads and writes operations are "
895 "reads. In other words, 9 gets for every 1 put.");
896
897 DEFINE_int32(mergereadpercent, 70,
898 "Ratio of merges to merges&reads (expressed as percentage) for "
899 "the ReadRandomMergeRandom workload. The default value 70 means "
900 "70% out of all read and merge operations are merges. In other "
901 "words, 7 merges for every 3 gets.");
902
903 DEFINE_int32(deletepercent, 2,
904 "Percentage of deletes out of reads/writes/deletes (used in "
905 "RandomWithVerify only). RandomWithVerify "
906 "calculates writepercent as (100 - FLAGS_readwritepercent - "
907 "deletepercent), so deletepercent must be smaller than (100 - "
908 "FLAGS_readwritepercent)");
909
910 DEFINE_bool(optimize_filters_for_hits,
911 ROCKSDB_NAMESPACE::Options().optimize_filters_for_hits,
912 "Optimizes bloom filters for workloads for most lookups return "
913 "a value. For now this doesn't create bloom filters for the max "
914 "level of the LSM to reduce metadata that should fit in RAM. ");
915
916 DEFINE_bool(paranoid_checks, ROCKSDB_NAMESPACE::Options().paranoid_checks,
917 "RocksDB will aggressively check consistency of the data.");
918
919 DEFINE_bool(force_consistency_checks,
920 ROCKSDB_NAMESPACE::Options().force_consistency_checks,
921 "Runs consistency checks on the LSM every time a change is "
922 "applied.");
923
924 DEFINE_bool(check_flush_compaction_key_order,
925 ROCKSDB_NAMESPACE::Options().check_flush_compaction_key_order,
926 "During flush or compaction, check whether keys inserted to "
927 "output files are in order.");
928
929 DEFINE_uint64(delete_obsolete_files_period_micros, 0,
930 "Ignored. Left here for backward compatibility");
931
932 DEFINE_int64(writes_before_delete_range, 0,
933 "Number of writes before DeleteRange is called regularly.");
934
935 DEFINE_int64(writes_per_range_tombstone, 0,
936 "Number of writes between range tombstones");
937
938 DEFINE_int64(range_tombstone_width, 100, "Number of keys in tombstone's range");
939
940 DEFINE_int64(max_num_range_tombstones, 0,
941 "Maximum number of range tombstones to insert.");
942
943 DEFINE_bool(expand_range_tombstones, false,
944 "Expand range tombstone into sequential regular tombstones.");
945
946 #ifndef ROCKSDB_LITE
947 // Transactions Options
948 DEFINE_bool(optimistic_transaction_db, false,
949 "Open a OptimisticTransactionDB instance. "
950 "Required for randomtransaction benchmark.");
951
952 DEFINE_bool(transaction_db, false,
953 "Open a TransactionDB instance. "
954 "Required for randomtransaction benchmark.");
955
956 DEFINE_uint64(transaction_sets, 2,
957 "Number of keys each transaction will "
958 "modify (use in RandomTransaction only). Max: 9999");
959
960 DEFINE_bool(transaction_set_snapshot, false,
961 "Setting to true will have each transaction call SetSnapshot()"
962 " upon creation.");
963
964 DEFINE_int32(transaction_sleep, 0,
965 "Max microseconds to sleep in between "
966 "reading and writing a value (used in RandomTransaction only). ");
967
968 DEFINE_uint64(transaction_lock_timeout, 100,
969 "If using a transaction_db, specifies the lock wait timeout in"
970 " milliseconds before failing a transaction waiting on a lock");
971 DEFINE_string(
972 options_file, "",
973 "The path to a RocksDB options file. If specified, then db_bench will "
974 "run with the RocksDB options in the default column family of the "
975 "specified options file. "
976 "Note that with this setting, db_bench will ONLY accept the following "
977 "RocksDB options related command-line arguments, all other arguments "
978 "that are related to RocksDB options will be ignored:\n"
979 "\t--use_existing_db\n"
980 "\t--use_existing_keys\n"
981 "\t--statistics\n"
982 "\t--row_cache_size\n"
983 "\t--row_cache_numshardbits\n"
984 "\t--enable_io_prio\n"
985 "\t--dump_malloc_stats\n"
986 "\t--num_multi_db\n");
987
988 // FIFO Compaction Options
989 DEFINE_uint64(fifo_compaction_max_table_files_size_mb, 0,
990 "The limit of total table file sizes to trigger FIFO compaction");
991
992 DEFINE_bool(fifo_compaction_allow_compaction, true,
993 "Allow compaction in FIFO compaction.");
994
995 DEFINE_uint64(fifo_compaction_ttl, 0, "TTL for the SST Files in seconds.");
996
997 DEFINE_uint64(fifo_age_for_warm, 0, "age_for_warm for FIFO compaction.");
998
999 // Stacked BlobDB Options
1000 DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Open a BlobDB instance.");
1001
1002 DEFINE_bool(
1003 blob_db_enable_gc,
1004 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection,
1005 "[Stacked BlobDB] Enable BlobDB garbage collection.");
1006
1007 DEFINE_double(
1008 blob_db_gc_cutoff,
1009 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff,
1010 "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection.");
1011
1012 DEFINE_bool(blob_db_is_fifo,
1013 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().is_fifo,
1014 "[Stacked BlobDB] Enable FIFO eviction strategy in BlobDB.");
1015
1016 DEFINE_uint64(blob_db_max_db_size,
1017 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().max_db_size,
1018 "[Stacked BlobDB] Max size limit of the directory where blob "
1019 "files are stored.");
1020
1021 DEFINE_uint64(blob_db_max_ttl_range, 0,
1022 "[Stacked BlobDB] TTL range to generate BlobDB data (in "
1023 "seconds). 0 means no TTL.");
1024
1025 DEFINE_uint64(
1026 blob_db_ttl_range_secs,
1027 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().ttl_range_secs,
1028 "[Stacked BlobDB] TTL bucket size to use when creating blob files.");
1029
1030 DEFINE_uint64(
1031 blob_db_min_blob_size,
1032 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size,
1033 "[Stacked BlobDB] Smallest blob to store in a file. Blobs "
1034 "smaller than this will be inlined with the key in the LSM tree.");
1035
1036 DEFINE_uint64(blob_db_bytes_per_sync,
1037 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync,
1038 "[Stacked BlobDB] Bytes to sync blob file at.");
1039
1040 DEFINE_uint64(blob_db_file_size,
1041 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size,
1042 "[Stacked BlobDB] Target size of each blob file.");
1043
1044 DEFINE_string(
1045 blob_db_compression_type, "snappy",
1046 "[Stacked BlobDB] Algorithm to use to compress blobs in blob files.");
1047 static enum ROCKSDB_NAMESPACE::CompressionType
1048 FLAGS_blob_db_compression_type_e = ROCKSDB_NAMESPACE::kSnappyCompression;
1049
1050 #endif // ROCKSDB_LITE
1051
1052 // Integrated BlobDB options
1053 DEFINE_bool(
1054 enable_blob_files,
1055 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().enable_blob_files,
1056 "[Integrated BlobDB] Enable writing large values to separate blob files.");
1057
1058 DEFINE_uint64(min_blob_size,
1059 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().min_blob_size,
1060 "[Integrated BlobDB] The size of the smallest value to be stored "
1061 "separately in a blob file.");
1062
1063 DEFINE_uint64(blob_file_size,
1064 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_size,
1065 "[Integrated BlobDB] The size limit for blob files.");
1066
1067 DEFINE_string(blob_compression_type, "none",
1068 "[Integrated BlobDB] The compression algorithm to use for large "
1069 "values stored in blob files.");
1070
1071 DEFINE_bool(enable_blob_garbage_collection,
1072 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
1073 .enable_blob_garbage_collection,
1074 "[Integrated BlobDB] Enable blob garbage collection.");
1075
1076 DEFINE_double(blob_garbage_collection_age_cutoff,
1077 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
1078 .blob_garbage_collection_age_cutoff,
1079 "[Integrated BlobDB] The cutoff in terms of blob file age for "
1080 "garbage collection.");
1081
1082 DEFINE_double(blob_garbage_collection_force_threshold,
1083 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
1084 .blob_garbage_collection_force_threshold,
1085 "[Integrated BlobDB] The threshold for the ratio of garbage in "
1086 "the oldest blob files for forcing garbage collection.");
1087
1088 DEFINE_uint64(blob_compaction_readahead_size,
1089 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
1090 .blob_compaction_readahead_size,
1091 "[Integrated BlobDB] Compaction readahead for blob files.");
1092
1093 DEFINE_int32(
1094 blob_file_starting_level,
1095 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_starting_level,
1096 "[Integrated BlobDB] The starting level for blob files.");
1097
1098 DEFINE_bool(use_blob_cache, false, "[Integrated BlobDB] Enable blob cache.");
1099
1100 DEFINE_bool(
1101 use_shared_block_and_blob_cache, true,
1102 "[Integrated BlobDB] Use a shared backing cache for both block "
1103 "cache and blob cache. It only takes effect if use_blob_cache is enabled.");
1104
1105 DEFINE_uint64(
1106 blob_cache_size, 8 << 20,
1107 "[Integrated BlobDB] Number of bytes to use as a cache of blobs. It only "
1108 "takes effect if the block and blob caches are different "
1109 "(use_shared_block_and_blob_cache = false).");
1110
1111 DEFINE_int32(blob_cache_numshardbits, 6,
1112 "[Integrated BlobDB] Number of shards for the blob cache is 2 ** "
1113 "blob_cache_numshardbits. Negative means use default settings. "
1114 "It only takes effect if blob_cache_size is greater than 0, and "
1115 "the block and blob caches are different "
1116 "(use_shared_block_and_blob_cache = false).");
1117
1118 DEFINE_int32(prepopulate_blob_cache, 0,
1119 "[Integrated BlobDB] Pre-populate hot/warm blobs in blob cache. 0 "
1120 "to disable and 1 to insert during flush.");
1121
1122 #ifndef ROCKSDB_LITE
1123
1124 // Secondary DB instance Options
1125 DEFINE_bool(use_secondary_db, false,
1126 "Open a RocksDB secondary instance. A primary instance can be "
1127 "running in another db_bench process.");
1128
1129 DEFINE_string(secondary_path, "",
1130 "Path to a directory used by the secondary instance to store "
1131 "private files, e.g. info log.");
1132
1133 DEFINE_int32(secondary_update_interval, 5,
1134 "Secondary instance attempts to catch up with the primary every "
1135 "secondary_update_interval seconds.");
1136
1137 #endif // ROCKSDB_LITE
1138
1139 DEFINE_bool(report_bg_io_stats, false,
1140 "Measure times spents on I/Os while in compactions. ");
1141
1142 DEFINE_bool(use_stderr_info_logger, false,
1143 "Write info logs to stderr instead of to LOG file. ");
1144
1145 #ifndef ROCKSDB_LITE
1146
1147 DEFINE_string(trace_file, "", "Trace workload to a file. ");
1148
1149 DEFINE_double(trace_replay_fast_forward, 1.0,
1150 "Fast forward trace replay, must > 0.0.");
1151 DEFINE_int32(block_cache_trace_sampling_frequency, 1,
1152 "Block cache trace sampling frequency, termed s. It uses spatial "
1153 "downsampling and samples accesses to one out of s blocks.");
1154 DEFINE_int64(
1155 block_cache_trace_max_trace_file_size_in_bytes,
1156 uint64_t{64} * 1024 * 1024 * 1024,
1157 "The maximum block cache trace file size in bytes. Block cache accesses "
1158 "will not be logged if the trace file size exceeds this threshold. Default "
1159 "is 64 GB.");
1160 DEFINE_string(block_cache_trace_file, "", "Block cache trace file path.");
1161 DEFINE_int32(trace_replay_threads, 1,
1162 "The number of threads to replay, must >=1.");
1163
1164 DEFINE_bool(io_uring_enabled, true,
1165 "If true, enable the use of IO uring if the platform supports it");
1166 extern "C" bool RocksDbIOUringEnable() { return FLAGS_io_uring_enabled; }
1167 #endif // ROCKSDB_LITE
1168
1169 DEFINE_bool(adaptive_readahead, false,
1170 "carry forward internal auto readahead size from one file to next "
1171 "file at each level during iteration");
1172
1173 DEFINE_bool(rate_limit_user_ops, false,
1174 "When true use Env::IO_USER priority level to charge internal rate "
1175 "limiter for reads associated with user operations.");
1176
1177 DEFINE_bool(file_checksum, false,
1178 "When true use FileChecksumGenCrc32cFactory for "
1179 "file_checksum_gen_factory.");
1180
1181 DEFINE_bool(rate_limit_auto_wal_flush, false,
1182 "When true use Env::IO_USER priority level to charge internal rate "
1183 "limiter for automatic WAL flush (`Options::manual_wal_flush` == "
1184 "false) after the user write operation.");
1185
1186 DEFINE_bool(async_io, false,
1187 "When set true, RocksDB does asynchronous reads for internal auto "
1188 "readahead prefetching.");
1189
1190 DEFINE_bool(optimize_multiget_for_io, true,
1191 "When set true, RocksDB does asynchronous reads for SST files in "
1192 "multiple levels for MultiGet.");
1193
1194 DEFINE_bool(charge_compression_dictionary_building_buffer, false,
1195 "Setting for "
1196 "CacheEntryRoleOptions::charged of "
1197 "CacheEntryRole::kCompressionDictionaryBuildingBuffer");
1198
1199 DEFINE_bool(charge_filter_construction, false,
1200 "Setting for "
1201 "CacheEntryRoleOptions::charged of "
1202 "CacheEntryRole::kFilterConstruction");
1203
1204 DEFINE_bool(charge_table_reader, false,
1205 "Setting for "
1206 "CacheEntryRoleOptions::charged of "
1207 "CacheEntryRole::kBlockBasedTableReader");
1208
1209 DEFINE_bool(charge_file_metadata, false,
1210 "Setting for "
1211 "CacheEntryRoleOptions::charged of "
1212 "CacheEntryRole::kFileMetadata");
1213
1214 DEFINE_bool(charge_blob_cache, false,
1215 "Setting for "
1216 "CacheEntryRoleOptions::charged of "
1217 "CacheEntryRole::kBlobCache");
1218
1219 DEFINE_uint64(backup_rate_limit, 0ull,
1220 "If non-zero, db_bench will rate limit reads and writes for DB "
1221 "backup. This "
1222 "is the global rate in ops/second.");
1223
1224 DEFINE_uint64(restore_rate_limit, 0ull,
1225 "If non-zero, db_bench will rate limit reads and writes for DB "
1226 "restore. This "
1227 "is the global rate in ops/second.");
1228
1229 DEFINE_string(backup_dir, "",
1230 "If not empty string, use the given dir for backup.");
1231
1232 DEFINE_string(restore_dir, "",
1233 "If not empty string, use the given dir for restore.");
1234
1235 DEFINE_uint64(
1236 initial_auto_readahead_size,
1237 ROCKSDB_NAMESPACE::BlockBasedTableOptions().initial_auto_readahead_size,
1238 "RocksDB does auto-readahead for iterators on noticing more than two reads "
1239 "for a table file if user doesn't provide readahead_size. The readahead "
1240 "size starts at initial_auto_readahead_size");
1241
1242 DEFINE_uint64(
1243 max_auto_readahead_size,
1244 ROCKSDB_NAMESPACE::BlockBasedTableOptions().max_auto_readahead_size,
1245 "Rocksdb implicit readahead starts at "
1246 "BlockBasedTableOptions.initial_auto_readahead_size and doubles on every "
1247 "additional read upto max_auto_readahead_size");
1248
1249 DEFINE_uint64(
1250 num_file_reads_for_auto_readahead,
1251 ROCKSDB_NAMESPACE::BlockBasedTableOptions()
1252 .num_file_reads_for_auto_readahead,
1253 "Rocksdb implicit readahead is enabled if reads are sequential and "
1254 "num_file_reads_for_auto_readahead indicates after how many sequential "
1255 "reads into that file internal auto prefetching should be start.");
1256
1257 static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType(
1258 const char* ctype) {
1259 assert(ctype);
1260
1261 if (!strcasecmp(ctype, "none"))
1262 return ROCKSDB_NAMESPACE::kNoCompression;
1263 else if (!strcasecmp(ctype, "snappy"))
1264 return ROCKSDB_NAMESPACE::kSnappyCompression;
1265 else if (!strcasecmp(ctype, "zlib"))
1266 return ROCKSDB_NAMESPACE::kZlibCompression;
1267 else if (!strcasecmp(ctype, "bzip2"))
1268 return ROCKSDB_NAMESPACE::kBZip2Compression;
1269 else if (!strcasecmp(ctype, "lz4"))
1270 return ROCKSDB_NAMESPACE::kLZ4Compression;
1271 else if (!strcasecmp(ctype, "lz4hc"))
1272 return ROCKSDB_NAMESPACE::kLZ4HCCompression;
1273 else if (!strcasecmp(ctype, "xpress"))
1274 return ROCKSDB_NAMESPACE::kXpressCompression;
1275 else if (!strcasecmp(ctype, "zstd"))
1276 return ROCKSDB_NAMESPACE::kZSTD;
1277 else {
1278 fprintf(stderr, "Cannot parse compression type '%s'\n", ctype);
1279 exit(1);
1280 }
1281 }
1282
1283 static std::string ColumnFamilyName(size_t i) {
1284 if (i == 0) {
1285 return ROCKSDB_NAMESPACE::kDefaultColumnFamilyName;
1286 } else {
1287 char name[100];
1288 snprintf(name, sizeof(name), "column_family_name_%06zu", i);
1289 return std::string(name);
1290 }
1291 }
1292
1293 DEFINE_string(compression_type, "snappy",
1294 "Algorithm to use to compress the database");
1295 static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_compression_type_e =
1296 ROCKSDB_NAMESPACE::kSnappyCompression;
1297
1298 DEFINE_int64(sample_for_compression, 0, "Sample every N block for compression");
1299
1300 DEFINE_int32(compression_level, ROCKSDB_NAMESPACE::CompressionOptions().level,
1301 "Compression level. The meaning of this value is library-"
1302 "dependent. If unset, we try to use the default for the library "
1303 "specified in `--compression_type`");
1304
1305 DEFINE_int32(compression_max_dict_bytes,
1306 ROCKSDB_NAMESPACE::CompressionOptions().max_dict_bytes,
1307 "Maximum size of dictionary used to prime the compression "
1308 "library.");
1309
1310 DEFINE_int32(compression_zstd_max_train_bytes,
1311 ROCKSDB_NAMESPACE::CompressionOptions().zstd_max_train_bytes,
1312 "Maximum size of training data passed to zstd's dictionary "
1313 "trainer.");
1314
1315 DEFINE_int32(min_level_to_compress, -1,
1316 "If non-negative, compression starts"
1317 " from this level. Levels with number < min_level_to_compress are"
1318 " not compressed. Otherwise, apply compression_type to "
1319 "all levels.");
1320
1321 DEFINE_int32(compression_parallel_threads, 1,
1322 "Number of threads for parallel compression.");
1323
1324 DEFINE_uint64(compression_max_dict_buffer_bytes,
1325 ROCKSDB_NAMESPACE::CompressionOptions().max_dict_buffer_bytes,
1326 "Maximum bytes to buffer to collect samples for dictionary.");
1327
1328 DEFINE_bool(compression_use_zstd_dict_trainer,
1329 ROCKSDB_NAMESPACE::CompressionOptions().use_zstd_dict_trainer,
1330 "If true, use ZSTD_TrainDictionary() to create dictionary, else"
1331 "use ZSTD_FinalizeDictionary() to create dictionary");
1332
1333 static bool ValidateTableCacheNumshardbits(const char* flagname,
1334 int32_t value) {
1335 if (0 >= value || value >= 20) {
1336 fprintf(stderr, "Invalid value for --%s: %d, must be 0 < val < 20\n",
1337 flagname, value);
1338 return false;
1339 }
1340 return true;
1341 }
1342 DEFINE_int32(table_cache_numshardbits, 4, "");
1343
1344 #ifndef ROCKSDB_LITE
1345 DEFINE_string(env_uri, "",
1346 "URI for registry Env lookup. Mutually exclusive with --fs_uri");
1347 DEFINE_string(fs_uri, "",
1348 "URI for registry Filesystem lookup. Mutually exclusive"
1349 " with --env_uri."
1350 " Creates a default environment with the specified filesystem.");
1351 #endif // ROCKSDB_LITE
1352 DEFINE_string(simulate_hybrid_fs_file, "",
1353 "File for Store Metadata for Simulate hybrid FS. Empty means "
1354 "disable the feature. Now, if it is set, last_level_temperature "
1355 "is set to kWarm.");
1356 DEFINE_int32(simulate_hybrid_hdd_multipliers, 1,
1357 "In simulate_hybrid_fs_file or simulate_hdd mode, how many HDDs "
1358 "are simulated.");
1359 DEFINE_bool(simulate_hdd, false, "Simulate read/write latency on HDD.");
1360
1361 DEFINE_int64(
1362 preclude_last_level_data_seconds, 0,
1363 "Preclude the latest data from the last level. (Used for tiered storage)");
1364
1365 DEFINE_int64(preserve_internal_time_seconds, 0,
1366 "Preserve the internal time information which stores with SST.");
1367
1368 static std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard;
1369
1370 static ROCKSDB_NAMESPACE::Env* FLAGS_env = ROCKSDB_NAMESPACE::Env::Default();
1371
1372 DEFINE_int64(stats_interval, 0,
1373 "Stats are reported every N operations when this is greater than "
1374 "zero. When 0 the interval grows over time.");
1375
1376 DEFINE_int64(stats_interval_seconds, 0,
1377 "Report stats every N seconds. This overrides stats_interval when"
1378 " both are > 0.");
1379
1380 DEFINE_int32(stats_per_interval, 0,
1381 "Reports additional stats per interval when this is greater than "
1382 "0.");
1383
1384 DEFINE_uint64(slow_usecs, 1000000,
1385 "A message is printed for operations that take at least this "
1386 "many microseconds.");
1387
1388 DEFINE_int64(report_interval_seconds, 0,
1389 "If greater than zero, it will write simple stats in CSV format "
1390 "to --report_file every N seconds");
1391
1392 DEFINE_string(report_file, "report.csv",
1393 "Filename where some simple stats are reported to (if "
1394 "--report_interval_seconds is bigger than 0)");
1395
1396 DEFINE_int32(thread_status_per_interval, 0,
1397 "Takes and report a snapshot of the current status of each thread"
1398 " when this is greater than 0.");
1399
1400 DEFINE_int32(perf_level, ROCKSDB_NAMESPACE::PerfLevel::kDisable,
1401 "Level of perf collection");
1402
1403 DEFINE_uint64(soft_pending_compaction_bytes_limit, 64ull * 1024 * 1024 * 1024,
1404 "Slowdown writes if pending compaction bytes exceed this number");
1405
1406 DEFINE_uint64(hard_pending_compaction_bytes_limit, 128ull * 1024 * 1024 * 1024,
1407 "Stop writes if pending compaction bytes exceed this number");
1408
1409 DEFINE_uint64(delayed_write_rate, 8388608u,
1410 "Limited bytes allowed to DB when soft_rate_limit or "
1411 "level0_slowdown_writes_trigger triggers");
1412
1413 DEFINE_bool(enable_pipelined_write, true,
1414 "Allow WAL and memtable writes to be pipelined");
1415
1416 DEFINE_bool(
1417 unordered_write, false,
1418 "Enable the unordered write feature, which provides higher throughput but "
1419 "relaxes the guarantees around atomic reads and immutable snapshots");
1420
1421 DEFINE_bool(allow_concurrent_memtable_write, true,
1422 "Allow multi-writers to update mem tables in parallel.");
1423
1424 DEFINE_double(experimental_mempurge_threshold, 0.0,
1425 "Maximum useful payload ratio estimate that triggers a mempurge "
1426 "(memtable garbage collection).");
1427
1428 DEFINE_bool(inplace_update_support,
1429 ROCKSDB_NAMESPACE::Options().inplace_update_support,
1430 "Support in-place memtable update for smaller or same-size values");
1431
1432 DEFINE_uint64(inplace_update_num_locks,
1433 ROCKSDB_NAMESPACE::Options().inplace_update_num_locks,
1434 "Number of RW locks to protect in-place memtable updates");
1435
1436 DEFINE_bool(enable_write_thread_adaptive_yield, true,
1437 "Use a yielding spin loop for brief writer thread waits.");
1438
1439 DEFINE_uint64(
1440 write_thread_max_yield_usec, 100,
1441 "Maximum microseconds for enable_write_thread_adaptive_yield operation.");
1442
1443 DEFINE_uint64(write_thread_slow_yield_usec, 3,
1444 "The threshold at which a slow yield is considered a signal that "
1445 "other processes or threads want the core.");
1446
1447 DEFINE_uint64(rate_limiter_bytes_per_sec, 0, "Set options.rate_limiter value.");
1448
1449 DEFINE_int64(rate_limiter_refill_period_us, 100 * 1000,
1450 "Set refill period on rate limiter.");
1451
1452 DEFINE_bool(rate_limiter_auto_tuned, false,
1453 "Enable dynamic adjustment of rate limit according to demand for "
1454 "background I/O");
1455
1456 DEFINE_bool(sine_write_rate, false, "Use a sine wave write_rate_limit");
1457
1458 DEFINE_uint64(
1459 sine_write_rate_interval_milliseconds, 10000,
1460 "Interval of which the sine wave write_rate_limit is recalculated");
1461
1462 DEFINE_double(sine_a, 1, "A in f(x) = A sin(bx + c) + d");
1463
1464 DEFINE_double(sine_b, 1, "B in f(x) = A sin(bx + c) + d");
1465
1466 DEFINE_double(sine_c, 0, "C in f(x) = A sin(bx + c) + d");
1467
1468 DEFINE_double(sine_d, 1, "D in f(x) = A sin(bx + c) + d");
1469
1470 DEFINE_bool(rate_limit_bg_reads, false,
1471 "Use options.rate_limiter on compaction reads");
1472
1473 DEFINE_uint64(
1474 benchmark_write_rate_limit, 0,
1475 "If non-zero, db_bench will rate-limit the writes going into RocksDB. This "
1476 "is the global rate in bytes/second.");
1477
1478 // the parameters of mix_graph
1479 DEFINE_double(keyrange_dist_a, 0.0,
1480 "The parameter 'a' of prefix average access distribution "
1481 "f(x)=a*exp(b*x)+c*exp(d*x)");
1482 DEFINE_double(keyrange_dist_b, 0.0,
1483 "The parameter 'b' of prefix average access distribution "
1484 "f(x)=a*exp(b*x)+c*exp(d*x)");
1485 DEFINE_double(keyrange_dist_c, 0.0,
1486 "The parameter 'c' of prefix average access distribution"
1487 "f(x)=a*exp(b*x)+c*exp(d*x)");
1488 DEFINE_double(keyrange_dist_d, 0.0,
1489 "The parameter 'd' of prefix average access distribution"
1490 "f(x)=a*exp(b*x)+c*exp(d*x)");
1491 DEFINE_int64(keyrange_num, 1,
1492 "The number of key ranges that are in the same prefix "
1493 "group, each prefix range will have its key access distribution");
1494 DEFINE_double(key_dist_a, 0.0,
1495 "The parameter 'a' of key access distribution model f(x)=a*x^b");
1496 DEFINE_double(key_dist_b, 0.0,
1497 "The parameter 'b' of key access distribution model f(x)=a*x^b");
1498 DEFINE_double(value_theta, 0.0,
1499 "The parameter 'theta' of Generized Pareto Distribution "
1500 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1501 // Use reasonable defaults based on the mixgraph paper
1502 DEFINE_double(value_k, 0.2615,
1503 "The parameter 'k' of Generized Pareto Distribution "
1504 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1505 // Use reasonable defaults based on the mixgraph paper
1506 DEFINE_double(value_sigma, 25.45,
1507 "The parameter 'theta' of Generized Pareto Distribution "
1508 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1509 DEFINE_double(iter_theta, 0.0,
1510 "The parameter 'theta' of Generized Pareto Distribution "
1511 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1512 // Use reasonable defaults based on the mixgraph paper
1513 DEFINE_double(iter_k, 2.517,
1514 "The parameter 'k' of Generized Pareto Distribution "
1515 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1516 // Use reasonable defaults based on the mixgraph paper
1517 DEFINE_double(iter_sigma, 14.236,
1518 "The parameter 'sigma' of Generized Pareto Distribution "
1519 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1520 DEFINE_double(mix_get_ratio, 1.0,
1521 "The ratio of Get queries of mix_graph workload");
1522 DEFINE_double(mix_put_ratio, 0.0,
1523 "The ratio of Put queries of mix_graph workload");
1524 DEFINE_double(mix_seek_ratio, 0.0,
1525 "The ratio of Seek queries of mix_graph workload");
1526 DEFINE_int64(mix_max_scan_len, 10000, "The max scan length of Iterator");
1527 DEFINE_int64(mix_max_value_size, 1024, "The max value size of this workload");
1528 DEFINE_double(
1529 sine_mix_rate_noise, 0.0,
1530 "Add the noise ratio to the sine rate, it is between 0.0 and 1.0");
1531 DEFINE_bool(sine_mix_rate, false,
1532 "Enable the sine QPS control on the mix workload");
1533 DEFINE_uint64(
1534 sine_mix_rate_interval_milliseconds, 10000,
1535 "Interval of which the sine wave read_rate_limit is recalculated");
1536 DEFINE_int64(mix_accesses, -1,
1537 "The total query accesses of mix_graph workload");
1538
1539 DEFINE_uint64(
1540 benchmark_read_rate_limit, 0,
1541 "If non-zero, db_bench will rate-limit the reads from RocksDB. This "
1542 "is the global rate in ops/second.");
1543
1544 DEFINE_uint64(max_compaction_bytes,
1545 ROCKSDB_NAMESPACE::Options().max_compaction_bytes,
1546 "Max bytes allowed in one compaction");
1547
1548 #ifndef ROCKSDB_LITE
1549 DEFINE_bool(readonly, false, "Run read only benchmarks.");
1550
1551 DEFINE_bool(print_malloc_stats, false,
1552 "Print malloc stats to stdout after benchmarks finish.");
1553 #endif // ROCKSDB_LITE
1554
1555 DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions");
1556
1557 DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds.");
1558 DEFINE_uint64(wal_size_limit_MB, 0,
1559 "Set the size limit for the WAL Files in MB.");
1560 DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size");
1561
1562 DEFINE_bool(mmap_read, ROCKSDB_NAMESPACE::Options().allow_mmap_reads,
1563 "Allow reads to occur via mmap-ing files");
1564
1565 DEFINE_bool(mmap_write, ROCKSDB_NAMESPACE::Options().allow_mmap_writes,
1566 "Allow writes to occur via mmap-ing files");
1567
1568 DEFINE_bool(use_direct_reads, ROCKSDB_NAMESPACE::Options().use_direct_reads,
1569 "Use O_DIRECT for reading data");
1570
1571 DEFINE_bool(use_direct_io_for_flush_and_compaction,
1572 ROCKSDB_NAMESPACE::Options().use_direct_io_for_flush_and_compaction,
1573 "Use O_DIRECT for background flush and compaction writes");
1574
1575 DEFINE_bool(advise_random_on_open,
1576 ROCKSDB_NAMESPACE::Options().advise_random_on_open,
1577 "Advise random access on table file open");
1578
1579 DEFINE_string(compaction_fadvice, "NORMAL",
1580 "Access pattern advice when a file is compacted");
1581 static auto FLAGS_compaction_fadvice_e =
1582 ROCKSDB_NAMESPACE::Options().access_hint_on_compaction_start;
1583
1584 DEFINE_bool(use_tailing_iterator, false,
1585 "Use tailing iterator to access a series of keys instead of get");
1586
1587 DEFINE_bool(use_adaptive_mutex, ROCKSDB_NAMESPACE::Options().use_adaptive_mutex,
1588 "Use adaptive mutex");
1589
1590 DEFINE_uint64(bytes_per_sync, ROCKSDB_NAMESPACE::Options().bytes_per_sync,
1591 "Allows OS to incrementally sync SST files to disk while they are"
1592 " being written, in the background. Issue one request for every"
1593 " bytes_per_sync written. 0 turns it off.");
1594
1595 DEFINE_uint64(wal_bytes_per_sync,
1596 ROCKSDB_NAMESPACE::Options().wal_bytes_per_sync,
1597 "Allows OS to incrementally sync WAL files to disk while they are"
1598 " being written, in the background. Issue one request for every"
1599 " wal_bytes_per_sync written. 0 turns it off.");
1600
1601 DEFINE_bool(use_single_deletes, true,
1602 "Use single deletes (used in RandomReplaceKeys only).");
1603
1604 DEFINE_double(stddev, 2000.0,
1605 "Standard deviation of normal distribution used for picking keys"
1606 " (used in RandomReplaceKeys only).");
1607
1608 DEFINE_int32(key_id_range, 100000,
1609 "Range of possible value of key id (used in TimeSeries only).");
1610
1611 DEFINE_string(expire_style, "none",
1612 "Style to remove expired time entries. Can be one of the options "
1613 "below: none (do not expired data), compaction_filter (use a "
1614 "compaction filter to remove expired data), delete (seek IDs and "
1615 "remove expired data) (used in TimeSeries only).");
1616
1617 DEFINE_uint64(
1618 time_range, 100000,
1619 "Range of timestamp that store in the database (used in TimeSeries"
1620 " only).");
1621
1622 DEFINE_int32(num_deletion_threads, 1,
1623 "Number of threads to do deletion (used in TimeSeries and delete "
1624 "expire_style only).");
1625
1626 DEFINE_int32(max_successive_merges, 0,
1627 "Maximum number of successive merge operations on a key in the "
1628 "memtable");
1629
1630 static bool ValidatePrefixSize(const char* flagname, int32_t value) {
1631 if (value < 0 || value >= 2000000000) {
1632 fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n",
1633 flagname, value);
1634 return false;
1635 }
1636 return true;
1637 }
1638
1639 DEFINE_int32(prefix_size, 0,
1640 "control the prefix size for HashSkipList and plain table");
1641 DEFINE_int64(keys_per_prefix, 0,
1642 "control average number of keys generated per prefix, 0 means no "
1643 "special handling of the prefix, i.e. use the prefix comes with "
1644 "the generated random number.");
1645 DEFINE_bool(total_order_seek, false,
1646 "Enable total order seek regardless of index format.");
1647 DEFINE_bool(prefix_same_as_start, false,
1648 "Enforce iterator to return keys with prefix same as seek key.");
1649 DEFINE_bool(
1650 seek_missing_prefix, false,
1651 "Iterator seek to keys with non-exist prefixes. Require prefix_size > 8");
1652
1653 DEFINE_int32(memtable_insert_with_hint_prefix_size, 0,
1654 "If non-zero, enable "
1655 "memtable insert with hint with the given prefix size.");
1656 DEFINE_bool(enable_io_prio, false,
1657 "Lower the background flush/compaction threads' IO priority");
1658 DEFINE_bool(enable_cpu_prio, false,
1659 "Lower the background flush/compaction threads' CPU priority");
1660 DEFINE_bool(identity_as_first_hash, false,
1661 "the first hash function of cuckoo table becomes an identity "
1662 "function. This is only valid when key is 8 bytes");
1663 DEFINE_bool(dump_malloc_stats, true, "Dump malloc stats in LOG ");
1664 DEFINE_uint64(stats_dump_period_sec,
1665 ROCKSDB_NAMESPACE::Options().stats_dump_period_sec,
1666 "Gap between printing stats to log in seconds");
1667 DEFINE_uint64(stats_persist_period_sec,
1668 ROCKSDB_NAMESPACE::Options().stats_persist_period_sec,
1669 "Gap between persisting stats in seconds");
1670 DEFINE_bool(persist_stats_to_disk,
1671 ROCKSDB_NAMESPACE::Options().persist_stats_to_disk,
1672 "whether to persist stats to disk");
1673 DEFINE_uint64(stats_history_buffer_size,
1674 ROCKSDB_NAMESPACE::Options().stats_history_buffer_size,
1675 "Max number of stats snapshots to keep in memory");
1676 DEFINE_bool(avoid_flush_during_recovery,
1677 ROCKSDB_NAMESPACE::Options().avoid_flush_during_recovery,
1678 "If true, avoids flushing the recovered WAL data where possible.");
1679 DEFINE_int64(multiread_stride, 0,
1680 "Stride length for the keys in a MultiGet batch");
1681 DEFINE_bool(multiread_batched, false, "Use the new MultiGet API");
1682
1683 DEFINE_string(memtablerep, "skip_list", "");
1684 DEFINE_int64(hash_bucket_count, 1024 * 1024, "hash bucket count");
1685 DEFINE_bool(use_plain_table, false,
1686 "if use plain table instead of block-based table format");
1687 DEFINE_bool(use_cuckoo_table, false, "if use cuckoo table format");
1688 DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table.");
1689 DEFINE_bool(use_hash_search, false,
1690 "if use kHashSearch instead of kBinarySearch. "
1691 "This is valid if only we use BlockTable");
1692 DEFINE_string(merge_operator, "",
1693 "The merge operator to use with the database."
1694 "If a new merge operator is specified, be sure to use fresh"
1695 " database The possible merge operators are defined in"
1696 " utilities/merge_operators.h");
1697 DEFINE_int32(skip_list_lookahead, 0,
1698 "Used with skip_list memtablerep; try linear search first for "
1699 "this many steps from the previous position");
1700 DEFINE_bool(report_file_operations, false,
1701 "if report number of file operations");
1702 DEFINE_bool(report_open_timing, false, "if report open timing");
1703 DEFINE_int32(readahead_size, 0, "Iterator readahead size");
1704
1705 DEFINE_bool(read_with_latest_user_timestamp, true,
1706 "If true, always use the current latest timestamp for read. If "
1707 "false, choose a random timestamp from the past.");
1708
1709 #ifndef ROCKSDB_LITE
1710 DEFINE_string(secondary_cache_uri, "",
1711 "Full URI for creating a custom secondary cache object");
1712 static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
1713 #endif // ROCKSDB_LITE
1714
1715 static const bool FLAGS_prefix_size_dummy __attribute__((__unused__)) =
1716 RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
1717
1718 static const bool FLAGS_key_size_dummy __attribute__((__unused__)) =
1719 RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize);
1720
1721 static const bool FLAGS_cache_numshardbits_dummy __attribute__((__unused__)) =
1722 RegisterFlagValidator(&FLAGS_cache_numshardbits,
1723 &ValidateCacheNumshardbits);
1724
1725 static const bool FLAGS_readwritepercent_dummy __attribute__((__unused__)) =
1726 RegisterFlagValidator(&FLAGS_readwritepercent, &ValidateInt32Percent);
1727
1728 DEFINE_int32(disable_seek_compaction, false,
1729 "Not used, left here for backwards compatibility");
1730
1731 DEFINE_bool(allow_data_in_errors,
1732 ROCKSDB_NAMESPACE::Options().allow_data_in_errors,
1733 "If true, allow logging data, e.g. key, value in LOG files.");
1734
1735 static const bool FLAGS_deletepercent_dummy __attribute__((__unused__)) =
1736 RegisterFlagValidator(&FLAGS_deletepercent, &ValidateInt32Percent);
1737 static const bool FLAGS_table_cache_numshardbits_dummy
1738 __attribute__((__unused__)) = RegisterFlagValidator(
1739 &FLAGS_table_cache_numshardbits, &ValidateTableCacheNumshardbits);
1740
1741 DEFINE_uint32(write_batch_protection_bytes_per_key, 0,
1742 "Size of per-key-value checksum in each write batch. Currently "
1743 "only value 0 and 8 are supported.");
1744
1745 DEFINE_uint32(
1746 memtable_protection_bytes_per_key, 0,
1747 "Enable memtable per key-value checksum protection. "
1748 "Each entry in memtable will be suffixed by a per key-value checksum. "
1749 "This options determines the size of such checksums. "
1750 "Supported values: 0, 1, 2, 4, 8.");
1751
1752 DEFINE_bool(build_info, false,
1753 "Print the build info via GetRocksBuildInfoAsString");
1754
1755 DEFINE_bool(track_and_verify_wals_in_manifest, false,
1756 "If true, enable WAL tracking in the MANIFEST");
1757
1758 namespace ROCKSDB_NAMESPACE {
1759 namespace {
1760 static Status CreateMemTableRepFactory(
1761 const ConfigOptions& config_options,
1762 std::shared_ptr<MemTableRepFactory>* factory) {
1763 Status s;
1764 if (!strcasecmp(FLAGS_memtablerep.c_str(), SkipListFactory::kNickName())) {
1765 factory->reset(new SkipListFactory(FLAGS_skip_list_lookahead));
1766 #ifndef ROCKSDB_LITE
1767 } else if (!strcasecmp(FLAGS_memtablerep.c_str(), "prefix_hash")) {
1768 factory->reset(NewHashSkipListRepFactory(FLAGS_hash_bucket_count));
1769 } else if (!strcasecmp(FLAGS_memtablerep.c_str(),
1770 VectorRepFactory::kNickName())) {
1771 factory->reset(new VectorRepFactory());
1772 } else if (!strcasecmp(FLAGS_memtablerep.c_str(), "hash_linkedlist")) {
1773 factory->reset(NewHashLinkListRepFactory(FLAGS_hash_bucket_count));
1774 #endif // ROCKSDB_LITE
1775 } else {
1776 std::unique_ptr<MemTableRepFactory> unique;
1777 s = MemTableRepFactory::CreateFromString(config_options, FLAGS_memtablerep,
1778 &unique);
1779 if (s.ok()) {
1780 factory->reset(unique.release());
1781 }
1782 }
1783 return s;
1784 }
1785
1786 } // namespace
1787
1788 enum DistributionType : unsigned char { kFixed = 0, kUniform, kNormal };
1789
1790 static enum DistributionType FLAGS_value_size_distribution_type_e = kFixed;
1791
1792 static enum DistributionType StringToDistributionType(const char* ctype) {
1793 assert(ctype);
1794
1795 if (!strcasecmp(ctype, "fixed"))
1796 return kFixed;
1797 else if (!strcasecmp(ctype, "uniform"))
1798 return kUniform;
1799 else if (!strcasecmp(ctype, "normal"))
1800 return kNormal;
1801
1802 fprintf(stdout, "Cannot parse distribution type '%s'\n", ctype);
1803 exit(1);
1804 }
1805
1806 class BaseDistribution {
1807 public:
1808 BaseDistribution(unsigned int _min, unsigned int _max)
1809 : min_value_size_(_min), max_value_size_(_max) {}
1810 virtual ~BaseDistribution() {}
1811
1812 unsigned int Generate() {
1813 auto val = Get();
1814 if (NeedTruncate()) {
1815 val = std::max(min_value_size_, val);
1816 val = std::min(max_value_size_, val);
1817 }
1818 return val;
1819 }
1820
1821 private:
1822 virtual unsigned int Get() = 0;
1823 virtual bool NeedTruncate() { return true; }
1824 unsigned int min_value_size_;
1825 unsigned int max_value_size_;
1826 };
1827
1828 class FixedDistribution : public BaseDistribution {
1829 public:
1830 FixedDistribution(unsigned int size)
1831 : BaseDistribution(size, size), size_(size) {}
1832
1833 private:
1834 virtual unsigned int Get() override { return size_; }
1835 virtual bool NeedTruncate() override { return false; }
1836 unsigned int size_;
1837 };
1838
1839 class NormalDistribution : public BaseDistribution,
1840 public std::normal_distribution<double> {
1841 public:
1842 NormalDistribution(unsigned int _min, unsigned int _max)
1843 : BaseDistribution(_min, _max),
1844 // 99.7% values within the range [min, max].
1845 std::normal_distribution<double>(
1846 (double)(_min + _max) / 2.0 /*mean*/,
1847 (double)(_max - _min) / 6.0 /*stddev*/),
1848 gen_(rd_()) {}
1849
1850 private:
1851 virtual unsigned int Get() override {
1852 return static_cast<unsigned int>((*this)(gen_));
1853 }
1854 std::random_device rd_;
1855 std::mt19937 gen_;
1856 };
1857
1858 class UniformDistribution : public BaseDistribution,
1859 public std::uniform_int_distribution<unsigned int> {
1860 public:
1861 UniformDistribution(unsigned int _min, unsigned int _max)
1862 : BaseDistribution(_min, _max),
1863 std::uniform_int_distribution<unsigned int>(_min, _max),
1864 gen_(rd_()) {}
1865
1866 private:
1867 virtual unsigned int Get() override { return (*this)(gen_); }
1868 virtual bool NeedTruncate() override { return false; }
1869 std::random_device rd_;
1870 std::mt19937 gen_;
1871 };
1872
1873 // Helper for quickly generating random data.
1874 class RandomGenerator {
1875 private:
1876 std::string data_;
1877 unsigned int pos_;
1878 std::unique_ptr<BaseDistribution> dist_;
1879
1880 public:
1881 RandomGenerator() {
1882 auto max_value_size = FLAGS_value_size_max;
1883 switch (FLAGS_value_size_distribution_type_e) {
1884 case kUniform:
1885 dist_.reset(new UniformDistribution(FLAGS_value_size_min,
1886 FLAGS_value_size_max));
1887 break;
1888 case kNormal:
1889 dist_.reset(
1890 new NormalDistribution(FLAGS_value_size_min, FLAGS_value_size_max));
1891 break;
1892 case kFixed:
1893 default:
1894 dist_.reset(new FixedDistribution(value_size));
1895 max_value_size = value_size;
1896 }
1897 // We use a limited amount of data over and over again and ensure
1898 // that it is larger than the compression window (32KB), and also
1899 // large enough to serve all typical value sizes we want to write.
1900 Random rnd(301);
1901 std::string piece;
1902 while (data_.size() < (unsigned)std::max(1048576, max_value_size)) {
1903 // Add a short fragment that is as compressible as specified
1904 // by FLAGS_compression_ratio.
1905 test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
1906 data_.append(piece);
1907 }
1908 pos_ = 0;
1909 }
1910
1911 Slice Generate(unsigned int len) {
1912 assert(len <= data_.size());
1913 if (pos_ + len > data_.size()) {
1914 pos_ = 0;
1915 }
1916 pos_ += len;
1917 return Slice(data_.data() + pos_ - len, len);
1918 }
1919
1920 Slice Generate() {
1921 auto len = dist_->Generate();
1922 return Generate(len);
1923 }
1924 };
1925
1926 static void AppendWithSpace(std::string* str, Slice msg) {
1927 if (msg.empty()) return;
1928 if (!str->empty()) {
1929 str->push_back(' ');
1930 }
1931 str->append(msg.data(), msg.size());
1932 }
1933
1934 struct DBWithColumnFamilies {
1935 std::vector<ColumnFamilyHandle*> cfh;
1936 DB* db;
1937 #ifndef ROCKSDB_LITE
1938 OptimisticTransactionDB* opt_txn_db;
1939 #endif // ROCKSDB_LITE
1940 std::atomic<size_t> num_created; // Need to be updated after all the
1941 // new entries in cfh are set.
1942 size_t num_hot; // Number of column families to be queried at each moment.
1943 // After each CreateNewCf(), another num_hot number of new
1944 // Column families will be created and used to be queried.
1945 port::Mutex create_cf_mutex; // Only one thread can execute CreateNewCf()
1946 std::vector<int> cfh_idx_to_prob; // ith index holds probability of operating
1947 // on cfh[i].
1948
1949 DBWithColumnFamilies()
1950 : db(nullptr)
1951 #ifndef ROCKSDB_LITE
1952 ,
1953 opt_txn_db(nullptr)
1954 #endif // ROCKSDB_LITE
1955 {
1956 cfh.clear();
1957 num_created = 0;
1958 num_hot = 0;
1959 }
1960
1961 DBWithColumnFamilies(const DBWithColumnFamilies& other)
1962 : cfh(other.cfh),
1963 db(other.db),
1964 #ifndef ROCKSDB_LITE
1965 opt_txn_db(other.opt_txn_db),
1966 #endif // ROCKSDB_LITE
1967 num_created(other.num_created.load()),
1968 num_hot(other.num_hot),
1969 cfh_idx_to_prob(other.cfh_idx_to_prob) {
1970 }
1971
1972 void DeleteDBs() {
1973 std::for_each(cfh.begin(), cfh.end(),
1974 [](ColumnFamilyHandle* cfhi) { delete cfhi; });
1975 cfh.clear();
1976 #ifndef ROCKSDB_LITE
1977 if (opt_txn_db) {
1978 delete opt_txn_db;
1979 opt_txn_db = nullptr;
1980 } else {
1981 delete db;
1982 db = nullptr;
1983 }
1984 #else
1985 delete db;
1986 db = nullptr;
1987 #endif // ROCKSDB_LITE
1988 }
1989
1990 ColumnFamilyHandle* GetCfh(int64_t rand_num) {
1991 assert(num_hot > 0);
1992 size_t rand_offset = 0;
1993 if (!cfh_idx_to_prob.empty()) {
1994 assert(cfh_idx_to_prob.size() == num_hot);
1995 int sum = 0;
1996 while (sum + cfh_idx_to_prob[rand_offset] < rand_num % 100) {
1997 sum += cfh_idx_to_prob[rand_offset];
1998 ++rand_offset;
1999 }
2000 assert(rand_offset < cfh_idx_to_prob.size());
2001 } else {
2002 rand_offset = rand_num % num_hot;
2003 }
2004 return cfh[num_created.load(std::memory_order_acquire) - num_hot +
2005 rand_offset];
2006 }
2007
2008 // stage: assume CF from 0 to stage * num_hot has be created. Need to create
2009 // stage * num_hot + 1 to stage * (num_hot + 1).
2010 void CreateNewCf(ColumnFamilyOptions options, int64_t stage) {
2011 MutexLock l(&create_cf_mutex);
2012 if ((stage + 1) * num_hot <= num_created) {
2013 // Already created.
2014 return;
2015 }
2016 auto new_num_created = num_created + num_hot;
2017 assert(new_num_created <= cfh.size());
2018 for (size_t i = num_created; i < new_num_created; i++) {
2019 Status s =
2020 db->CreateColumnFamily(options, ColumnFamilyName(i), &(cfh[i]));
2021 if (!s.ok()) {
2022 fprintf(stderr, "create column family error: %s\n",
2023 s.ToString().c_str());
2024 abort();
2025 }
2026 }
2027 num_created.store(new_num_created, std::memory_order_release);
2028 }
2029 };
2030
2031 // A class that reports stats to CSV file.
2032 class ReporterAgent {
2033 public:
2034 ReporterAgent(Env* env, const std::string& fname,
2035 uint64_t report_interval_secs)
2036 : env_(env),
2037 total_ops_done_(0),
2038 last_report_(0),
2039 report_interval_secs_(report_interval_secs),
2040 stop_(false) {
2041 auto s = env_->NewWritableFile(fname, &report_file_, EnvOptions());
2042 if (s.ok()) {
2043 s = report_file_->Append(Header() + "\n");
2044 }
2045 if (s.ok()) {
2046 s = report_file_->Flush();
2047 }
2048 if (!s.ok()) {
2049 fprintf(stderr, "Can't open %s: %s\n", fname.c_str(),
2050 s.ToString().c_str());
2051 abort();
2052 }
2053
2054 reporting_thread_ = port::Thread([&]() { SleepAndReport(); });
2055 }
2056
2057 ~ReporterAgent() {
2058 {
2059 std::unique_lock<std::mutex> lk(mutex_);
2060 stop_ = true;
2061 stop_cv_.notify_all();
2062 }
2063 reporting_thread_.join();
2064 }
2065
2066 // thread safe
2067 void ReportFinishedOps(int64_t num_ops) {
2068 total_ops_done_.fetch_add(num_ops);
2069 }
2070
2071 private:
2072 std::string Header() const { return "secs_elapsed,interval_qps"; }
2073 void SleepAndReport() {
2074 auto* clock = env_->GetSystemClock().get();
2075 auto time_started = clock->NowMicros();
2076 while (true) {
2077 {
2078 std::unique_lock<std::mutex> lk(mutex_);
2079 if (stop_ ||
2080 stop_cv_.wait_for(lk, std::chrono::seconds(report_interval_secs_),
2081 [&]() { return stop_; })) {
2082 // stopping
2083 break;
2084 }
2085 // else -> timeout, which means time for a report!
2086 }
2087 auto total_ops_done_snapshot = total_ops_done_.load();
2088 // round the seconds elapsed
2089 auto secs_elapsed =
2090 (clock->NowMicros() - time_started + kMicrosInSecond / 2) /
2091 kMicrosInSecond;
2092 std::string report =
2093 std::to_string(secs_elapsed) + "," +
2094 std::to_string(total_ops_done_snapshot - last_report_) + "\n";
2095 auto s = report_file_->Append(report);
2096 if (s.ok()) {
2097 s = report_file_->Flush();
2098 }
2099 if (!s.ok()) {
2100 fprintf(stderr,
2101 "Can't write to report file (%s), stopping the reporting\n",
2102 s.ToString().c_str());
2103 break;
2104 }
2105 last_report_ = total_ops_done_snapshot;
2106 }
2107 }
2108
2109 Env* env_;
2110 std::unique_ptr<WritableFile> report_file_;
2111 std::atomic<int64_t> total_ops_done_;
2112 int64_t last_report_;
2113 const uint64_t report_interval_secs_;
2114 ROCKSDB_NAMESPACE::port::Thread reporting_thread_;
2115 std::mutex mutex_;
2116 // will notify on stop
2117 std::condition_variable stop_cv_;
2118 bool stop_;
2119 };
2120
2121 enum OperationType : unsigned char {
2122 kRead = 0,
2123 kWrite,
2124 kDelete,
2125 kSeek,
2126 kMerge,
2127 kUpdate,
2128 kCompress,
2129 kUncompress,
2130 kCrc,
2131 kHash,
2132 kOthers
2133 };
2134
2135 static std::unordered_map<OperationType, std::string, std::hash<unsigned char>>
2136 OperationTypeString = {{kRead, "read"}, {kWrite, "write"},
2137 {kDelete, "delete"}, {kSeek, "seek"},
2138 {kMerge, "merge"}, {kUpdate, "update"},
2139 {kCompress, "compress"}, {kCompress, "uncompress"},
2140 {kCrc, "crc"}, {kHash, "hash"},
2141 {kOthers, "op"}};
2142
2143 class CombinedStats;
2144 class Stats {
2145 private:
2146 SystemClock* clock_;
2147 int id_;
2148 uint64_t start_ = 0;
2149 uint64_t sine_interval_;
2150 uint64_t finish_;
2151 double seconds_;
2152 uint64_t done_;
2153 uint64_t last_report_done_;
2154 uint64_t next_report_;
2155 uint64_t bytes_;
2156 uint64_t last_op_finish_;
2157 uint64_t last_report_finish_;
2158 std::unordered_map<OperationType, std::shared_ptr<HistogramImpl>,
2159 std::hash<unsigned char>>
2160 hist_;
2161 std::string message_;
2162 bool exclude_from_merge_;
2163 ReporterAgent* reporter_agent_; // does not own
2164 friend class CombinedStats;
2165
2166 public:
2167 Stats() : clock_(FLAGS_env->GetSystemClock().get()) { Start(-1); }
2168
2169 void SetReporterAgent(ReporterAgent* reporter_agent) {
2170 reporter_agent_ = reporter_agent;
2171 }
2172
2173 void Start(int id) {
2174 id_ = id;
2175 next_report_ = FLAGS_stats_interval ? FLAGS_stats_interval : 100;
2176 last_op_finish_ = start_;
2177 hist_.clear();
2178 done_ = 0;
2179 last_report_done_ = 0;
2180 bytes_ = 0;
2181 seconds_ = 0;
2182 start_ = clock_->NowMicros();
2183 sine_interval_ = clock_->NowMicros();
2184 finish_ = start_;
2185 last_report_finish_ = start_;
2186 message_.clear();
2187 // When set, stats from this thread won't be merged with others.
2188 exclude_from_merge_ = false;
2189 }
2190
2191 void Merge(const Stats& other) {
2192 if (other.exclude_from_merge_) return;
2193
2194 for (auto it = other.hist_.begin(); it != other.hist_.end(); ++it) {
2195 auto this_it = hist_.find(it->first);
2196 if (this_it != hist_.end()) {
2197 this_it->second->Merge(*(other.hist_.at(it->first)));
2198 } else {
2199 hist_.insert({it->first, it->second});
2200 }
2201 }
2202
2203 done_ += other.done_;
2204 bytes_ += other.bytes_;
2205 seconds_ += other.seconds_;
2206 if (other.start_ < start_) start_ = other.start_;
2207 if (other.finish_ > finish_) finish_ = other.finish_;
2208
2209 // Just keep the messages from one thread.
2210 if (message_.empty()) message_ = other.message_;
2211 }
2212
2213 void Stop() {
2214 finish_ = clock_->NowMicros();
2215 seconds_ = (finish_ - start_) * 1e-6;
2216 }
2217
2218 void AddMessage(Slice msg) { AppendWithSpace(&message_, msg); }
2219
2220 void SetId(int id) { id_ = id; }
2221 void SetExcludeFromMerge() { exclude_from_merge_ = true; }
2222
2223 void PrintThreadStatus() {
2224 std::vector<ThreadStatus> thread_list;
2225 FLAGS_env->GetThreadList(&thread_list);
2226
2227 fprintf(stderr, "\n%18s %10s %12s %20s %13s %45s %12s %s\n", "ThreadID",
2228 "ThreadType", "cfName", "Operation", "ElapsedTime", "Stage",
2229 "State", "OperationProperties");
2230
2231 int64_t current_time = 0;
2232 clock_->GetCurrentTime(&current_time).PermitUncheckedError();
2233 for (auto ts : thread_list) {
2234 fprintf(stderr, "%18" PRIu64 " %10s %12s %20s %13s %45s %12s",
2235 ts.thread_id,
2236 ThreadStatus::GetThreadTypeName(ts.thread_type).c_str(),
2237 ts.cf_name.c_str(),
2238 ThreadStatus::GetOperationName(ts.operation_type).c_str(),
2239 ThreadStatus::MicrosToString(ts.op_elapsed_micros).c_str(),
2240 ThreadStatus::GetOperationStageName(ts.operation_stage).c_str(),
2241 ThreadStatus::GetStateName(ts.state_type).c_str());
2242
2243 auto op_properties = ThreadStatus::InterpretOperationProperties(
2244 ts.operation_type, ts.op_properties);
2245 for (const auto& op_prop : op_properties) {
2246 fprintf(stderr, " %s %" PRIu64 " |", op_prop.first.c_str(),
2247 op_prop.second);
2248 }
2249 fprintf(stderr, "\n");
2250 }
2251 }
2252
2253 void ResetSineInterval() { sine_interval_ = clock_->NowMicros(); }
2254
2255 uint64_t GetSineInterval() { return sine_interval_; }
2256
2257 uint64_t GetStart() { return start_; }
2258
2259 void ResetLastOpTime() {
2260 // Set to now to avoid latency from calls to SleepForMicroseconds.
2261 last_op_finish_ = clock_->NowMicros();
2262 }
2263
2264 void FinishedOps(DBWithColumnFamilies* db_with_cfh, DB* db, int64_t num_ops,
2265 enum OperationType op_type = kOthers) {
2266 if (reporter_agent_) {
2267 reporter_agent_->ReportFinishedOps(num_ops);
2268 }
2269 if (FLAGS_histogram) {
2270 uint64_t now = clock_->NowMicros();
2271 uint64_t micros = now - last_op_finish_;
2272
2273 if (hist_.find(op_type) == hist_.end()) {
2274 auto hist_temp = std::make_shared<HistogramImpl>();
2275 hist_.insert({op_type, std::move(hist_temp)});
2276 }
2277 hist_[op_type]->Add(micros);
2278
2279 if (micros >= FLAGS_slow_usecs && !FLAGS_stats_interval) {
2280 fprintf(stderr, "long op: %" PRIu64 " micros%30s\r", micros, "");
2281 fflush(stderr);
2282 }
2283 last_op_finish_ = now;
2284 }
2285
2286 done_ += num_ops;
2287 if (done_ >= next_report_ && FLAGS_progress_reports) {
2288 if (!FLAGS_stats_interval) {
2289 if (next_report_ < 1000)
2290 next_report_ += 100;
2291 else if (next_report_ < 5000)
2292 next_report_ += 500;
2293 else if (next_report_ < 10000)
2294 next_report_ += 1000;
2295 else if (next_report_ < 50000)
2296 next_report_ += 5000;
2297 else if (next_report_ < 100000)
2298 next_report_ += 10000;
2299 else if (next_report_ < 500000)
2300 next_report_ += 50000;
2301 else
2302 next_report_ += 100000;
2303 fprintf(stderr, "... finished %" PRIu64 " ops%30s\r", done_, "");
2304 } else {
2305 uint64_t now = clock_->NowMicros();
2306 int64_t usecs_since_last = now - last_report_finish_;
2307
2308 // Determine whether to print status where interval is either
2309 // each N operations or each N seconds.
2310
2311 if (FLAGS_stats_interval_seconds &&
2312 usecs_since_last < (FLAGS_stats_interval_seconds * 1000000)) {
2313 // Don't check again for this many operations.
2314 next_report_ += FLAGS_stats_interval;
2315
2316 } else {
2317 fprintf(stderr,
2318 "%s ... thread %d: (%" PRIu64 ",%" PRIu64
2319 ") ops and "
2320 "(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n",
2321 clock_->TimeToString(now / 1000000).c_str(), id_,
2322 done_ - last_report_done_, done_,
2323 (done_ - last_report_done_) / (usecs_since_last / 1000000.0),
2324 done_ / ((now - start_) / 1000000.0),
2325 (now - last_report_finish_) / 1000000.0,
2326 (now - start_) / 1000000.0);
2327
2328 if (id_ == 0 && FLAGS_stats_per_interval) {
2329 std::string stats;
2330
2331 if (db_with_cfh && db_with_cfh->num_created.load()) {
2332 for (size_t i = 0; i < db_with_cfh->num_created.load(); ++i) {
2333 if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats",
2334 &stats))
2335 fprintf(stderr, "%s\n", stats.c_str());
2336 if (FLAGS_show_table_properties) {
2337 for (int level = 0; level < FLAGS_num_levels; ++level) {
2338 if (db->GetProperty(
2339 db_with_cfh->cfh[i],
2340 "rocksdb.aggregated-table-properties-at-level" +
2341 std::to_string(level),
2342 &stats)) {
2343 if (stats.find("# entries=0") == std::string::npos) {
2344 fprintf(stderr, "Level[%d]: %s\n", level,
2345 stats.c_str());
2346 }
2347 }
2348 }
2349 }
2350 }
2351 } else if (db) {
2352 if (db->GetProperty("rocksdb.stats", &stats)) {
2353 fprintf(stderr, "%s", stats.c_str());
2354 }
2355 if (db->GetProperty("rocksdb.num-running-compactions", &stats)) {
2356 fprintf(stderr, "num-running-compactions: %s\n", stats.c_str());
2357 }
2358 if (db->GetProperty("rocksdb.num-running-flushes", &stats)) {
2359 fprintf(stderr, "num-running-flushes: %s\n\n", stats.c_str());
2360 }
2361 if (FLAGS_show_table_properties) {
2362 for (int level = 0; level < FLAGS_num_levels; ++level) {
2363 if (db->GetProperty(
2364 "rocksdb.aggregated-table-properties-at-level" +
2365 std::to_string(level),
2366 &stats)) {
2367 if (stats.find("# entries=0") == std::string::npos) {
2368 fprintf(stderr, "Level[%d]: %s\n", level, stats.c_str());
2369 }
2370 }
2371 }
2372 }
2373 }
2374 }
2375
2376 next_report_ += FLAGS_stats_interval;
2377 last_report_finish_ = now;
2378 last_report_done_ = done_;
2379 }
2380 }
2381 if (id_ == 0 && FLAGS_thread_status_per_interval) {
2382 PrintThreadStatus();
2383 }
2384 fflush(stderr);
2385 }
2386 }
2387
2388 void AddBytes(int64_t n) { bytes_ += n; }
2389
2390 void Report(const Slice& name) {
2391 // Pretend at least one op was done in case we are running a benchmark
2392 // that does not call FinishedOps().
2393 if (done_ < 1) done_ = 1;
2394
2395 std::string extra;
2396 double elapsed = (finish_ - start_) * 1e-6;
2397 if (bytes_ > 0) {
2398 // Rate is computed on actual elapsed time, not the sum of per-thread
2399 // elapsed times.
2400 char rate[100];
2401 snprintf(rate, sizeof(rate), "%6.1f MB/s",
2402 (bytes_ / 1048576.0) / elapsed);
2403 extra = rate;
2404 }
2405 AppendWithSpace(&extra, message_);
2406 double throughput = (double)done_ / elapsed;
2407
2408 fprintf(stdout,
2409 "%-12s : %11.3f micros/op %ld ops/sec %.3f seconds %" PRIu64
2410 " operations;%s%s\n",
2411 name.ToString().c_str(), seconds_ * 1e6 / done_, (long)throughput,
2412 elapsed, done_, (extra.empty() ? "" : " "), extra.c_str());
2413 if (FLAGS_histogram) {
2414 for (auto it = hist_.begin(); it != hist_.end(); ++it) {
2415 fprintf(stdout, "Microseconds per %s:\n%s\n",
2416 OperationTypeString[it->first].c_str(),
2417 it->second->ToString().c_str());
2418 }
2419 }
2420 if (FLAGS_report_file_operations) {
2421 auto* counted_fs =
2422 FLAGS_env->GetFileSystem()->CheckedCast<CountedFileSystem>();
2423 assert(counted_fs);
2424 fprintf(stdout, "%s", counted_fs->PrintCounters().c_str());
2425 counted_fs->ResetCounters();
2426 }
2427 fflush(stdout);
2428 }
2429 };
2430
2431 class CombinedStats {
2432 public:
2433 void AddStats(const Stats& stat) {
2434 uint64_t total_ops = stat.done_;
2435 uint64_t total_bytes_ = stat.bytes_;
2436 double elapsed;
2437
2438 if (total_ops < 1) {
2439 total_ops = 1;
2440 }
2441
2442 elapsed = (stat.finish_ - stat.start_) * 1e-6;
2443 throughput_ops_.emplace_back(total_ops / elapsed);
2444
2445 if (total_bytes_ > 0) {
2446 double mbs = (total_bytes_ / 1048576.0);
2447 throughput_mbs_.emplace_back(mbs / elapsed);
2448 }
2449 }
2450
2451 void Report(const std::string& bench_name) {
2452 if (throughput_ops_.size() < 2) {
2453 // skip if there are not enough samples
2454 return;
2455 }
2456
2457 const char* name = bench_name.c_str();
2458 int num_runs = static_cast<int>(throughput_ops_.size());
2459
2460 if (throughput_mbs_.size() == throughput_ops_.size()) {
2461 fprintf(stdout,
2462 "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec; %6.1f (\xC2\xB1 "
2463 "%.1f) MB/sec\n",
2464 name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
2465 static_cast<int>(CalcConfidence95(throughput_ops_)),
2466 CalcAvg(throughput_mbs_), CalcConfidence95(throughput_mbs_));
2467 } else {
2468 fprintf(stdout, "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec\n", name,
2469 num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
2470 static_cast<int>(CalcConfidence95(throughput_ops_)));
2471 }
2472 }
2473
2474 void ReportWithConfidenceIntervals(const std::string& bench_name) {
2475 if (throughput_ops_.size() < 2) {
2476 // skip if there are not enough samples
2477 return;
2478 }
2479
2480 const char* name = bench_name.c_str();
2481 int num_runs = static_cast<int>(throughput_ops_.size());
2482
2483 int ops_avg = static_cast<int>(CalcAvg(throughput_ops_));
2484 int ops_confidence_95 = static_cast<int>(CalcConfidence95(throughput_ops_));
2485
2486 if (throughput_mbs_.size() == throughput_ops_.size()) {
2487 double mbs_avg = CalcAvg(throughput_mbs_);
2488 double mbs_confidence_95 = CalcConfidence95(throughput_mbs_);
2489 fprintf(stdout,
2490 "%s [CI95 %d runs] : (%d, %d) ops/sec; (%.1f, %.1f) MB/sec\n",
2491 name, num_runs, ops_avg - ops_confidence_95,
2492 ops_avg + ops_confidence_95, mbs_avg - mbs_confidence_95,
2493 mbs_avg + mbs_confidence_95);
2494 } else {
2495 fprintf(stdout, "%s [CI95 %d runs] : (%d, %d) ops/sec\n", name, num_runs,
2496 ops_avg - ops_confidence_95, ops_avg + ops_confidence_95);
2497 }
2498 }
2499
2500 void ReportFinal(const std::string& bench_name) {
2501 if (throughput_ops_.size() < 2) {
2502 // skip if there are not enough samples
2503 return;
2504 }
2505
2506 const char* name = bench_name.c_str();
2507 int num_runs = static_cast<int>(throughput_ops_.size());
2508
2509 if (throughput_mbs_.size() == throughput_ops_.size()) {
2510 // \xC2\xB1 is +/- character in UTF-8
2511 fprintf(stdout,
2512 "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec; %6.1f (\xC2\xB1 "
2513 "%.1f) MB/sec\n"
2514 "%s [MEDIAN %d runs] : %d ops/sec; %6.1f MB/sec\n",
2515 name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
2516 static_cast<int>(CalcConfidence95(throughput_ops_)),
2517 CalcAvg(throughput_mbs_), CalcConfidence95(throughput_mbs_), name,
2518 num_runs, static_cast<int>(CalcMedian(throughput_ops_)),
2519 CalcMedian(throughput_mbs_));
2520 } else {
2521 fprintf(stdout,
2522 "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec\n"
2523 "%s [MEDIAN %d runs] : %d ops/sec\n",
2524 name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
2525 static_cast<int>(CalcConfidence95(throughput_ops_)), name,
2526 num_runs, static_cast<int>(CalcMedian(throughput_ops_)));
2527 }
2528 }
2529
2530 private:
2531 double CalcAvg(std::vector<double>& data) {
2532 double avg = 0;
2533 for (double x : data) {
2534 avg += x;
2535 }
2536 avg = avg / data.size();
2537 return avg;
2538 }
2539
2540 // Calculates 95% CI assuming a normal distribution of samples.
2541 // Samples are not from a normal distribution, but it still
2542 // provides useful approximation.
2543 double CalcConfidence95(std::vector<double>& data) {
2544 assert(data.size() > 1);
2545 double avg = CalcAvg(data);
2546 double std_error = CalcStdDev(data, avg) / std::sqrt(data.size());
2547
2548 // Z score for the 97.5 percentile
2549 // see https://en.wikipedia.org/wiki/1.96
2550 return 1.959964 * std_error;
2551 }
2552
2553 double CalcMedian(std::vector<double>& data) {
2554 assert(data.size() > 0);
2555 std::sort(data.begin(), data.end());
2556
2557 size_t mid = data.size() / 2;
2558 if (data.size() % 2 == 1) {
2559 // Odd number of entries
2560 return data[mid];
2561 } else {
2562 // Even number of entries
2563 return (data[mid] + data[mid - 1]) / 2;
2564 }
2565 }
2566
2567 double CalcStdDev(std::vector<double>& data, double average) {
2568 assert(data.size() > 1);
2569 double squared_sum = 0.0;
2570 for (double x : data) {
2571 squared_sum += std::pow(x - average, 2);
2572 }
2573
2574 // using samples count - 1 following Bessel's correction
2575 // see https://en.wikipedia.org/wiki/Bessel%27s_correction
2576 return std::sqrt(squared_sum / (data.size() - 1));
2577 }
2578
2579 std::vector<double> throughput_ops_;
2580 std::vector<double> throughput_mbs_;
2581 };
2582
2583 class TimestampEmulator {
2584 private:
2585 std::atomic<uint64_t> timestamp_;
2586
2587 public:
2588 TimestampEmulator() : timestamp_(0) {}
2589 uint64_t Get() const { return timestamp_.load(); }
2590 void Inc() { timestamp_++; }
2591 Slice Allocate(char* scratch) {
2592 // TODO: support larger timestamp sizes
2593 assert(FLAGS_user_timestamp_size == 8);
2594 assert(scratch);
2595 uint64_t ts = timestamp_.fetch_add(1);
2596 EncodeFixed64(scratch, ts);
2597 return Slice(scratch, FLAGS_user_timestamp_size);
2598 }
2599 Slice GetTimestampForRead(Random64& rand, char* scratch) {
2600 assert(FLAGS_user_timestamp_size == 8);
2601 assert(scratch);
2602 if (FLAGS_read_with_latest_user_timestamp) {
2603 return Allocate(scratch);
2604 }
2605 // Choose a random timestamp from the past.
2606 uint64_t ts = rand.Next() % Get();
2607 EncodeFixed64(scratch, ts);
2608 return Slice(scratch, FLAGS_user_timestamp_size);
2609 }
2610 };
2611
2612 // State shared by all concurrent executions of the same benchmark.
2613 struct SharedState {
2614 port::Mutex mu;
2615 port::CondVar cv;
2616 int total;
2617 int perf_level;
2618 std::shared_ptr<RateLimiter> write_rate_limiter;
2619 std::shared_ptr<RateLimiter> read_rate_limiter;
2620
2621 // Each thread goes through the following states:
2622 // (1) initializing
2623 // (2) waiting for others to be initialized
2624 // (3) running
2625 // (4) done
2626
2627 long num_initialized;
2628 long num_done;
2629 bool start;
2630
2631 SharedState() : cv(&mu), perf_level(FLAGS_perf_level) {}
2632 };
2633
2634 // Per-thread state for concurrent executions of the same benchmark.
2635 struct ThreadState {
2636 int tid; // 0..n-1 when running in n threads
2637 Random64 rand; // Has different seeds for different threads
2638 Stats stats;
2639 SharedState* shared;
2640
2641 explicit ThreadState(int index, int my_seed)
2642 : tid(index), rand(seed_base + my_seed) {}
2643 };
2644
2645 class Duration {
2646 public:
2647 Duration(uint64_t max_seconds, int64_t max_ops, int64_t ops_per_stage = 0) {
2648 max_seconds_ = max_seconds;
2649 max_ops_ = max_ops;
2650 ops_per_stage_ = (ops_per_stage > 0) ? ops_per_stage : max_ops;
2651 ops_ = 0;
2652 start_at_ = FLAGS_env->NowMicros();
2653 }
2654
2655 int64_t GetStage() { return std::min(ops_, max_ops_ - 1) / ops_per_stage_; }
2656
2657 bool Done(int64_t increment) {
2658 if (increment <= 0) increment = 1; // avoid Done(0) and infinite loops
2659 ops_ += increment;
2660
2661 if (max_seconds_) {
2662 // Recheck every appx 1000 ops (exact iff increment is factor of 1000)
2663 auto granularity = FLAGS_ops_between_duration_checks;
2664 if ((ops_ / granularity) != ((ops_ - increment) / granularity)) {
2665 uint64_t now = FLAGS_env->NowMicros();
2666 return ((now - start_at_) / 1000000) >= max_seconds_;
2667 } else {
2668 return false;
2669 }
2670 } else {
2671 return ops_ > max_ops_;
2672 }
2673 }
2674
2675 private:
2676 uint64_t max_seconds_;
2677 int64_t max_ops_;
2678 int64_t ops_per_stage_;
2679 int64_t ops_;
2680 uint64_t start_at_;
2681 };
2682
2683 class Benchmark {
2684 private:
2685 std::shared_ptr<Cache> cache_;
2686 std::shared_ptr<Cache> compressed_cache_;
2687 std::shared_ptr<const SliceTransform> prefix_extractor_;
2688 DBWithColumnFamilies db_;
2689 std::vector<DBWithColumnFamilies> multi_dbs_;
2690 int64_t num_;
2691 int key_size_;
2692 int user_timestamp_size_;
2693 int prefix_size_;
2694 int total_thread_count_;
2695 int64_t keys_per_prefix_;
2696 int64_t entries_per_batch_;
2697 int64_t writes_before_delete_range_;
2698 int64_t writes_per_range_tombstone_;
2699 int64_t range_tombstone_width_;
2700 int64_t max_num_range_tombstones_;
2701 ReadOptions read_options_;
2702 WriteOptions write_options_;
2703 Options open_options_; // keep options around to properly destroy db later
2704 #ifndef ROCKSDB_LITE
2705 TraceOptions trace_options_;
2706 TraceOptions block_cache_trace_options_;
2707 #endif
2708 int64_t reads_;
2709 int64_t deletes_;
2710 double read_random_exp_range_;
2711 int64_t writes_;
2712 int64_t readwrites_;
2713 int64_t merge_keys_;
2714 bool report_file_operations_;
2715 bool use_blob_db_; // Stacked BlobDB
2716 bool read_operands_; // read via GetMergeOperands()
2717 std::vector<std::string> keys_;
2718
2719 class ErrorHandlerListener : public EventListener {
2720 public:
2721 #ifndef ROCKSDB_LITE
2722 ErrorHandlerListener()
2723 : mutex_(),
2724 cv_(&mutex_),
2725 no_auto_recovery_(false),
2726 recovery_complete_(false) {}
2727
2728 ~ErrorHandlerListener() override {}
2729
2730 const char* Name() const override { return kClassName(); }
2731 static const char* kClassName() { return "ErrorHandlerListener"; }
2732
2733 void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/,
2734 Status /*bg_error*/,
2735 bool* auto_recovery) override {
2736 if (*auto_recovery && no_auto_recovery_) {
2737 *auto_recovery = false;
2738 }
2739 }
2740
2741 void OnErrorRecoveryCompleted(Status /*old_bg_error*/) override {
2742 InstrumentedMutexLock l(&mutex_);
2743 recovery_complete_ = true;
2744 cv_.SignalAll();
2745 }
2746
2747 bool WaitForRecovery(uint64_t abs_time_us) {
2748 InstrumentedMutexLock l(&mutex_);
2749 if (!recovery_complete_) {
2750 cv_.TimedWait(abs_time_us);
2751 }
2752 if (recovery_complete_) {
2753 recovery_complete_ = false;
2754 return true;
2755 }
2756 return false;
2757 }
2758
2759 void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; }
2760
2761 private:
2762 InstrumentedMutex mutex_;
2763 InstrumentedCondVar cv_;
2764 bool no_auto_recovery_;
2765 bool recovery_complete_;
2766 #else // ROCKSDB_LITE
2767 bool WaitForRecovery(uint64_t /*abs_time_us*/) { return true; }
2768 void EnableAutoRecovery(bool /*enable*/) {}
2769 #endif // ROCKSDB_LITE
2770 };
2771
2772 std::shared_ptr<ErrorHandlerListener> listener_;
2773
2774 std::unique_ptr<TimestampEmulator> mock_app_clock_;
2775
2776 bool SanityCheck() {
2777 if (FLAGS_compression_ratio > 1) {
2778 fprintf(stderr, "compression_ratio should be between 0 and 1\n");
2779 return false;
2780 }
2781 return true;
2782 }
2783
2784 inline bool CompressSlice(const CompressionInfo& compression_info,
2785 const Slice& input, std::string* compressed) {
2786 constexpr uint32_t compress_format_version = 2;
2787
2788 return CompressData(input, compression_info, compress_format_version,
2789 compressed);
2790 }
2791
2792 void PrintHeader(const Options& options) {
2793 PrintEnvironment();
2794 fprintf(stdout,
2795 "Keys: %d bytes each (+ %d bytes user-defined timestamp)\n",
2796 FLAGS_key_size, FLAGS_user_timestamp_size);
2797 auto avg_value_size = FLAGS_value_size;
2798 if (FLAGS_value_size_distribution_type_e == kFixed) {
2799 fprintf(stdout,
2800 "Values: %d bytes each (%d bytes after compression)\n",
2801 avg_value_size,
2802 static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
2803 } else {
2804 avg_value_size = (FLAGS_value_size_min + FLAGS_value_size_max) / 2;
2805 fprintf(stdout,
2806 "Values: %d avg bytes each (%d bytes after compression)\n",
2807 avg_value_size,
2808 static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
2809 fprintf(stdout, "Values Distribution: %s (min: %d, max: %d)\n",
2810 FLAGS_value_size_distribution_type.c_str(), FLAGS_value_size_min,
2811 FLAGS_value_size_max);
2812 }
2813 fprintf(stdout, "Entries: %" PRIu64 "\n", num_);
2814 fprintf(stdout, "Prefix: %d bytes\n", FLAGS_prefix_size);
2815 fprintf(stdout, "Keys per prefix: %" PRIu64 "\n", keys_per_prefix_);
2816 fprintf(stdout, "RawSize: %.1f MB (estimated)\n",
2817 ((static_cast<int64_t>(FLAGS_key_size + avg_value_size) * num_) /
2818 1048576.0));
2819 fprintf(
2820 stdout, "FileSize: %.1f MB (estimated)\n",
2821 (((FLAGS_key_size + avg_value_size * FLAGS_compression_ratio) * num_) /
2822 1048576.0));
2823 fprintf(stdout, "Write rate: %" PRIu64 " bytes/second\n",
2824 FLAGS_benchmark_write_rate_limit);
2825 fprintf(stdout, "Read rate: %" PRIu64 " ops/second\n",
2826 FLAGS_benchmark_read_rate_limit);
2827 if (FLAGS_enable_numa) {
2828 fprintf(stderr, "Running in NUMA enabled mode.\n");
2829 #ifndef NUMA
2830 fprintf(stderr, "NUMA is not defined in the system.\n");
2831 exit(1);
2832 #else
2833 if (numa_available() == -1) {
2834 fprintf(stderr, "NUMA is not supported by the system.\n");
2835 exit(1);
2836 }
2837 #endif
2838 }
2839
2840 auto compression = CompressionTypeToString(FLAGS_compression_type_e);
2841 fprintf(stdout, "Compression: %s\n", compression.c_str());
2842 fprintf(stdout, "Compression sampling rate: %" PRId64 "\n",
2843 FLAGS_sample_for_compression);
2844 if (options.memtable_factory != nullptr) {
2845 fprintf(stdout, "Memtablerep: %s\n",
2846 options.memtable_factory->GetId().c_str());
2847 }
2848 fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);
2849
2850 PrintWarnings(compression.c_str());
2851 fprintf(stdout, "------------------------------------------------\n");
2852 }
2853
2854 void PrintWarnings(const char* compression) {
2855 #if defined(__GNUC__) && !defined(__OPTIMIZE__)
2856 fprintf(
2857 stdout,
2858 "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n");
2859 #endif
2860 #ifndef NDEBUG
2861 fprintf(stdout,
2862 "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
2863 #endif
2864 if (FLAGS_compression_type_e != ROCKSDB_NAMESPACE::kNoCompression) {
2865 // The test string should not be too small.
2866 const int len = FLAGS_block_size;
2867 std::string input_str(len, 'y');
2868 std::string compressed;
2869 CompressionOptions opts;
2870 CompressionContext context(FLAGS_compression_type_e);
2871 CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
2872 FLAGS_compression_type_e,
2873 FLAGS_sample_for_compression);
2874 bool result = CompressSlice(info, Slice(input_str), &compressed);
2875
2876 if (!result) {
2877 fprintf(stdout, "WARNING: %s compression is not enabled\n",
2878 compression);
2879 } else if (compressed.size() >= input_str.size()) {
2880 fprintf(stdout, "WARNING: %s compression is not effective\n",
2881 compression);
2882 }
2883 }
2884 }
2885
2886 // Current the following isn't equivalent to OS_LINUX.
2887 #if defined(__linux)
2888 static Slice TrimSpace(Slice s) {
2889 unsigned int start = 0;
2890 while (start < s.size() && isspace(s[start])) {
2891 start++;
2892 }
2893 unsigned int limit = static_cast<unsigned int>(s.size());
2894 while (limit > start && isspace(s[limit - 1])) {
2895 limit--;
2896 }
2897 return Slice(s.data() + start, limit - start);
2898 }
2899 #endif
2900
2901 void PrintEnvironment() {
2902 fprintf(stderr, "RocksDB: version %s\n",
2903 GetRocksVersionAsString(true).c_str());
2904
2905 #if defined(__linux) || defined(__APPLE__) || defined(__FreeBSD__)
2906 time_t now = time(nullptr);
2907 char buf[52];
2908 // Lint complains about ctime() usage, so replace it with ctime_r(). The
2909 // requirement is to provide a buffer which is at least 26 bytes.
2910 fprintf(stderr, "Date: %s",
2911 ctime_r(&now, buf)); // ctime_r() adds newline
2912
2913 #if defined(__linux)
2914 FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
2915 if (cpuinfo != nullptr) {
2916 char line[1000];
2917 int num_cpus = 0;
2918 std::string cpu_type;
2919 std::string cache_size;
2920 while (fgets(line, sizeof(line), cpuinfo) != nullptr) {
2921 const char* sep = strchr(line, ':');
2922 if (sep == nullptr) {
2923 continue;
2924 }
2925 Slice key = TrimSpace(Slice(line, sep - 1 - line));
2926 Slice val = TrimSpace(Slice(sep + 1));
2927 if (key == "model name") {
2928 ++num_cpus;
2929 cpu_type = val.ToString();
2930 } else if (key == "cache size") {
2931 cache_size = val.ToString();
2932 }
2933 }
2934 fclose(cpuinfo);
2935 fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str());
2936 fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
2937 }
2938 #elif defined(__APPLE__)
2939 struct host_basic_info h;
2940 size_t hlen = HOST_BASIC_INFO_COUNT;
2941 if (host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&h,
2942 (uint32_t*)&hlen) == KERN_SUCCESS) {
2943 std::string cpu_type;
2944 std::string cache_size;
2945 size_t hcache_size;
2946 hlen = sizeof(hcache_size);
2947 if (sysctlbyname("hw.cachelinesize", &hcache_size, &hlen, NULL, 0) == 0) {
2948 cache_size = std::to_string(hcache_size);
2949 }
2950 switch (h.cpu_type) {
2951 case CPU_TYPE_X86_64:
2952 cpu_type = "x86_64";
2953 break;
2954 case CPU_TYPE_ARM64:
2955 cpu_type = "arm64";
2956 break;
2957 default:
2958 break;
2959 }
2960 fprintf(stderr, "CPU: %d * %s\n", h.max_cpus, cpu_type.c_str());
2961 fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
2962 }
2963 #elif defined(__FreeBSD__)
2964 int ncpus;
2965 size_t len = sizeof(ncpus);
2966 int mib[2] = {CTL_HW, HW_NCPU};
2967 if (sysctl(mib, 2, &ncpus, &len, nullptr, 0) == 0) {
2968 char cpu_type[16];
2969 len = sizeof(cpu_type) - 1;
2970 mib[1] = HW_MACHINE;
2971 if (sysctl(mib, 2, cpu_type, &len, nullptr, 0) == 0) cpu_type[len] = 0;
2972
2973 fprintf(stderr, "CPU: %d * %s\n", ncpus, cpu_type);
2974 // no programmatic way to get the cache line size except on PPC
2975 }
2976 #endif
2977 #endif
2978 }
2979
2980 static bool KeyExpired(const TimestampEmulator* timestamp_emulator,
2981 const Slice& key) {
2982 const char* pos = key.data();
2983 pos += 8;
2984 uint64_t timestamp = 0;
2985 if (port::kLittleEndian) {
2986 int bytes_to_fill = 8;
2987 for (int i = 0; i < bytes_to_fill; ++i) {
2988 timestamp |= (static_cast<uint64_t>(static_cast<unsigned char>(pos[i]))
2989 << ((bytes_to_fill - i - 1) << 3));
2990 }
2991 } else {
2992 memcpy(&timestamp, pos, sizeof(timestamp));
2993 }
2994 return timestamp_emulator->Get() - timestamp > FLAGS_time_range;
2995 }
2996
2997 class ExpiredTimeFilter : public CompactionFilter {
2998 public:
2999 explicit ExpiredTimeFilter(
3000 const std::shared_ptr<TimestampEmulator>& timestamp_emulator)
3001 : timestamp_emulator_(timestamp_emulator) {}
3002 bool Filter(int /*level*/, const Slice& key,
3003 const Slice& /*existing_value*/, std::string* /*new_value*/,
3004 bool* /*value_changed*/) const override {
3005 return KeyExpired(timestamp_emulator_.get(), key);
3006 }
3007 const char* Name() const override { return "ExpiredTimeFilter"; }
3008
3009 private:
3010 std::shared_ptr<TimestampEmulator> timestamp_emulator_;
3011 };
3012
3013 class KeepFilter : public CompactionFilter {
3014 public:
3015 bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
3016 std::string* /*new_value*/,
3017 bool* /*value_changed*/) const override {
3018 return false;
3019 }
3020
3021 const char* Name() const override { return "KeepFilter"; }
3022 };
3023
3024 static std::shared_ptr<MemoryAllocator> GetCacheAllocator() {
3025 std::shared_ptr<MemoryAllocator> allocator;
3026
3027 if (FLAGS_use_cache_jemalloc_no_dump_allocator) {
3028 JemallocAllocatorOptions jemalloc_options;
3029 if (!NewJemallocNodumpAllocator(jemalloc_options, &allocator).ok()) {
3030 fprintf(stderr, "JemallocNodumpAllocator not supported.\n");
3031 exit(1);
3032 }
3033 } else if (FLAGS_use_cache_memkind_kmem_allocator) {
3034 #ifdef MEMKIND
3035 allocator = std::make_shared<MemkindKmemAllocator>();
3036 #else
3037 fprintf(stderr, "Memkind library is not linked with the binary.\n");
3038 exit(1);
3039 #endif
3040 }
3041
3042 return allocator;
3043 }
3044
3045 static std::shared_ptr<Cache> NewCache(int64_t capacity) {
3046 if (capacity <= 0) {
3047 return nullptr;
3048 }
3049 if (FLAGS_cache_type == "clock_cache") {
3050 fprintf(stderr, "Old clock cache implementation has been removed.\n");
3051 exit(1);
3052 } else if (FLAGS_cache_type == "hyper_clock_cache") {
3053 return HyperClockCacheOptions(static_cast<size_t>(capacity),
3054 FLAGS_block_size /*estimated_entry_charge*/,
3055 FLAGS_cache_numshardbits)
3056 .MakeSharedCache();
3057 } else if (FLAGS_cache_type == "lru_cache") {
3058 LRUCacheOptions opts(
3059 static_cast<size_t>(capacity), FLAGS_cache_numshardbits,
3060 false /*strict_capacity_limit*/, FLAGS_cache_high_pri_pool_ratio,
3061 GetCacheAllocator(), kDefaultToAdaptiveMutex,
3062 kDefaultCacheMetadataChargePolicy, FLAGS_cache_low_pri_pool_ratio);
3063
3064 #ifndef ROCKSDB_LITE
3065 if (!FLAGS_secondary_cache_uri.empty()) {
3066 Status s = SecondaryCache::CreateFromString(
3067 ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache);
3068 if (secondary_cache == nullptr) {
3069 fprintf(
3070 stderr,
3071 "No secondary cache registered matching string: %s status=%s\n",
3072 FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str());
3073 exit(1);
3074 }
3075 opts.secondary_cache = secondary_cache;
3076 }
3077 #endif // ROCKSDB_LITE
3078
3079 if (FLAGS_use_compressed_secondary_cache) {
3080 CompressedSecondaryCacheOptions secondary_cache_opts;
3081 secondary_cache_opts.capacity = FLAGS_compressed_secondary_cache_size;
3082 secondary_cache_opts.num_shard_bits =
3083 FLAGS_compressed_secondary_cache_numshardbits;
3084 secondary_cache_opts.high_pri_pool_ratio =
3085 FLAGS_compressed_secondary_cache_high_pri_pool_ratio;
3086 secondary_cache_opts.low_pri_pool_ratio =
3087 FLAGS_compressed_secondary_cache_low_pri_pool_ratio;
3088 secondary_cache_opts.compression_type =
3089 FLAGS_compressed_secondary_cache_compression_type_e;
3090 secondary_cache_opts.compress_format_version =
3091 FLAGS_compressed_secondary_cache_compress_format_version;
3092 opts.secondary_cache =
3093 NewCompressedSecondaryCache(secondary_cache_opts);
3094 }
3095
3096 return NewLRUCache(opts);
3097 } else {
3098 fprintf(stderr, "Cache type not supported.");
3099 exit(1);
3100 }
3101 }
3102
3103 public:
3104 Benchmark()
3105 : cache_(NewCache(FLAGS_cache_size)),
3106 compressed_cache_(NewCache(FLAGS_compressed_cache_size)),
3107 prefix_extractor_(FLAGS_prefix_size != 0
3108 ? NewFixedPrefixTransform(FLAGS_prefix_size)
3109 : nullptr),
3110 num_(FLAGS_num),
3111 key_size_(FLAGS_key_size),
3112 user_timestamp_size_(FLAGS_user_timestamp_size),
3113 prefix_size_(FLAGS_prefix_size),
3114 total_thread_count_(0),
3115 keys_per_prefix_(FLAGS_keys_per_prefix),
3116 entries_per_batch_(1),
3117 reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
3118 read_random_exp_range_(0.0),
3119 writes_(FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes),
3120 readwrites_(
3121 (FLAGS_writes < 0 && FLAGS_reads < 0)
3122 ? FLAGS_num
3123 : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)),
3124 merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys),
3125 report_file_operations_(FLAGS_report_file_operations),
3126 #ifndef ROCKSDB_LITE
3127 use_blob_db_(FLAGS_use_blob_db), // Stacked BlobDB
3128 #else
3129 use_blob_db_(false), // Stacked BlobDB
3130 #endif // !ROCKSDB_LITE
3131 read_operands_(false) {
3132 // use simcache instead of cache
3133 if (FLAGS_simcache_size >= 0) {
3134 if (FLAGS_cache_numshardbits >= 1) {
3135 cache_ =
3136 NewSimCache(cache_, FLAGS_simcache_size, FLAGS_cache_numshardbits);
3137 } else {
3138 cache_ = NewSimCache(cache_, FLAGS_simcache_size, 0);
3139 }
3140 }
3141
3142 if (report_file_operations_) {
3143 FLAGS_env = new CompositeEnvWrapper(
3144 FLAGS_env,
3145 std::make_shared<CountedFileSystem>(FLAGS_env->GetFileSystem()));
3146 }
3147
3148 if (FLAGS_prefix_size > FLAGS_key_size) {
3149 fprintf(stderr, "prefix size is larger than key size");
3150 exit(1);
3151 }
3152
3153 std::vector<std::string> files;
3154 FLAGS_env->GetChildren(FLAGS_db, &files);
3155 for (size_t i = 0; i < files.size(); i++) {
3156 if (Slice(files[i]).starts_with("heap-")) {
3157 FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
3158 }
3159 }
3160 if (!FLAGS_use_existing_db) {
3161 Options options;
3162 options.env = FLAGS_env;
3163 if (!FLAGS_wal_dir.empty()) {
3164 options.wal_dir = FLAGS_wal_dir;
3165 }
3166 #ifndef ROCKSDB_LITE
3167 if (use_blob_db_) {
3168 // Stacked BlobDB
3169 blob_db::DestroyBlobDB(FLAGS_db, options, blob_db::BlobDBOptions());
3170 }
3171 #endif // !ROCKSDB_LITE
3172 DestroyDB(FLAGS_db, options);
3173 if (!FLAGS_wal_dir.empty()) {
3174 FLAGS_env->DeleteDir(FLAGS_wal_dir);
3175 }
3176
3177 if (FLAGS_num_multi_db > 1) {
3178 FLAGS_env->CreateDir(FLAGS_db);
3179 if (!FLAGS_wal_dir.empty()) {
3180 FLAGS_env->CreateDir(FLAGS_wal_dir);
3181 }
3182 }
3183 }
3184
3185 listener_.reset(new ErrorHandlerListener());
3186 if (user_timestamp_size_ > 0) {
3187 mock_app_clock_.reset(new TimestampEmulator());
3188 }
3189 }
3190
3191 void DeleteDBs() {
3192 db_.DeleteDBs();
3193 for (const DBWithColumnFamilies& dbwcf : multi_dbs_) {
3194 delete dbwcf.db;
3195 }
3196 }
3197
3198 ~Benchmark() {
3199 DeleteDBs();
3200 if (cache_.get() != nullptr) {
3201 // Clear cache reference first
3202 open_options_.write_buffer_manager.reset();
3203 // this will leak, but we're shutting down so nobody cares
3204 cache_->DisownData();
3205 }
3206 }
3207
3208 Slice AllocateKey(std::unique_ptr<const char[]>* key_guard) {
3209 char* data = new char[key_size_];
3210 const char* const_data = data;
3211 key_guard->reset(const_data);
3212 return Slice(key_guard->get(), key_size_);
3213 }
3214
3215 // Generate key according to the given specification and random number.
3216 // The resulting key will have the following format:
3217 // - If keys_per_prefix_ is positive, extra trailing bytes are either cut
3218 // off or padded with '0'.
3219 // The prefix value is derived from key value.
3220 // ----------------------------
3221 // | prefix 00000 | key 00000 |
3222 // ----------------------------
3223 //
3224 // - If keys_per_prefix_ is 0, the key is simply a binary representation of
3225 // random number followed by trailing '0's
3226 // ----------------------------
3227 // | key 00000 |
3228 // ----------------------------
3229 void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) {
3230 if (!keys_.empty()) {
3231 assert(FLAGS_use_existing_keys);
3232 assert(keys_.size() == static_cast<size_t>(num_keys));
3233 assert(v < static_cast<uint64_t>(num_keys));
3234 *key = keys_[v];
3235 return;
3236 }
3237 char* start = const_cast<char*>(key->data());
3238 char* pos = start;
3239 if (keys_per_prefix_ > 0) {
3240 int64_t num_prefix = num_keys / keys_per_prefix_;
3241 int64_t prefix = v % num_prefix;
3242 int bytes_to_fill = std::min(prefix_size_, 8);
3243 if (port::kLittleEndian) {
3244 for (int i = 0; i < bytes_to_fill; ++i) {
3245 pos[i] = (prefix >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
3246 }
3247 } else {
3248 memcpy(pos, static_cast<void*>(&prefix), bytes_to_fill);
3249 }
3250 if (prefix_size_ > 8) {
3251 // fill the rest with 0s
3252 memset(pos + 8, '0', prefix_size_ - 8);
3253 }
3254 pos += prefix_size_;
3255 }
3256
3257 int bytes_to_fill = std::min(key_size_ - static_cast<int>(pos - start), 8);
3258 if (port::kLittleEndian) {
3259 for (int i = 0; i < bytes_to_fill; ++i) {
3260 pos[i] = (v >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
3261 }
3262 } else {
3263 memcpy(pos, static_cast<void*>(&v), bytes_to_fill);
3264 }
3265 pos += bytes_to_fill;
3266 if (key_size_ > pos - start) {
3267 memset(pos, '0', key_size_ - (pos - start));
3268 }
3269 }
3270
3271 void GenerateKeyFromIntForSeek(uint64_t v, int64_t num_keys, Slice* key) {
3272 GenerateKeyFromInt(v, num_keys, key);
3273 if (FLAGS_seek_missing_prefix) {
3274 assert(prefix_size_ > 8);
3275 char* key_ptr = const_cast<char*>(key->data());
3276 // This rely on GenerateKeyFromInt filling paddings with '0's.
3277 // Putting a '1' will create a non-existing prefix.
3278 key_ptr[8] = '1';
3279 }
3280 }
3281
3282 std::string GetPathForMultiple(std::string base_name, size_t id) {
3283 if (!base_name.empty()) {
3284 #ifndef OS_WIN
3285 if (base_name.back() != '/') {
3286 base_name += '/';
3287 }
3288 #else
3289 if (base_name.back() != '\\') {
3290 base_name += '\\';
3291 }
3292 #endif
3293 }
3294 return base_name + std::to_string(id);
3295 }
3296
3297 void VerifyDBFromDB(std::string& truth_db_name) {
3298 DBWithColumnFamilies truth_db;
3299 auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db);
3300 if (!s.ok()) {
3301 fprintf(stderr, "open error: %s\n", s.ToString().c_str());
3302 exit(1);
3303 }
3304 ReadOptions ro;
3305 ro.total_order_seek = true;
3306 std::unique_ptr<Iterator> truth_iter(truth_db.db->NewIterator(ro));
3307 std::unique_ptr<Iterator> db_iter(db_.db->NewIterator(ro));
3308 // Verify that all the key/values in truth_db are retrivable in db with
3309 // ::Get
3310 fprintf(stderr, "Verifying db >= truth_db with ::Get...\n");
3311 for (truth_iter->SeekToFirst(); truth_iter->Valid(); truth_iter->Next()) {
3312 std::string value;
3313 s = db_.db->Get(ro, truth_iter->key(), &value);
3314 assert(s.ok());
3315 // TODO(myabandeh): provide debugging hints
3316 assert(Slice(value) == truth_iter->value());
3317 }
3318 // Verify that the db iterator does not give any extra key/value
3319 fprintf(stderr, "Verifying db == truth_db...\n");
3320 for (db_iter->SeekToFirst(), truth_iter->SeekToFirst(); db_iter->Valid();
3321 db_iter->Next(), truth_iter->Next()) {
3322 assert(truth_iter->Valid());
3323 assert(truth_iter->value() == db_iter->value());
3324 }
3325 // No more key should be left unchecked in truth_db
3326 assert(!truth_iter->Valid());
3327 fprintf(stderr, "...Verified\n");
3328 }
3329
3330 void ErrorExit() {
3331 DeleteDBs();
3332 exit(1);
3333 }
3334
3335 void Run() {
3336 if (!SanityCheck()) {
3337 ErrorExit();
3338 }
3339 Open(&open_options_);
3340 PrintHeader(open_options_);
3341 std::stringstream benchmark_stream(FLAGS_benchmarks);
3342 std::string name;
3343 std::unique_ptr<ExpiredTimeFilter> filter;
3344 while (std::getline(benchmark_stream, name, ',')) {
3345 // Sanitize parameters
3346 num_ = FLAGS_num;
3347 reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
3348 writes_ = (FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes);
3349 deletes_ = (FLAGS_deletes < 0 ? FLAGS_num : FLAGS_deletes);
3350 value_size = FLAGS_value_size;
3351 key_size_ = FLAGS_key_size;
3352 entries_per_batch_ = FLAGS_batch_size;
3353 writes_before_delete_range_ = FLAGS_writes_before_delete_range;
3354 writes_per_range_tombstone_ = FLAGS_writes_per_range_tombstone;
3355 range_tombstone_width_ = FLAGS_range_tombstone_width;
3356 max_num_range_tombstones_ = FLAGS_max_num_range_tombstones;
3357 write_options_ = WriteOptions();
3358 read_random_exp_range_ = FLAGS_read_random_exp_range;
3359 if (FLAGS_sync) {
3360 write_options_.sync = true;
3361 }
3362 write_options_.disableWAL = FLAGS_disable_wal;
3363 write_options_.rate_limiter_priority =
3364 FLAGS_rate_limit_auto_wal_flush ? Env::IO_USER : Env::IO_TOTAL;
3365 read_options_ = ReadOptions(FLAGS_verify_checksum, true);
3366 read_options_.total_order_seek = FLAGS_total_order_seek;
3367 read_options_.prefix_same_as_start = FLAGS_prefix_same_as_start;
3368 read_options_.rate_limiter_priority =
3369 FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
3370 read_options_.tailing = FLAGS_use_tailing_iterator;
3371 read_options_.readahead_size = FLAGS_readahead_size;
3372 read_options_.adaptive_readahead = FLAGS_adaptive_readahead;
3373 read_options_.async_io = FLAGS_async_io;
3374 read_options_.optimize_multiget_for_io = FLAGS_optimize_multiget_for_io;
3375
3376 void (Benchmark::*method)(ThreadState*) = nullptr;
3377 void (Benchmark::*post_process_method)() = nullptr;
3378
3379 bool fresh_db = false;
3380 int num_threads = FLAGS_threads;
3381
3382 int num_repeat = 1;
3383 int num_warmup = 0;
3384 if (!name.empty() && *name.rbegin() == ']') {
3385 auto it = name.find('[');
3386 if (it == std::string::npos) {
3387 fprintf(stderr, "unknown benchmark arguments '%s'\n", name.c_str());
3388 ErrorExit();
3389 }
3390 std::string args = name.substr(it + 1);
3391 args.resize(args.size() - 1);
3392 name.resize(it);
3393
3394 std::string bench_arg;
3395 std::stringstream args_stream(args);
3396 while (std::getline(args_stream, bench_arg, '-')) {
3397 if (bench_arg.empty()) {
3398 continue;
3399 }
3400 if (bench_arg[0] == 'X') {
3401 // Repeat the benchmark n times
3402 std::string num_str = bench_arg.substr(1);
3403 num_repeat = std::stoi(num_str);
3404 } else if (bench_arg[0] == 'W') {
3405 // Warm up the benchmark for n times
3406 std::string num_str = bench_arg.substr(1);
3407 num_warmup = std::stoi(num_str);
3408 }
3409 }
3410 }
3411
3412 // Both fillseqdeterministic and filluniquerandomdeterministic
3413 // fill the levels except the max level with UNIQUE_RANDOM
3414 // and fill the max level with fillseq and filluniquerandom, respectively
3415 if (name == "fillseqdeterministic" ||
3416 name == "filluniquerandomdeterministic") {
3417 if (!FLAGS_disable_auto_compactions) {
3418 fprintf(stderr,
3419 "Please disable_auto_compactions in FillDeterministic "
3420 "benchmark\n");
3421 ErrorExit();
3422 }
3423 if (num_threads > 1) {
3424 fprintf(stderr,
3425 "filldeterministic multithreaded not supported"
3426 ", use 1 thread\n");
3427 num_threads = 1;
3428 }
3429 fresh_db = true;
3430 if (name == "fillseqdeterministic") {
3431 method = &Benchmark::WriteSeqDeterministic;
3432 } else {
3433 method = &Benchmark::WriteUniqueRandomDeterministic;
3434 }
3435 } else if (name == "fillseq") {
3436 fresh_db = true;
3437 method = &Benchmark::WriteSeq;
3438 } else if (name == "fillbatch") {
3439 fresh_db = true;
3440 entries_per_batch_ = 1000;
3441 method = &Benchmark::WriteSeq;
3442 } else if (name == "fillrandom") {
3443 fresh_db = true;
3444 method = &Benchmark::WriteRandom;
3445 } else if (name == "filluniquerandom" ||
3446 name == "fillanddeleteuniquerandom") {
3447 fresh_db = true;
3448 if (num_threads > 1) {
3449 fprintf(stderr,
3450 "filluniquerandom and fillanddeleteuniquerandom "
3451 "multithreaded not supported, use 1 thread");
3452 num_threads = 1;
3453 }
3454 method = &Benchmark::WriteUniqueRandom;
3455 } else if (name == "overwrite") {
3456 method = &Benchmark::WriteRandom;
3457 } else if (name == "fillsync") {
3458 fresh_db = true;
3459 num_ /= 1000;
3460 write_options_.sync = true;
3461 method = &Benchmark::WriteRandom;
3462 } else if (name == "fill100K") {
3463 fresh_db = true;
3464 num_ /= 1000;
3465 value_size = 100 * 1000;
3466 method = &Benchmark::WriteRandom;
3467 } else if (name == "readseq") {
3468 method = &Benchmark::ReadSequential;
3469 } else if (name == "readtorowcache") {
3470 if (!FLAGS_use_existing_keys || !FLAGS_row_cache_size) {
3471 fprintf(stderr,
3472 "Please set use_existing_keys to true and specify a "
3473 "row cache size in readtorowcache benchmark\n");
3474 ErrorExit();
3475 }
3476 method = &Benchmark::ReadToRowCache;
3477 } else if (name == "readtocache") {
3478 method = &Benchmark::ReadSequential;
3479 num_threads = 1;
3480 reads_ = num_;
3481 } else if (name == "readreverse") {
3482 method = &Benchmark::ReadReverse;
3483 } else if (name == "readrandom") {
3484 if (FLAGS_multiread_stride) {
3485 fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
3486 entries_per_batch_);
3487 }
3488 method = &Benchmark::ReadRandom;
3489 } else if (name == "readrandomfast") {
3490 method = &Benchmark::ReadRandomFast;
3491 } else if (name == "multireadrandom") {
3492 fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
3493 entries_per_batch_);
3494 method = &Benchmark::MultiReadRandom;
3495 } else if (name == "multireadwhilewriting") {
3496 fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
3497 entries_per_batch_);
3498 num_threads++;
3499 method = &Benchmark::MultiReadWhileWriting;
3500 } else if (name == "approximatesizerandom") {
3501 fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
3502 entries_per_batch_);
3503 method = &Benchmark::ApproximateSizeRandom;
3504 } else if (name == "mixgraph") {
3505 method = &Benchmark::MixGraph;
3506 } else if (name == "readmissing") {
3507 ++key_size_;
3508 method = &Benchmark::ReadRandom;
3509 } else if (name == "newiterator") {
3510 method = &Benchmark::IteratorCreation;
3511 } else if (name == "newiteratorwhilewriting") {
3512 num_threads++; // Add extra thread for writing
3513 method = &Benchmark::IteratorCreationWhileWriting;
3514 } else if (name == "seekrandom") {
3515 method = &Benchmark::SeekRandom;
3516 } else if (name == "seekrandomwhilewriting") {
3517 num_threads++; // Add extra thread for writing
3518 method = &Benchmark::SeekRandomWhileWriting;
3519 } else if (name == "seekrandomwhilemerging") {
3520 num_threads++; // Add extra thread for merging
3521 method = &Benchmark::SeekRandomWhileMerging;
3522 } else if (name == "readrandomsmall") {
3523 reads_ /= 1000;
3524 method = &Benchmark::ReadRandom;
3525 } else if (name == "deleteseq") {
3526 method = &Benchmark::DeleteSeq;
3527 } else if (name == "deleterandom") {
3528 method = &Benchmark::DeleteRandom;
3529 } else if (name == "readwhilewriting") {
3530 num_threads++; // Add extra thread for writing
3531 method = &Benchmark::ReadWhileWriting;
3532 } else if (name == "readwhilemerging") {
3533 num_threads++; // Add extra thread for writing
3534 method = &Benchmark::ReadWhileMerging;
3535 } else if (name == "readwhilescanning") {
3536 num_threads++; // Add extra thread for scaning
3537 method = &Benchmark::ReadWhileScanning;
3538 } else if (name == "readrandomwriterandom") {
3539 method = &Benchmark::ReadRandomWriteRandom;
3540 } else if (name == "readrandommergerandom") {
3541 if (FLAGS_merge_operator.empty()) {
3542 fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
3543 name.c_str());
3544 ErrorExit();
3545 }
3546 method = &Benchmark::ReadRandomMergeRandom;
3547 } else if (name == "updaterandom") {
3548 method = &Benchmark::UpdateRandom;
3549 } else if (name == "xorupdaterandom") {
3550 method = &Benchmark::XORUpdateRandom;
3551 } else if (name == "appendrandom") {
3552 method = &Benchmark::AppendRandom;
3553 } else if (name == "mergerandom") {
3554 if (FLAGS_merge_operator.empty()) {
3555 fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
3556 name.c_str());
3557 exit(1);
3558 }
3559 method = &Benchmark::MergeRandom;
3560 } else if (name == "randomwithverify") {
3561 method = &Benchmark::RandomWithVerify;
3562 } else if (name == "fillseekseq") {
3563 method = &Benchmark::WriteSeqSeekSeq;
3564 } else if (name == "compact") {
3565 method = &Benchmark::Compact;
3566 } else if (name == "compactall") {
3567 CompactAll();
3568 #ifndef ROCKSDB_LITE
3569 } else if (name == "compact0") {
3570 CompactLevel(0);
3571 } else if (name == "compact1") {
3572 CompactLevel(1);
3573 } else if (name == "waitforcompaction") {
3574 WaitForCompaction();
3575 #endif
3576 } else if (name == "flush") {
3577 Flush();
3578 } else if (name == "crc32c") {
3579 method = &Benchmark::Crc32c;
3580 } else if (name == "xxhash") {
3581 method = &Benchmark::xxHash;
3582 } else if (name == "xxhash64") {
3583 method = &Benchmark::xxHash64;
3584 } else if (name == "xxh3") {
3585 method = &Benchmark::xxh3;
3586 } else if (name == "acquireload") {
3587 method = &Benchmark::AcquireLoad;
3588 } else if (name == "compress") {
3589 method = &Benchmark::Compress;
3590 } else if (name == "uncompress") {
3591 method = &Benchmark::Uncompress;
3592 #ifndef ROCKSDB_LITE
3593 } else if (name == "randomtransaction") {
3594 method = &Benchmark::RandomTransaction;
3595 post_process_method = &Benchmark::RandomTransactionVerify;
3596 #endif // ROCKSDB_LITE
3597 } else if (name == "randomreplacekeys") {
3598 fresh_db = true;
3599 method = &Benchmark::RandomReplaceKeys;
3600 } else if (name == "timeseries") {
3601 timestamp_emulator_.reset(new TimestampEmulator());
3602 if (FLAGS_expire_style == "compaction_filter") {
3603 filter.reset(new ExpiredTimeFilter(timestamp_emulator_));
3604 fprintf(stdout, "Compaction filter is used to remove expired data");
3605 open_options_.compaction_filter = filter.get();
3606 }
3607 fresh_db = true;
3608 method = &Benchmark::TimeSeries;
3609 } else if (name == "block_cache_entry_stats") {
3610 // DB::Properties::kBlockCacheEntryStats
3611 PrintStats("rocksdb.block-cache-entry-stats");
3612 } else if (name == "stats") {
3613 PrintStats("rocksdb.stats");
3614 } else if (name == "resetstats") {
3615 ResetStats();
3616 } else if (name == "verify") {
3617 VerifyDBFromDB(FLAGS_truth_db);
3618 } else if (name == "levelstats") {
3619 PrintStats("rocksdb.levelstats");
3620 } else if (name == "memstats") {
3621 std::vector<std::string> keys{"rocksdb.num-immutable-mem-table",
3622 "rocksdb.cur-size-active-mem-table",
3623 "rocksdb.cur-size-all-mem-tables",
3624 "rocksdb.size-all-mem-tables",
3625 "rocksdb.num-entries-active-mem-table",
3626 "rocksdb.num-entries-imm-mem-tables"};
3627 PrintStats(keys);
3628 } else if (name == "sstables") {
3629 PrintStats("rocksdb.sstables");
3630 } else if (name == "stats_history") {
3631 PrintStatsHistory();
3632 #ifndef ROCKSDB_LITE
3633 } else if (name == "replay") {
3634 if (num_threads > 1) {
3635 fprintf(stderr, "Multi-threaded replay is not yet supported\n");
3636 ErrorExit();
3637 }
3638 if (FLAGS_trace_file == "") {
3639 fprintf(stderr, "Please set --trace_file to be replayed from\n");
3640 ErrorExit();
3641 }
3642 method = &Benchmark::Replay;
3643 #endif // ROCKSDB_LITE
3644 } else if (name == "getmergeoperands") {
3645 method = &Benchmark::GetMergeOperands;
3646 #ifndef ROCKSDB_LITE
3647 } else if (name == "verifychecksum") {
3648 method = &Benchmark::VerifyChecksum;
3649 } else if (name == "verifyfilechecksums") {
3650 method = &Benchmark::VerifyFileChecksums;
3651 #endif // ROCKSDB_LITE
3652 } else if (name == "readrandomoperands") {
3653 read_operands_ = true;
3654 method = &Benchmark::ReadRandom;
3655 #ifndef ROCKSDB_LITE
3656 } else if (name == "backup") {
3657 method = &Benchmark::Backup;
3658 } else if (name == "restore") {
3659 method = &Benchmark::Restore;
3660 #endif
3661 } else if (!name.empty()) { // No error message for empty name
3662 fprintf(stderr, "unknown benchmark '%s'\n", name.c_str());
3663 ErrorExit();
3664 }
3665
3666 if (fresh_db) {
3667 if (FLAGS_use_existing_db) {
3668 fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
3669 name.c_str());
3670 method = nullptr;
3671 } else {
3672 if (db_.db != nullptr) {
3673 db_.DeleteDBs();
3674 DestroyDB(FLAGS_db, open_options_);
3675 }
3676 Options options = open_options_;
3677 for (size_t i = 0; i < multi_dbs_.size(); i++) {
3678 delete multi_dbs_[i].db;
3679 if (!open_options_.wal_dir.empty()) {
3680 options.wal_dir = GetPathForMultiple(open_options_.wal_dir, i);
3681 }
3682 DestroyDB(GetPathForMultiple(FLAGS_db, i), options);
3683 }
3684 multi_dbs_.clear();
3685 }
3686 Open(&open_options_); // use open_options for the last accessed
3687 }
3688
3689 if (method != nullptr) {
3690 fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
3691
3692 #ifndef ROCKSDB_LITE
3693 if (name == "backup") {
3694 std::cout << "Backup path: [" << FLAGS_backup_dir << "]" << std::endl;
3695 } else if (name == "restore") {
3696 std::cout << "Backup path: [" << FLAGS_backup_dir << "]" << std::endl;
3697 std::cout << "Restore path: [" << FLAGS_restore_dir << "]"
3698 << std::endl;
3699 }
3700 // A trace_file option can be provided both for trace and replay
3701 // operations. But db_bench does not support tracing and replaying at
3702 // the same time, for now. So, start tracing only when it is not a
3703 // replay.
3704 if (FLAGS_trace_file != "" && name != "replay") {
3705 std::unique_ptr<TraceWriter> trace_writer;
3706 Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
3707 FLAGS_trace_file, &trace_writer);
3708 if (!s.ok()) {
3709 fprintf(stderr, "Encountered an error starting a trace, %s\n",
3710 s.ToString().c_str());
3711 ErrorExit();
3712 }
3713 s = db_.db->StartTrace(trace_options_, std::move(trace_writer));
3714 if (!s.ok()) {
3715 fprintf(stderr, "Encountered an error starting a trace, %s\n",
3716 s.ToString().c_str());
3717 ErrorExit();
3718 }
3719 fprintf(stdout, "Tracing the workload to: [%s]\n",
3720 FLAGS_trace_file.c_str());
3721 }
3722 // Start block cache tracing.
3723 if (!FLAGS_block_cache_trace_file.empty()) {
3724 // Sanity checks.
3725 if (FLAGS_block_cache_trace_sampling_frequency <= 0) {
3726 fprintf(stderr,
3727 "Block cache trace sampling frequency must be higher than "
3728 "0.\n");
3729 ErrorExit();
3730 }
3731 if (FLAGS_block_cache_trace_max_trace_file_size_in_bytes <= 0) {
3732 fprintf(stderr,
3733 "The maximum file size for block cache tracing must be "
3734 "higher than 0.\n");
3735 ErrorExit();
3736 }
3737 block_cache_trace_options_.max_trace_file_size =
3738 FLAGS_block_cache_trace_max_trace_file_size_in_bytes;
3739 block_cache_trace_options_.sampling_frequency =
3740 FLAGS_block_cache_trace_sampling_frequency;
3741 std::unique_ptr<TraceWriter> block_cache_trace_writer;
3742 Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
3743 FLAGS_block_cache_trace_file,
3744 &block_cache_trace_writer);
3745 if (!s.ok()) {
3746 fprintf(stderr,
3747 "Encountered an error when creating trace writer, %s\n",
3748 s.ToString().c_str());
3749 ErrorExit();
3750 }
3751 s = db_.db->StartBlockCacheTrace(block_cache_trace_options_,
3752 std::move(block_cache_trace_writer));
3753 if (!s.ok()) {
3754 fprintf(
3755 stderr,
3756 "Encountered an error when starting block cache tracing, %s\n",
3757 s.ToString().c_str());
3758 ErrorExit();
3759 }
3760 fprintf(stdout, "Tracing block cache accesses to: [%s]\n",
3761 FLAGS_block_cache_trace_file.c_str());
3762 }
3763 #endif // ROCKSDB_LITE
3764
3765 if (num_warmup > 0) {
3766 printf("Warming up benchmark by running %d times\n", num_warmup);
3767 }
3768
3769 for (int i = 0; i < num_warmup; i++) {
3770 RunBenchmark(num_threads, name, method);
3771 }
3772
3773 if (num_repeat > 1) {
3774 printf("Running benchmark for %d times\n", num_repeat);
3775 }
3776
3777 CombinedStats combined_stats;
3778 for (int i = 0; i < num_repeat; i++) {
3779 Stats stats = RunBenchmark(num_threads, name, method);
3780 combined_stats.AddStats(stats);
3781 if (FLAGS_confidence_interval_only) {
3782 combined_stats.ReportWithConfidenceIntervals(name);
3783 } else {
3784 combined_stats.Report(name);
3785 }
3786 }
3787 if (num_repeat > 1) {
3788 combined_stats.ReportFinal(name);
3789 }
3790 }
3791 if (post_process_method != nullptr) {
3792 (this->*post_process_method)();
3793 }
3794 }
3795
3796 if (secondary_update_thread_) {
3797 secondary_update_stopped_.store(1, std::memory_order_relaxed);
3798 secondary_update_thread_->join();
3799 secondary_update_thread_.reset();
3800 }
3801
3802 #ifndef ROCKSDB_LITE
3803 if (name != "replay" && FLAGS_trace_file != "") {
3804 Status s = db_.db->EndTrace();
3805 if (!s.ok()) {
3806 fprintf(stderr, "Encountered an error ending the trace, %s\n",
3807 s.ToString().c_str());
3808 }
3809 }
3810 if (!FLAGS_block_cache_trace_file.empty()) {
3811 Status s = db_.db->EndBlockCacheTrace();
3812 if (!s.ok()) {
3813 fprintf(stderr,
3814 "Encountered an error ending the block cache tracing, %s\n",
3815 s.ToString().c_str());
3816 }
3817 }
3818 #endif // ROCKSDB_LITE
3819
3820 if (FLAGS_statistics) {
3821 fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
3822 }
3823 if (FLAGS_simcache_size >= 0) {
3824 fprintf(
3825 stdout, "SIMULATOR CACHE STATISTICS:\n%s\n",
3826 static_cast_with_check<SimCache>(cache_.get())->ToString().c_str());
3827 }
3828
3829 #ifndef ROCKSDB_LITE
3830 if (FLAGS_use_secondary_db) {
3831 fprintf(stdout, "Secondary instance updated %" PRIu64 " times.\n",
3832 secondary_db_updates_);
3833 }
3834 #endif // ROCKSDB_LITE
3835 }
3836
3837 private:
3838 std::shared_ptr<TimestampEmulator> timestamp_emulator_;
3839 std::unique_ptr<port::Thread> secondary_update_thread_;
3840 std::atomic<int> secondary_update_stopped_{0};
3841 #ifndef ROCKSDB_LITE
3842 uint64_t secondary_db_updates_ = 0;
3843 #endif // ROCKSDB_LITE
3844 struct ThreadArg {
3845 Benchmark* bm;
3846 SharedState* shared;
3847 ThreadState* thread;
3848 void (Benchmark::*method)(ThreadState*);
3849 };
3850
3851 static void ThreadBody(void* v) {
3852 ThreadArg* arg = reinterpret_cast<ThreadArg*>(v);
3853 SharedState* shared = arg->shared;
3854 ThreadState* thread = arg->thread;
3855 {
3856 MutexLock l(&shared->mu);
3857 shared->num_initialized++;
3858 if (shared->num_initialized >= shared->total) {
3859 shared->cv.SignalAll();
3860 }
3861 while (!shared->start) {
3862 shared->cv.Wait();
3863 }
3864 }
3865
3866 SetPerfLevel(static_cast<PerfLevel>(shared->perf_level));
3867 perf_context.EnablePerLevelPerfContext();
3868 thread->stats.Start(thread->tid);
3869 (arg->bm->*(arg->method))(thread);
3870 if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
3871 thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
3872 get_perf_context()->ToString());
3873 }
3874 thread->stats.Stop();
3875
3876 {
3877 MutexLock l(&shared->mu);
3878 shared->num_done++;
3879 if (shared->num_done >= shared->total) {
3880 shared->cv.SignalAll();
3881 }
3882 }
3883 }
3884
3885 Stats RunBenchmark(int n, Slice name,
3886 void (Benchmark::*method)(ThreadState*)) {
3887 SharedState shared;
3888 shared.total = n;
3889 shared.num_initialized = 0;
3890 shared.num_done = 0;
3891 shared.start = false;
3892 if (FLAGS_benchmark_write_rate_limit > 0) {
3893 shared.write_rate_limiter.reset(
3894 NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
3895 }
3896 if (FLAGS_benchmark_read_rate_limit > 0) {
3897 shared.read_rate_limiter.reset(NewGenericRateLimiter(
3898 FLAGS_benchmark_read_rate_limit, 100000 /* refill_period_us */,
3899 10 /* fairness */, RateLimiter::Mode::kReadsOnly));
3900 }
3901
3902 std::unique_ptr<ReporterAgent> reporter_agent;
3903 if (FLAGS_report_interval_seconds > 0) {
3904 reporter_agent.reset(new ReporterAgent(FLAGS_env, FLAGS_report_file,
3905 FLAGS_report_interval_seconds));
3906 }
3907
3908 ThreadArg* arg = new ThreadArg[n];
3909
3910 for (int i = 0; i < n; i++) {
3911 #ifdef NUMA
3912 if (FLAGS_enable_numa) {
3913 // Performs a local allocation of memory to threads in numa node.
3914 int n_nodes = numa_num_task_nodes(); // Number of nodes in NUMA.
3915 numa_exit_on_error = 1;
3916 int numa_node = i % n_nodes;
3917 bitmask* nodes = numa_allocate_nodemask();
3918 numa_bitmask_clearall(nodes);
3919 numa_bitmask_setbit(nodes, numa_node);
3920 // numa_bind() call binds the process to the node and these
3921 // properties are passed on to the thread that is created in
3922 // StartThread method called later in the loop.
3923 numa_bind(nodes);
3924 numa_set_strict(1);
3925 numa_free_nodemask(nodes);
3926 }
3927 #endif
3928 arg[i].bm = this;
3929 arg[i].method = method;
3930 arg[i].shared = &shared;
3931 total_thread_count_++;
3932 arg[i].thread = new ThreadState(i, total_thread_count_);
3933 arg[i].thread->stats.SetReporterAgent(reporter_agent.get());
3934 arg[i].thread->shared = &shared;
3935 FLAGS_env->StartThread(ThreadBody, &arg[i]);
3936 }
3937
3938 shared.mu.Lock();
3939 while (shared.num_initialized < n) {
3940 shared.cv.Wait();
3941 }
3942
3943 shared.start = true;
3944 shared.cv.SignalAll();
3945 while (shared.num_done < n) {
3946 shared.cv.Wait();
3947 }
3948 shared.mu.Unlock();
3949
3950 // Stats for some threads can be excluded.
3951 Stats merge_stats;
3952 for (int i = 0; i < n; i++) {
3953 merge_stats.Merge(arg[i].thread->stats);
3954 }
3955 merge_stats.Report(name);
3956
3957 for (int i = 0; i < n; i++) {
3958 delete arg[i].thread;
3959 }
3960 delete[] arg;
3961
3962 return merge_stats;
3963 }
3964
3965 template <OperationType kOpType, typename FnType, typename... Args>
3966 static inline void ChecksumBenchmark(FnType fn, ThreadState* thread,
3967 Args... args) {
3968 const int size = FLAGS_block_size; // use --block_size option for db_bench
3969 std::string labels = "(" + std::to_string(FLAGS_block_size) + " per op)";
3970 const char* label = labels.c_str();
3971
3972 std::string data(size, 'x');
3973 uint64_t bytes = 0;
3974 uint32_t val = 0;
3975 while (bytes < 5000U * uint64_t{1048576}) { // ~5GB
3976 val += static_cast<uint32_t>(fn(data.data(), size, args...));
3977 thread->stats.FinishedOps(nullptr, nullptr, 1, kOpType);
3978 bytes += size;
3979 }
3980 // Print so result is not dead
3981 fprintf(stderr, "... val=0x%x\r", static_cast<unsigned int>(val));
3982
3983 thread->stats.AddBytes(bytes);
3984 thread->stats.AddMessage(label);
3985 }
3986
3987 void Crc32c(ThreadState* thread) {
3988 ChecksumBenchmark<kCrc>(crc32c::Value, thread);
3989 }
3990
3991 void xxHash(ThreadState* thread) {
3992 ChecksumBenchmark<kHash>(XXH32, thread, /*seed*/ 0);
3993 }
3994
3995 void xxHash64(ThreadState* thread) {
3996 ChecksumBenchmark<kHash>(XXH64, thread, /*seed*/ 0);
3997 }
3998
3999 void xxh3(ThreadState* thread) {
4000 ChecksumBenchmark<kHash>(XXH3_64bits, thread);
4001 }
4002
4003 void AcquireLoad(ThreadState* thread) {
4004 int dummy;
4005 std::atomic<void*> ap(&dummy);
4006 int count = 0;
4007 void* ptr = nullptr;
4008 thread->stats.AddMessage("(each op is 1000 loads)");
4009 while (count < 100000) {
4010 for (int i = 0; i < 1000; i++) {
4011 ptr = ap.load(std::memory_order_acquire);
4012 }
4013 count++;
4014 thread->stats.FinishedOps(nullptr, nullptr, 1, kOthers);
4015 }
4016 if (ptr == nullptr) exit(1); // Disable unused variable warning.
4017 }
4018
4019 void Compress(ThreadState* thread) {
4020 RandomGenerator gen;
4021 Slice input = gen.Generate(FLAGS_block_size);
4022 int64_t bytes = 0;
4023 int64_t produced = 0;
4024 bool ok = true;
4025 std::string compressed;
4026 CompressionOptions opts;
4027 CompressionContext context(FLAGS_compression_type_e);
4028 CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
4029 FLAGS_compression_type_e,
4030 FLAGS_sample_for_compression);
4031 // Compress 1G
4032 while (ok && bytes < int64_t(1) << 30) {
4033 compressed.clear();
4034 ok = CompressSlice(info, input, &compressed);
4035 produced += compressed.size();
4036 bytes += input.size();
4037 thread->stats.FinishedOps(nullptr, nullptr, 1, kCompress);
4038 }
4039
4040 if (!ok) {
4041 thread->stats.AddMessage("(compression failure)");
4042 } else {
4043 char buf[340];
4044 snprintf(buf, sizeof(buf), "(output: %.1f%%)",
4045 (produced * 100.0) / bytes);
4046 thread->stats.AddMessage(buf);
4047 thread->stats.AddBytes(bytes);
4048 }
4049 }
4050
4051 void Uncompress(ThreadState* thread) {
4052 RandomGenerator gen;
4053 Slice input = gen.Generate(FLAGS_block_size);
4054 std::string compressed;
4055
4056 CompressionContext compression_ctx(FLAGS_compression_type_e);
4057 CompressionOptions compression_opts;
4058 CompressionInfo compression_info(
4059 compression_opts, compression_ctx, CompressionDict::GetEmptyDict(),
4060 FLAGS_compression_type_e, FLAGS_sample_for_compression);
4061 UncompressionContext uncompression_ctx(FLAGS_compression_type_e);
4062 UncompressionInfo uncompression_info(uncompression_ctx,
4063 UncompressionDict::GetEmptyDict(),
4064 FLAGS_compression_type_e);
4065
4066 bool ok = CompressSlice(compression_info, input, &compressed);
4067 int64_t bytes = 0;
4068 size_t uncompressed_size = 0;
4069 while (ok && bytes < 1024 * 1048576) {
4070 constexpr uint32_t compress_format_version = 2;
4071
4072 CacheAllocationPtr uncompressed = UncompressData(
4073 uncompression_info, compressed.data(), compressed.size(),
4074 &uncompressed_size, compress_format_version);
4075
4076 ok = uncompressed.get() != nullptr;
4077 bytes += input.size();
4078 thread->stats.FinishedOps(nullptr, nullptr, 1, kUncompress);
4079 }
4080
4081 if (!ok) {
4082 thread->stats.AddMessage("(compression failure)");
4083 } else {
4084 thread->stats.AddBytes(bytes);
4085 }
4086 }
4087
4088 // Returns true if the options is initialized from the specified
4089 // options file.
4090 bool InitializeOptionsFromFile(Options* opts) {
4091 #ifndef ROCKSDB_LITE
4092 printf("Initializing RocksDB Options from the specified file\n");
4093 DBOptions db_opts;
4094 std::vector<ColumnFamilyDescriptor> cf_descs;
4095 if (FLAGS_options_file != "") {
4096 auto s = LoadOptionsFromFile(FLAGS_options_file, FLAGS_env, &db_opts,
4097 &cf_descs);
4098 db_opts.env = FLAGS_env;
4099 if (s.ok()) {
4100 *opts = Options(db_opts, cf_descs[0].options);
4101 return true;
4102 }
4103 fprintf(stderr, "Unable to load options file %s --- %s\n",
4104 FLAGS_options_file.c_str(), s.ToString().c_str());
4105 exit(1);
4106 }
4107 #else
4108 (void)opts;
4109 #endif
4110 return false;
4111 }
4112
4113 void InitializeOptionsFromFlags(Options* opts) {
4114 printf("Initializing RocksDB Options from command-line flags\n");
4115 Options& options = *opts;
4116 ConfigOptions config_options(options);
4117 config_options.ignore_unsupported_options = false;
4118
4119 assert(db_.db == nullptr);
4120
4121 options.env = FLAGS_env;
4122 options.wal_dir = FLAGS_wal_dir;
4123 options.dump_malloc_stats = FLAGS_dump_malloc_stats;
4124 options.stats_dump_period_sec =
4125 static_cast<unsigned int>(FLAGS_stats_dump_period_sec);
4126 options.stats_persist_period_sec =
4127 static_cast<unsigned int>(FLAGS_stats_persist_period_sec);
4128 options.persist_stats_to_disk = FLAGS_persist_stats_to_disk;
4129 options.stats_history_buffer_size =
4130 static_cast<size_t>(FLAGS_stats_history_buffer_size);
4131 options.avoid_flush_during_recovery = FLAGS_avoid_flush_during_recovery;
4132
4133 options.compression_opts.level = FLAGS_compression_level;
4134 options.compression_opts.max_dict_bytes = FLAGS_compression_max_dict_bytes;
4135 options.compression_opts.zstd_max_train_bytes =
4136 FLAGS_compression_zstd_max_train_bytes;
4137 options.compression_opts.parallel_threads =
4138 FLAGS_compression_parallel_threads;
4139 options.compression_opts.max_dict_buffer_bytes =
4140 FLAGS_compression_max_dict_buffer_bytes;
4141 options.compression_opts.use_zstd_dict_trainer =
4142 FLAGS_compression_use_zstd_dict_trainer;
4143
4144 options.max_open_files = FLAGS_open_files;
4145 if (FLAGS_cost_write_buffer_to_cache || FLAGS_db_write_buffer_size != 0) {
4146 options.write_buffer_manager.reset(
4147 new WriteBufferManager(FLAGS_db_write_buffer_size, cache_));
4148 }
4149 options.arena_block_size = FLAGS_arena_block_size;
4150 options.write_buffer_size = FLAGS_write_buffer_size;
4151 options.max_write_buffer_number = FLAGS_max_write_buffer_number;
4152 options.min_write_buffer_number_to_merge =
4153 FLAGS_min_write_buffer_number_to_merge;
4154 options.max_write_buffer_number_to_maintain =
4155 FLAGS_max_write_buffer_number_to_maintain;
4156 options.max_write_buffer_size_to_maintain =
4157 FLAGS_max_write_buffer_size_to_maintain;
4158 options.max_background_jobs = FLAGS_max_background_jobs;
4159 options.max_background_compactions = FLAGS_max_background_compactions;
4160 options.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
4161 options.max_background_flushes = FLAGS_max_background_flushes;
4162 options.compaction_style = FLAGS_compaction_style_e;
4163 options.compaction_pri = FLAGS_compaction_pri_e;
4164 options.allow_mmap_reads = FLAGS_mmap_read;
4165 options.allow_mmap_writes = FLAGS_mmap_write;
4166 options.use_direct_reads = FLAGS_use_direct_reads;
4167 options.use_direct_io_for_flush_and_compaction =
4168 FLAGS_use_direct_io_for_flush_and_compaction;
4169 options.manual_wal_flush = FLAGS_manual_wal_flush;
4170 options.wal_compression = FLAGS_wal_compression_e;
4171 #ifndef ROCKSDB_LITE
4172 options.ttl = FLAGS_fifo_compaction_ttl;
4173 options.compaction_options_fifo = CompactionOptionsFIFO(
4174 FLAGS_fifo_compaction_max_table_files_size_mb * 1024 * 1024,
4175 FLAGS_fifo_compaction_allow_compaction);
4176 options.compaction_options_fifo.age_for_warm = FLAGS_fifo_age_for_warm;
4177 #endif // ROCKSDB_LITE
4178 options.prefix_extractor = prefix_extractor_;
4179 if (FLAGS_use_uint64_comparator) {
4180 options.comparator = test::Uint64Comparator();
4181 if (FLAGS_key_size != 8) {
4182 fprintf(stderr, "Using Uint64 comparator but key size is not 8.\n");
4183 exit(1);
4184 }
4185 }
4186 if (FLAGS_use_stderr_info_logger) {
4187 options.info_log.reset(new StderrLogger());
4188 }
4189 options.memtable_huge_page_size = FLAGS_memtable_use_huge_page ? 2048 : 0;
4190 options.memtable_prefix_bloom_size_ratio = FLAGS_memtable_bloom_size_ratio;
4191 options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering;
4192 if (FLAGS_memtable_insert_with_hint_prefix_size > 0) {
4193 options.memtable_insert_with_hint_prefix_extractor.reset(
4194 NewCappedPrefixTransform(
4195 FLAGS_memtable_insert_with_hint_prefix_size));
4196 }
4197 options.bloom_locality = FLAGS_bloom_locality;
4198 options.max_file_opening_threads = FLAGS_file_opening_threads;
4199 options.compaction_readahead_size = FLAGS_compaction_readahead_size;
4200 options.log_readahead_size = FLAGS_log_readahead_size;
4201 options.random_access_max_buffer_size = FLAGS_random_access_max_buffer_size;
4202 options.writable_file_max_buffer_size = FLAGS_writable_file_max_buffer_size;
4203 options.use_fsync = FLAGS_use_fsync;
4204 options.num_levels = FLAGS_num_levels;
4205 options.target_file_size_base = FLAGS_target_file_size_base;
4206 options.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
4207 options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
4208 options.level_compaction_dynamic_level_bytes =
4209 FLAGS_level_compaction_dynamic_level_bytes;
4210 options.max_bytes_for_level_multiplier =
4211 FLAGS_max_bytes_for_level_multiplier;
4212 Status s =
4213 CreateMemTableRepFactory(config_options, &options.memtable_factory);
4214 if (!s.ok()) {
4215 fprintf(stderr, "Could not create memtable factory: %s\n",
4216 s.ToString().c_str());
4217 exit(1);
4218 } else if ((FLAGS_prefix_size == 0) &&
4219 (options.memtable_factory->IsInstanceOf("prefix_hash") ||
4220 options.memtable_factory->IsInstanceOf("hash_linkedlist"))) {
4221 fprintf(stderr,
4222 "prefix_size should be non-zero if PrefixHash or "
4223 "HashLinkedList memtablerep is used\n");
4224 exit(1);
4225 }
4226 if (FLAGS_use_plain_table) {
4227 #ifndef ROCKSDB_LITE
4228 if (!options.memtable_factory->IsInstanceOf("prefix_hash") &&
4229 !options.memtable_factory->IsInstanceOf("hash_linkedlist")) {
4230 fprintf(stderr, "Warning: plain table is used with %s\n",
4231 options.memtable_factory->Name());
4232 }
4233
4234 int bloom_bits_per_key = FLAGS_bloom_bits;
4235 if (bloom_bits_per_key < 0) {
4236 bloom_bits_per_key = PlainTableOptions().bloom_bits_per_key;
4237 }
4238
4239 PlainTableOptions plain_table_options;
4240 plain_table_options.user_key_len = FLAGS_key_size;
4241 plain_table_options.bloom_bits_per_key = bloom_bits_per_key;
4242 plain_table_options.hash_table_ratio = 0.75;
4243 options.table_factory = std::shared_ptr<TableFactory>(
4244 NewPlainTableFactory(plain_table_options));
4245 #else
4246 fprintf(stderr, "Plain table is not supported in lite mode\n");
4247 exit(1);
4248 #endif // ROCKSDB_LITE
4249 } else if (FLAGS_use_cuckoo_table) {
4250 #ifndef ROCKSDB_LITE
4251 if (FLAGS_cuckoo_hash_ratio > 1 || FLAGS_cuckoo_hash_ratio < 0) {
4252 fprintf(stderr, "Invalid cuckoo_hash_ratio\n");
4253 exit(1);
4254 }
4255
4256 if (!FLAGS_mmap_read) {
4257 fprintf(stderr, "cuckoo table format requires mmap read to operate\n");
4258 exit(1);
4259 }
4260
4261 ROCKSDB_NAMESPACE::CuckooTableOptions table_options;
4262 table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio;
4263 table_options.identity_as_first_hash = FLAGS_identity_as_first_hash;
4264 options.table_factory =
4265 std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
4266 #else
4267 fprintf(stderr, "Cuckoo table is not supported in lite mode\n");
4268 exit(1);
4269 #endif // ROCKSDB_LITE
4270 } else {
4271 BlockBasedTableOptions block_based_options;
4272 block_based_options.checksum =
4273 static_cast<ChecksumType>(FLAGS_checksum_type);
4274 if (FLAGS_use_hash_search) {
4275 if (FLAGS_prefix_size == 0) {
4276 fprintf(stderr,
4277 "prefix_size not assigned when enable use_hash_search \n");
4278 exit(1);
4279 }
4280 block_based_options.index_type = BlockBasedTableOptions::kHashSearch;
4281 } else {
4282 block_based_options.index_type = BlockBasedTableOptions::kBinarySearch;
4283 }
4284 if (FLAGS_partition_index_and_filters || FLAGS_partition_index) {
4285 if (FLAGS_index_with_first_key) {
4286 fprintf(stderr,
4287 "--index_with_first_key is not compatible with"
4288 " partition index.");
4289 }
4290 if (FLAGS_use_hash_search) {
4291 fprintf(stderr,
4292 "use_hash_search is incompatible with "
4293 "partition index and is ignored");
4294 }
4295 block_based_options.index_type =
4296 BlockBasedTableOptions::kTwoLevelIndexSearch;
4297 block_based_options.metadata_block_size = FLAGS_metadata_block_size;
4298 if (FLAGS_partition_index_and_filters) {
4299 block_based_options.partition_filters = true;
4300 }
4301 } else if (FLAGS_index_with_first_key) {
4302 block_based_options.index_type =
4303 BlockBasedTableOptions::kBinarySearchWithFirstKey;
4304 }
4305 BlockBasedTableOptions::IndexShorteningMode index_shortening =
4306 block_based_options.index_shortening;
4307 switch (FLAGS_index_shortening_mode) {
4308 case 0:
4309 index_shortening =
4310 BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
4311 break;
4312 case 1:
4313 index_shortening =
4314 BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators;
4315 break;
4316 case 2:
4317 index_shortening = BlockBasedTableOptions::IndexShorteningMode::
4318 kShortenSeparatorsAndSuccessor;
4319 break;
4320 default:
4321 fprintf(stderr, "Unknown key shortening mode\n");
4322 }
4323 block_based_options.optimize_filters_for_memory =
4324 FLAGS_optimize_filters_for_memory;
4325 block_based_options.index_shortening = index_shortening;
4326 if (cache_ == nullptr) {
4327 block_based_options.no_block_cache = true;
4328 }
4329 block_based_options.cache_index_and_filter_blocks =
4330 FLAGS_cache_index_and_filter_blocks;
4331 block_based_options.pin_l0_filter_and_index_blocks_in_cache =
4332 FLAGS_pin_l0_filter_and_index_blocks_in_cache;
4333 block_based_options.pin_top_level_index_and_filter =
4334 FLAGS_pin_top_level_index_and_filter;
4335 if (FLAGS_cache_high_pri_pool_ratio > 1e-6) { // > 0.0 + eps
4336 block_based_options.cache_index_and_filter_blocks_with_high_priority =
4337 true;
4338 }
4339 if (FLAGS_cache_high_pri_pool_ratio + FLAGS_cache_low_pri_pool_ratio >
4340 1.0) {
4341 fprintf(stderr,
4342 "Sum of high_pri_pool_ratio and low_pri_pool_ratio "
4343 "cannot exceed 1.0.\n");
4344 }
4345 block_based_options.block_cache = cache_;
4346 block_based_options.cache_usage_options.options_overrides.insert(
4347 {CacheEntryRole::kCompressionDictionaryBuildingBuffer,
4348 {/*.charged = */ FLAGS_charge_compression_dictionary_building_buffer
4349 ? CacheEntryRoleOptions::Decision::kEnabled
4350 : CacheEntryRoleOptions::Decision::kDisabled}});
4351 block_based_options.cache_usage_options.options_overrides.insert(
4352 {CacheEntryRole::kFilterConstruction,
4353 {/*.charged = */ FLAGS_charge_filter_construction
4354 ? CacheEntryRoleOptions::Decision::kEnabled
4355 : CacheEntryRoleOptions::Decision::kDisabled}});
4356 block_based_options.cache_usage_options.options_overrides.insert(
4357 {CacheEntryRole::kBlockBasedTableReader,
4358 {/*.charged = */ FLAGS_charge_table_reader
4359 ? CacheEntryRoleOptions::Decision::kEnabled
4360 : CacheEntryRoleOptions::Decision::kDisabled}});
4361 block_based_options.cache_usage_options.options_overrides.insert(
4362 {CacheEntryRole::kFileMetadata,
4363 {/*.charged = */ FLAGS_charge_file_metadata
4364 ? CacheEntryRoleOptions::Decision::kEnabled
4365 : CacheEntryRoleOptions::Decision::kDisabled}});
4366 block_based_options.cache_usage_options.options_overrides.insert(
4367 {CacheEntryRole::kBlobCache,
4368 {/*.charged = */ FLAGS_charge_blob_cache
4369 ? CacheEntryRoleOptions::Decision::kEnabled
4370 : CacheEntryRoleOptions::Decision::kDisabled}});
4371 block_based_options.block_cache_compressed = compressed_cache_;
4372 block_based_options.block_size = FLAGS_block_size;
4373 block_based_options.block_restart_interval = FLAGS_block_restart_interval;
4374 block_based_options.index_block_restart_interval =
4375 FLAGS_index_block_restart_interval;
4376 block_based_options.format_version =
4377 static_cast<uint32_t>(FLAGS_format_version);
4378 block_based_options.read_amp_bytes_per_bit = FLAGS_read_amp_bytes_per_bit;
4379 block_based_options.enable_index_compression =
4380 FLAGS_enable_index_compression;
4381 block_based_options.block_align = FLAGS_block_align;
4382 block_based_options.whole_key_filtering = FLAGS_whole_key_filtering;
4383 block_based_options.max_auto_readahead_size =
4384 FLAGS_max_auto_readahead_size;
4385 block_based_options.initial_auto_readahead_size =
4386 FLAGS_initial_auto_readahead_size;
4387 block_based_options.num_file_reads_for_auto_readahead =
4388 FLAGS_num_file_reads_for_auto_readahead;
4389 BlockBasedTableOptions::PrepopulateBlockCache prepopulate_block_cache =
4390 block_based_options.prepopulate_block_cache;
4391 switch (FLAGS_prepopulate_block_cache) {
4392 case 0:
4393 prepopulate_block_cache =
4394 BlockBasedTableOptions::PrepopulateBlockCache::kDisable;
4395 break;
4396 case 1:
4397 prepopulate_block_cache =
4398 BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
4399 break;
4400 default:
4401 fprintf(stderr, "Unknown prepopulate block cache mode\n");
4402 }
4403 block_based_options.prepopulate_block_cache = prepopulate_block_cache;
4404 if (FLAGS_use_data_block_hash_index) {
4405 block_based_options.data_block_index_type =
4406 ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinaryAndHash;
4407 } else {
4408 block_based_options.data_block_index_type =
4409 ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinarySearch;
4410 }
4411 block_based_options.data_block_hash_table_util_ratio =
4412 FLAGS_data_block_hash_table_util_ratio;
4413 if (FLAGS_read_cache_path != "") {
4414 #ifndef ROCKSDB_LITE
4415 Status rc_status;
4416
4417 // Read cache need to be provided with a the Logger, we will put all
4418 // reac cache logs in the read cache path in a file named rc_LOG
4419 rc_status = FLAGS_env->CreateDirIfMissing(FLAGS_read_cache_path);
4420 std::shared_ptr<Logger> read_cache_logger;
4421 if (rc_status.ok()) {
4422 rc_status = FLAGS_env->NewLogger(FLAGS_read_cache_path + "/rc_LOG",
4423 &read_cache_logger);
4424 }
4425
4426 if (rc_status.ok()) {
4427 PersistentCacheConfig rc_cfg(FLAGS_env, FLAGS_read_cache_path,
4428 FLAGS_read_cache_size,
4429 read_cache_logger);
4430
4431 rc_cfg.enable_direct_reads = FLAGS_read_cache_direct_read;
4432 rc_cfg.enable_direct_writes = FLAGS_read_cache_direct_write;
4433 rc_cfg.writer_qdepth = 4;
4434 rc_cfg.writer_dispatch_size = 4 * 1024;
4435
4436 auto pcache = std::make_shared<BlockCacheTier>(rc_cfg);
4437 block_based_options.persistent_cache = pcache;
4438 rc_status = pcache->Open();
4439 }
4440
4441 if (!rc_status.ok()) {
4442 fprintf(stderr, "Error initializing read cache, %s\n",
4443 rc_status.ToString().c_str());
4444 exit(1);
4445 }
4446 #else
4447 fprintf(stderr, "Read cache is not supported in LITE\n");
4448 exit(1);
4449
4450 #endif
4451 }
4452
4453 if (FLAGS_use_blob_cache) {
4454 if (FLAGS_use_shared_block_and_blob_cache) {
4455 options.blob_cache = cache_;
4456 } else {
4457 if (FLAGS_blob_cache_size > 0) {
4458 LRUCacheOptions co;
4459 co.capacity = FLAGS_blob_cache_size;
4460 co.num_shard_bits = FLAGS_blob_cache_numshardbits;
4461 co.memory_allocator = GetCacheAllocator();
4462
4463 options.blob_cache = NewLRUCache(co);
4464 } else {
4465 fprintf(
4466 stderr,
4467 "Unable to create a standalone blob cache if blob_cache_size "
4468 "<= 0.\n");
4469 exit(1);
4470 }
4471 }
4472 switch (FLAGS_prepopulate_blob_cache) {
4473 case 0:
4474 options.prepopulate_blob_cache = PrepopulateBlobCache::kDisable;
4475 break;
4476 case 1:
4477 options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
4478 break;
4479 default:
4480 fprintf(stderr, "Unknown prepopulate blob cache mode\n");
4481 exit(1);
4482 }
4483
4484 fprintf(stdout,
4485 "Integrated BlobDB: blob cache enabled"
4486 ", block and blob caches shared: %d",
4487 FLAGS_use_shared_block_and_blob_cache);
4488 if (!FLAGS_use_shared_block_and_blob_cache) {
4489 fprintf(stdout,
4490 ", blob cache size %" PRIu64
4491 ", blob cache num shard bits: %d",
4492 FLAGS_blob_cache_size, FLAGS_blob_cache_numshardbits);
4493 }
4494 fprintf(stdout, ", blob cache prepopulated: %d\n",
4495 FLAGS_prepopulate_blob_cache);
4496 } else {
4497 fprintf(stdout, "Integrated BlobDB: blob cache disabled\n");
4498 }
4499
4500 options.table_factory.reset(
4501 NewBlockBasedTableFactory(block_based_options));
4502 }
4503 if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) {
4504 if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() !=
4505 static_cast<unsigned int>(FLAGS_num_levels)) {
4506 fprintf(stderr, "Insufficient number of fanouts specified %d\n",
4507 static_cast<int>(
4508 FLAGS_max_bytes_for_level_multiplier_additional_v.size()));
4509 exit(1);
4510 }
4511 options.max_bytes_for_level_multiplier_additional =
4512 FLAGS_max_bytes_for_level_multiplier_additional_v;
4513 }
4514 options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
4515 options.level0_file_num_compaction_trigger =
4516 FLAGS_level0_file_num_compaction_trigger;
4517 options.level0_slowdown_writes_trigger =
4518 FLAGS_level0_slowdown_writes_trigger;
4519 options.compression = FLAGS_compression_type_e;
4520 if (FLAGS_simulate_hybrid_fs_file != "") {
4521 options.bottommost_temperature = Temperature::kWarm;
4522 }
4523 options.preclude_last_level_data_seconds =
4524 FLAGS_preclude_last_level_data_seconds;
4525 options.preserve_internal_time_seconds =
4526 FLAGS_preserve_internal_time_seconds;
4527 options.sample_for_compression = FLAGS_sample_for_compression;
4528 options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
4529 options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
4530 options.max_total_wal_size = FLAGS_max_total_wal_size;
4531
4532 if (FLAGS_min_level_to_compress >= 0) {
4533 assert(FLAGS_min_level_to_compress <= FLAGS_num_levels);
4534 options.compression_per_level.resize(FLAGS_num_levels);
4535 for (int i = 0; i < FLAGS_min_level_to_compress; i++) {
4536 options.compression_per_level[i] = kNoCompression;
4537 }
4538 for (int i = FLAGS_min_level_to_compress; i < FLAGS_num_levels; i++) {
4539 options.compression_per_level[i] = FLAGS_compression_type_e;
4540 }
4541 }
4542 options.soft_pending_compaction_bytes_limit =
4543 FLAGS_soft_pending_compaction_bytes_limit;
4544 options.hard_pending_compaction_bytes_limit =
4545 FLAGS_hard_pending_compaction_bytes_limit;
4546 options.delayed_write_rate = FLAGS_delayed_write_rate;
4547 options.allow_concurrent_memtable_write =
4548 FLAGS_allow_concurrent_memtable_write;
4549 options.experimental_mempurge_threshold =
4550 FLAGS_experimental_mempurge_threshold;
4551 options.inplace_update_support = FLAGS_inplace_update_support;
4552 options.inplace_update_num_locks = FLAGS_inplace_update_num_locks;
4553 options.enable_write_thread_adaptive_yield =
4554 FLAGS_enable_write_thread_adaptive_yield;
4555 options.enable_pipelined_write = FLAGS_enable_pipelined_write;
4556 options.unordered_write = FLAGS_unordered_write;
4557 options.write_thread_max_yield_usec = FLAGS_write_thread_max_yield_usec;
4558 options.write_thread_slow_yield_usec = FLAGS_write_thread_slow_yield_usec;
4559 options.table_cache_numshardbits = FLAGS_table_cache_numshardbits;
4560 options.max_compaction_bytes = FLAGS_max_compaction_bytes;
4561 options.disable_auto_compactions = FLAGS_disable_auto_compactions;
4562 options.optimize_filters_for_hits = FLAGS_optimize_filters_for_hits;
4563 options.paranoid_checks = FLAGS_paranoid_checks;
4564 options.force_consistency_checks = FLAGS_force_consistency_checks;
4565 options.check_flush_compaction_key_order =
4566 FLAGS_check_flush_compaction_key_order;
4567 options.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds;
4568 options.ttl = FLAGS_ttl_seconds;
4569 // fill storage options
4570 options.advise_random_on_open = FLAGS_advise_random_on_open;
4571 options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e;
4572 options.use_adaptive_mutex = FLAGS_use_adaptive_mutex;
4573 options.bytes_per_sync = FLAGS_bytes_per_sync;
4574 options.wal_bytes_per_sync = FLAGS_wal_bytes_per_sync;
4575
4576 // merge operator options
4577 if (!FLAGS_merge_operator.empty()) {
4578 s = MergeOperator::CreateFromString(config_options, FLAGS_merge_operator,
4579 &options.merge_operator);
4580 if (!s.ok()) {
4581 fprintf(stderr, "invalid merge operator[%s]: %s\n",
4582 FLAGS_merge_operator.c_str(), s.ToString().c_str());
4583 exit(1);
4584 }
4585 }
4586 options.max_successive_merges = FLAGS_max_successive_merges;
4587 options.report_bg_io_stats = FLAGS_report_bg_io_stats;
4588
4589 // set universal style compaction configurations, if applicable
4590 if (FLAGS_universal_size_ratio != 0) {
4591 options.compaction_options_universal.size_ratio =
4592 FLAGS_universal_size_ratio;
4593 }
4594 if (FLAGS_universal_min_merge_width != 0) {
4595 options.compaction_options_universal.min_merge_width =
4596 FLAGS_universal_min_merge_width;
4597 }
4598 if (FLAGS_universal_max_merge_width != 0) {
4599 options.compaction_options_universal.max_merge_width =
4600 FLAGS_universal_max_merge_width;
4601 }
4602 if (FLAGS_universal_max_size_amplification_percent != 0) {
4603 options.compaction_options_universal.max_size_amplification_percent =
4604 FLAGS_universal_max_size_amplification_percent;
4605 }
4606 if (FLAGS_universal_compression_size_percent != -1) {
4607 options.compaction_options_universal.compression_size_percent =
4608 FLAGS_universal_compression_size_percent;
4609 }
4610 options.compaction_options_universal.allow_trivial_move =
4611 FLAGS_universal_allow_trivial_move;
4612 options.compaction_options_universal.incremental =
4613 FLAGS_universal_incremental;
4614 if (FLAGS_thread_status_per_interval > 0) {
4615 options.enable_thread_tracking = true;
4616 }
4617
4618 if (FLAGS_user_timestamp_size > 0) {
4619 if (FLAGS_user_timestamp_size != 8) {
4620 fprintf(stderr, "Only 64 bits timestamps are supported.\n");
4621 exit(1);
4622 }
4623 options.comparator = test::BytewiseComparatorWithU64TsWrapper();
4624 }
4625
4626 options.allow_data_in_errors = FLAGS_allow_data_in_errors;
4627 options.track_and_verify_wals_in_manifest =
4628 FLAGS_track_and_verify_wals_in_manifest;
4629
4630 // Integrated BlobDB
4631 options.enable_blob_files = FLAGS_enable_blob_files;
4632 options.min_blob_size = FLAGS_min_blob_size;
4633 options.blob_file_size = FLAGS_blob_file_size;
4634 options.blob_compression_type =
4635 StringToCompressionType(FLAGS_blob_compression_type.c_str());
4636 options.enable_blob_garbage_collection =
4637 FLAGS_enable_blob_garbage_collection;
4638 options.blob_garbage_collection_age_cutoff =
4639 FLAGS_blob_garbage_collection_age_cutoff;
4640 options.blob_garbage_collection_force_threshold =
4641 FLAGS_blob_garbage_collection_force_threshold;
4642 options.blob_compaction_readahead_size =
4643 FLAGS_blob_compaction_readahead_size;
4644 options.blob_file_starting_level = FLAGS_blob_file_starting_level;
4645
4646 #ifndef ROCKSDB_LITE
4647 if (FLAGS_readonly && FLAGS_transaction_db) {
4648 fprintf(stderr, "Cannot use readonly flag with transaction_db\n");
4649 exit(1);
4650 }
4651 if (FLAGS_use_secondary_db &&
4652 (FLAGS_transaction_db || FLAGS_optimistic_transaction_db)) {
4653 fprintf(stderr, "Cannot use use_secondary_db flag with transaction_db\n");
4654 exit(1);
4655 }
4656 #endif // ROCKSDB_LITE
4657 options.memtable_protection_bytes_per_key =
4658 FLAGS_memtable_protection_bytes_per_key;
4659 }
4660
4661 void InitializeOptionsGeneral(Options* opts) {
4662 // Be careful about what is set here to avoid accidentally overwriting
4663 // settings already configured by OPTIONS file. Only configure settings that
4664 // are needed for the benchmark to run, settings for shared objects that
4665 // were not configured already, settings that require dynamically invoking
4666 // APIs, and settings for the benchmark itself.
4667 Options& options = *opts;
4668
4669 // Always set these since they are harmless when not needed and prevent
4670 // a guaranteed failure when they are needed.
4671 options.create_missing_column_families = true;
4672 options.create_if_missing = true;
4673
4674 if (options.statistics == nullptr) {
4675 options.statistics = dbstats;
4676 }
4677
4678 auto table_options =
4679 options.table_factory->GetOptions<BlockBasedTableOptions>();
4680 if (table_options != nullptr) {
4681 if (FLAGS_cache_size > 0) {
4682 // This violates this function's rules on when to set options. But we
4683 // have to do it because the case of unconfigured block cache in OPTIONS
4684 // file is indistinguishable (it is sanitized to 8MB by this point, not
4685 // nullptr), and our regression tests assume this will be the shared
4686 // block cache, even with OPTIONS file provided.
4687 table_options->block_cache = cache_;
4688 }
4689 if (table_options->filter_policy == nullptr) {
4690 if (FLAGS_bloom_bits < 0) {
4691 table_options->filter_policy = BlockBasedTableOptions().filter_policy;
4692 } else if (FLAGS_bloom_bits == 0) {
4693 table_options->filter_policy.reset();
4694 } else {
4695 table_options->filter_policy.reset(
4696 FLAGS_use_ribbon_filter ? NewRibbonFilterPolicy(FLAGS_bloom_bits)
4697 : NewBloomFilterPolicy(FLAGS_bloom_bits));
4698 }
4699 }
4700 }
4701
4702 if (options.row_cache == nullptr) {
4703 if (FLAGS_row_cache_size) {
4704 if (FLAGS_cache_numshardbits >= 1) {
4705 options.row_cache =
4706 NewLRUCache(FLAGS_row_cache_size, FLAGS_cache_numshardbits);
4707 } else {
4708 options.row_cache = NewLRUCache(FLAGS_row_cache_size);
4709 }
4710 }
4711 }
4712
4713 if (options.env == Env::Default()) {
4714 options.env = FLAGS_env;
4715 }
4716 if (FLAGS_enable_io_prio) {
4717 options.env->LowerThreadPoolIOPriority(Env::LOW);
4718 options.env->LowerThreadPoolIOPriority(Env::HIGH);
4719 }
4720 if (FLAGS_enable_cpu_prio) {
4721 options.env->LowerThreadPoolCPUPriority(Env::LOW);
4722 options.env->LowerThreadPoolCPUPriority(Env::HIGH);
4723 }
4724
4725 if (FLAGS_sine_write_rate) {
4726 FLAGS_benchmark_write_rate_limit = static_cast<uint64_t>(SineRate(0));
4727 }
4728
4729 if (options.rate_limiter == nullptr) {
4730 if (FLAGS_rate_limiter_bytes_per_sec > 0) {
4731 options.rate_limiter.reset(NewGenericRateLimiter(
4732 FLAGS_rate_limiter_bytes_per_sec,
4733 FLAGS_rate_limiter_refill_period_us, 10 /* fairness */,
4734 // TODO: replace this with a more general FLAG for deciding
4735 // RateLimiter::Mode as now we also rate-limit foreground reads e.g,
4736 // Get()/MultiGet()
4737 FLAGS_rate_limit_bg_reads ? RateLimiter::Mode::kReadsOnly
4738 : RateLimiter::Mode::kWritesOnly,
4739 FLAGS_rate_limiter_auto_tuned));
4740 }
4741 }
4742
4743 options.listeners.emplace_back(listener_);
4744
4745 if (options.file_checksum_gen_factory == nullptr) {
4746 if (FLAGS_file_checksum) {
4747 options.file_checksum_gen_factory.reset(
4748 new FileChecksumGenCrc32cFactory());
4749 }
4750 }
4751
4752 if (FLAGS_num_multi_db <= 1) {
4753 OpenDb(options, FLAGS_db, &db_);
4754 } else {
4755 multi_dbs_.clear();
4756 multi_dbs_.resize(FLAGS_num_multi_db);
4757 auto wal_dir = options.wal_dir;
4758 for (int i = 0; i < FLAGS_num_multi_db; i++) {
4759 if (!wal_dir.empty()) {
4760 options.wal_dir = GetPathForMultiple(wal_dir, i);
4761 }
4762 OpenDb(options, GetPathForMultiple(FLAGS_db, i), &multi_dbs_[i]);
4763 }
4764 options.wal_dir = wal_dir;
4765 }
4766
4767 // KeepFilter is a noop filter, this can be used to test compaction filter
4768 if (options.compaction_filter == nullptr) {
4769 if (FLAGS_use_keep_filter) {
4770 options.compaction_filter = new KeepFilter();
4771 fprintf(stdout, "A noop compaction filter is used\n");
4772 }
4773 }
4774
4775 if (FLAGS_use_existing_keys) {
4776 // Only work on single database
4777 assert(db_.db != nullptr);
4778 ReadOptions read_opts; // before read_options_ initialized
4779 read_opts.total_order_seek = true;
4780 Iterator* iter = db_.db->NewIterator(read_opts);
4781 for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
4782 keys_.emplace_back(iter->key().ToString());
4783 }
4784 delete iter;
4785 FLAGS_num = keys_.size();
4786 }
4787 }
4788
4789 void Open(Options* opts) {
4790 if (!InitializeOptionsFromFile(opts)) {
4791 InitializeOptionsFromFlags(opts);
4792 }
4793
4794 InitializeOptionsGeneral(opts);
4795 }
4796
4797 void OpenDb(Options options, const std::string& db_name,
4798 DBWithColumnFamilies* db) {
4799 uint64_t open_start = FLAGS_report_open_timing ? FLAGS_env->NowNanos() : 0;
4800 Status s;
4801 // Open with column families if necessary.
4802 if (FLAGS_num_column_families > 1) {
4803 size_t num_hot = FLAGS_num_column_families;
4804 if (FLAGS_num_hot_column_families > 0 &&
4805 FLAGS_num_hot_column_families < FLAGS_num_column_families) {
4806 num_hot = FLAGS_num_hot_column_families;
4807 } else {
4808 FLAGS_num_hot_column_families = FLAGS_num_column_families;
4809 }
4810 std::vector<ColumnFamilyDescriptor> column_families;
4811 for (size_t i = 0; i < num_hot; i++) {
4812 column_families.push_back(ColumnFamilyDescriptor(
4813 ColumnFamilyName(i), ColumnFamilyOptions(options)));
4814 }
4815 std::vector<int> cfh_idx_to_prob;
4816 if (!FLAGS_column_family_distribution.empty()) {
4817 std::stringstream cf_prob_stream(FLAGS_column_family_distribution);
4818 std::string cf_prob;
4819 int sum = 0;
4820 while (std::getline(cf_prob_stream, cf_prob, ',')) {
4821 cfh_idx_to_prob.push_back(std::stoi(cf_prob));
4822 sum += cfh_idx_to_prob.back();
4823 }
4824 if (sum != 100) {
4825 fprintf(stderr, "column_family_distribution items must sum to 100\n");
4826 exit(1);
4827 }
4828 if (cfh_idx_to_prob.size() != num_hot) {
4829 fprintf(stderr,
4830 "got %" ROCKSDB_PRIszt
4831 " column_family_distribution items; expected "
4832 "%" ROCKSDB_PRIszt "\n",
4833 cfh_idx_to_prob.size(), num_hot);
4834 exit(1);
4835 }
4836 }
4837 #ifndef ROCKSDB_LITE
4838 if (FLAGS_readonly) {
4839 s = DB::OpenForReadOnly(options, db_name, column_families, &db->cfh,
4840 &db->db);
4841 } else if (FLAGS_optimistic_transaction_db) {
4842 s = OptimisticTransactionDB::Open(options, db_name, column_families,
4843 &db->cfh, &db->opt_txn_db);
4844 if (s.ok()) {
4845 db->db = db->opt_txn_db->GetBaseDB();
4846 }
4847 } else if (FLAGS_transaction_db) {
4848 TransactionDB* ptr;
4849 TransactionDBOptions txn_db_options;
4850 if (options.unordered_write) {
4851 options.two_write_queues = true;
4852 txn_db_options.skip_concurrency_control = true;
4853 txn_db_options.write_policy = WRITE_PREPARED;
4854 }
4855 s = TransactionDB::Open(options, txn_db_options, db_name,
4856 column_families, &db->cfh, &ptr);
4857 if (s.ok()) {
4858 db->db = ptr;
4859 }
4860 } else {
4861 s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
4862 }
4863 #else
4864 s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
4865 #endif // ROCKSDB_LITE
4866 db->cfh.resize(FLAGS_num_column_families);
4867 db->num_created = num_hot;
4868 db->num_hot = num_hot;
4869 db->cfh_idx_to_prob = std::move(cfh_idx_to_prob);
4870 #ifndef ROCKSDB_LITE
4871 } else if (FLAGS_readonly) {
4872 s = DB::OpenForReadOnly(options, db_name, &db->db);
4873 } else if (FLAGS_optimistic_transaction_db) {
4874 s = OptimisticTransactionDB::Open(options, db_name, &db->opt_txn_db);
4875 if (s.ok()) {
4876 db->db = db->opt_txn_db->GetBaseDB();
4877 }
4878 } else if (FLAGS_transaction_db) {
4879 TransactionDB* ptr = nullptr;
4880 TransactionDBOptions txn_db_options;
4881 if (options.unordered_write) {
4882 options.two_write_queues = true;
4883 txn_db_options.skip_concurrency_control = true;
4884 txn_db_options.write_policy = WRITE_PREPARED;
4885 }
4886 s = CreateLoggerFromOptions(db_name, options, &options.info_log);
4887 if (s.ok()) {
4888 s = TransactionDB::Open(options, txn_db_options, db_name, &ptr);
4889 }
4890 if (s.ok()) {
4891 db->db = ptr;
4892 }
4893 } else if (FLAGS_use_blob_db) {
4894 // Stacked BlobDB
4895 blob_db::BlobDBOptions blob_db_options;
4896 blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
4897 blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff;
4898 blob_db_options.is_fifo = FLAGS_blob_db_is_fifo;
4899 blob_db_options.max_db_size = FLAGS_blob_db_max_db_size;
4900 blob_db_options.ttl_range_secs = FLAGS_blob_db_ttl_range_secs;
4901 blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size;
4902 blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
4903 blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
4904 blob_db_options.compression = FLAGS_blob_db_compression_type_e;
4905 blob_db::BlobDB* ptr = nullptr;
4906 s = blob_db::BlobDB::Open(options, blob_db_options, db_name, &ptr);
4907 if (s.ok()) {
4908 db->db = ptr;
4909 }
4910 } else if (FLAGS_use_secondary_db) {
4911 if (FLAGS_secondary_path.empty()) {
4912 std::string default_secondary_path;
4913 FLAGS_env->GetTestDirectory(&default_secondary_path);
4914 default_secondary_path += "/dbbench_secondary";
4915 FLAGS_secondary_path = default_secondary_path;
4916 }
4917 s = DB::OpenAsSecondary(options, db_name, FLAGS_secondary_path, &db->db);
4918 if (s.ok() && FLAGS_secondary_update_interval > 0) {
4919 secondary_update_thread_.reset(new port::Thread(
4920 [this](int interval, DBWithColumnFamilies* _db) {
4921 while (0 == secondary_update_stopped_.load(
4922 std::memory_order_relaxed)) {
4923 Status secondary_update_status =
4924 _db->db->TryCatchUpWithPrimary();
4925 if (!secondary_update_status.ok()) {
4926 fprintf(stderr, "Failed to catch up with primary: %s\n",
4927 secondary_update_status.ToString().c_str());
4928 break;
4929 }
4930 ++secondary_db_updates_;
4931 FLAGS_env->SleepForMicroseconds(interval * 1000000);
4932 }
4933 },
4934 FLAGS_secondary_update_interval, db));
4935 }
4936 #endif // ROCKSDB_LITE
4937 } else {
4938 s = DB::Open(options, db_name, &db->db);
4939 }
4940 if (FLAGS_report_open_timing) {
4941 std::cout << "OpenDb: "
4942 << (FLAGS_env->NowNanos() - open_start) / 1000000.0
4943 << " milliseconds\n";
4944 }
4945 if (!s.ok()) {
4946 fprintf(stderr, "open error: %s\n", s.ToString().c_str());
4947 exit(1);
4948 }
4949 }
4950
4951 enum WriteMode { RANDOM, SEQUENTIAL, UNIQUE_RANDOM };
4952
4953 void WriteSeqDeterministic(ThreadState* thread) {
4954 DoDeterministicCompact(thread, open_options_.compaction_style, SEQUENTIAL);
4955 }
4956
4957 void WriteUniqueRandomDeterministic(ThreadState* thread) {
4958 DoDeterministicCompact(thread, open_options_.compaction_style,
4959 UNIQUE_RANDOM);
4960 }
4961
4962 void WriteSeq(ThreadState* thread) { DoWrite(thread, SEQUENTIAL); }
4963
4964 void WriteRandom(ThreadState* thread) { DoWrite(thread, RANDOM); }
4965
4966 void WriteUniqueRandom(ThreadState* thread) {
4967 DoWrite(thread, UNIQUE_RANDOM);
4968 }
4969
4970 class KeyGenerator {
4971 public:
4972 KeyGenerator(Random64* rand, WriteMode mode, uint64_t num,
4973 uint64_t /*num_per_set*/ = 64 * 1024)
4974 : rand_(rand), mode_(mode), num_(num), next_(0) {
4975 if (mode_ == UNIQUE_RANDOM) {
4976 // NOTE: if memory consumption of this approach becomes a concern,
4977 // we can either break it into pieces and only random shuffle a section
4978 // each time. Alternatively, use a bit map implementation
4979 // (https://reviews.facebook.net/differential/diff/54627/)
4980 values_.resize(num_);
4981 for (uint64_t i = 0; i < num_; ++i) {
4982 values_[i] = i;
4983 }
4984 RandomShuffle(values_.begin(), values_.end(),
4985 static_cast<uint32_t>(seed_base));
4986 }
4987 }
4988
4989 uint64_t Next() {
4990 switch (mode_) {
4991 case SEQUENTIAL:
4992 return next_++;
4993 case RANDOM:
4994 return rand_->Next() % num_;
4995 case UNIQUE_RANDOM:
4996 assert(next_ < num_);
4997 return values_[next_++];
4998 }
4999 assert(false);
5000 return std::numeric_limits<uint64_t>::max();
5001 }
5002
5003 // Only available for UNIQUE_RANDOM mode.
5004 uint64_t Fetch(uint64_t index) {
5005 assert(mode_ == UNIQUE_RANDOM);
5006 assert(index < values_.size());
5007 return values_[index];
5008 }
5009
5010 private:
5011 Random64* rand_;
5012 WriteMode mode_;
5013 const uint64_t num_;
5014 uint64_t next_;
5015 std::vector<uint64_t> values_;
5016 };
5017
5018 DB* SelectDB(ThreadState* thread) { return SelectDBWithCfh(thread)->db; }
5019
5020 DBWithColumnFamilies* SelectDBWithCfh(ThreadState* thread) {
5021 return SelectDBWithCfh(thread->rand.Next());
5022 }
5023
5024 DBWithColumnFamilies* SelectDBWithCfh(uint64_t rand_int) {
5025 if (db_.db != nullptr) {
5026 return &db_;
5027 } else {
5028 return &multi_dbs_[rand_int % multi_dbs_.size()];
5029 }
5030 }
5031
5032 double SineRate(double x) {
5033 return FLAGS_sine_a * sin((FLAGS_sine_b * x) + FLAGS_sine_c) + FLAGS_sine_d;
5034 }
5035
5036 void DoWrite(ThreadState* thread, WriteMode write_mode) {
5037 const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0;
5038 const int64_t num_ops = writes_ == 0 ? num_ : writes_;
5039
5040 size_t num_key_gens = 1;
5041 if (db_.db == nullptr) {
5042 num_key_gens = multi_dbs_.size();
5043 }
5044 std::vector<std::unique_ptr<KeyGenerator>> key_gens(num_key_gens);
5045 int64_t max_ops = num_ops * num_key_gens;
5046 int64_t ops_per_stage = max_ops;
5047 if (FLAGS_num_column_families > 1 && FLAGS_num_hot_column_families > 0) {
5048 ops_per_stage = (max_ops - 1) / (FLAGS_num_column_families /
5049 FLAGS_num_hot_column_families) +
5050 1;
5051 }
5052
5053 Duration duration(test_duration, max_ops, ops_per_stage);
5054 const uint64_t num_per_key_gen = num_ + max_num_range_tombstones_;
5055 for (size_t i = 0; i < num_key_gens; i++) {
5056 key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode,
5057 num_per_key_gen, ops_per_stage));
5058 }
5059
5060 if (num_ != FLAGS_num) {
5061 char msg[100];
5062 snprintf(msg, sizeof(msg), "(%" PRIu64 " ops)", num_);
5063 thread->stats.AddMessage(msg);
5064 }
5065
5066 RandomGenerator gen;
5067 WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
5068 FLAGS_write_batch_protection_bytes_per_key,
5069 user_timestamp_size_);
5070 Status s;
5071 int64_t bytes = 0;
5072
5073 std::unique_ptr<const char[]> key_guard;
5074 Slice key = AllocateKey(&key_guard);
5075 std::unique_ptr<const char[]> begin_key_guard;
5076 Slice begin_key = AllocateKey(&begin_key_guard);
5077 std::unique_ptr<const char[]> end_key_guard;
5078 Slice end_key = AllocateKey(&end_key_guard);
5079 double p = 0.0;
5080 uint64_t num_overwrites = 0, num_unique_keys = 0, num_selective_deletes = 0;
5081 // If user set overwrite_probability flag,
5082 // check if value is in [0.0,1.0].
5083 if (FLAGS_overwrite_probability > 0.0) {
5084 p = FLAGS_overwrite_probability > 1.0 ? 1.0 : FLAGS_overwrite_probability;
5085 // If overwrite set by user, and UNIQUE_RANDOM mode on,
5086 // the overwrite_window_size must be > 0.
5087 if (write_mode == UNIQUE_RANDOM && FLAGS_overwrite_window_size == 0) {
5088 fprintf(stderr,
5089 "Overwrite_window_size must be strictly greater than 0.\n");
5090 ErrorExit();
5091 }
5092 }
5093
5094 // Default_random_engine provides slightly
5095 // improved throughput over mt19937.
5096 std::default_random_engine overwrite_gen{
5097 static_cast<unsigned int>(seed_base)};
5098 std::bernoulli_distribution overwrite_decider(p);
5099
5100 // Inserted key window is filled with the last N
5101 // keys previously inserted into the DB (with
5102 // N=FLAGS_overwrite_window_size).
5103 // We use a deque struct because:
5104 // - random access is O(1)
5105 // - insertion/removal at beginning/end is also O(1).
5106 std::deque<int64_t> inserted_key_window;
5107 Random64 reservoir_id_gen(seed_base);
5108
5109 // --- Variables used in disposable/persistent keys simulation:
5110 // The following variables are used when
5111 // disposable_entries_batch_size is >0. We simualte a workload
5112 // where the following sequence is repeated multiple times:
5113 // "A set of keys S1 is inserted ('disposable entries'), then after
5114 // some delay another set of keys S2 is inserted ('persistent entries')
5115 // and the first set of keys S1 is deleted. S2 artificially represents
5116 // the insertion of hypothetical results from some undefined computation
5117 // done on the first set of keys S1. The next sequence can start as soon
5118 // as the last disposable entry in the set S1 of this sequence is
5119 // inserted, if the delay is non negligible"
5120 bool skip_for_loop = false, is_disposable_entry = true;
5121 std::vector<uint64_t> disposable_entries_index(num_key_gens, 0);
5122 std::vector<uint64_t> persistent_ent_and_del_index(num_key_gens, 0);
5123 const uint64_t kNumDispAndPersEntries =
5124 FLAGS_disposable_entries_batch_size +
5125 FLAGS_persistent_entries_batch_size;
5126 if (kNumDispAndPersEntries > 0) {
5127 if ((write_mode != UNIQUE_RANDOM) || (writes_per_range_tombstone_ > 0) ||
5128 (p > 0.0)) {
5129 fprintf(
5130 stderr,
5131 "Disposable/persistent deletes are not compatible with overwrites "
5132 "and DeleteRanges; and are only supported in filluniquerandom.\n");
5133 ErrorExit();
5134 }
5135 if (FLAGS_disposable_entries_value_size < 0 ||
5136 FLAGS_persistent_entries_value_size < 0) {
5137 fprintf(
5138 stderr,
5139 "disposable_entries_value_size and persistent_entries_value_size"
5140 "have to be positive.\n");
5141 ErrorExit();
5142 }
5143 }
5144 Random rnd_disposable_entry(static_cast<uint32_t>(seed_base));
5145 std::string random_value;
5146 // Queue that stores scheduled timestamp of disposable entries deletes,
5147 // along with starting index of disposable entry keys to delete.
5148 std::vector<std::queue<std::pair<uint64_t, uint64_t>>> disposable_entries_q(
5149 num_key_gens);
5150 // --- End of variables used in disposable/persistent keys simulation.
5151
5152 std::vector<std::unique_ptr<const char[]>> expanded_key_guards;
5153 std::vector<Slice> expanded_keys;
5154 if (FLAGS_expand_range_tombstones) {
5155 expanded_key_guards.resize(range_tombstone_width_);
5156 for (auto& expanded_key_guard : expanded_key_guards) {
5157 expanded_keys.emplace_back(AllocateKey(&expanded_key_guard));
5158 }
5159 }
5160
5161 std::unique_ptr<char[]> ts_guard;
5162 if (user_timestamp_size_ > 0) {
5163 ts_guard.reset(new char[user_timestamp_size_]);
5164 }
5165
5166 int64_t stage = 0;
5167 int64_t num_written = 0;
5168 int64_t next_seq_db_at = num_ops;
5169 size_t id = 0;
5170 int64_t num_range_deletions = 0;
5171
5172 while ((num_per_key_gen != 0) && !duration.Done(entries_per_batch_)) {
5173 if (duration.GetStage() != stage) {
5174 stage = duration.GetStage();
5175 if (db_.db != nullptr) {
5176 db_.CreateNewCf(open_options_, stage);
5177 } else {
5178 for (auto& db : multi_dbs_) {
5179 db.CreateNewCf(open_options_, stage);
5180 }
5181 }
5182 }
5183
5184 if (write_mode != SEQUENTIAL) {
5185 id = thread->rand.Next() % num_key_gens;
5186 } else {
5187 // When doing a sequential load with multiple databases, load them in
5188 // order rather than all at the same time to avoid:
5189 // 1) long delays between flushing memtables
5190 // 2) flushing memtables for all of them at the same point in time
5191 // 3) not putting the same number of keys in each database
5192 if (num_written >= next_seq_db_at) {
5193 next_seq_db_at += num_ops;
5194 id++;
5195 if (id >= num_key_gens) {
5196 fprintf(stderr, "Logic error. Filled all databases\n");
5197 ErrorExit();
5198 }
5199 }
5200 }
5201 DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(id);
5202
5203 batch.Clear();
5204 int64_t batch_bytes = 0;
5205
5206 for (int64_t j = 0; j < entries_per_batch_; j++) {
5207 int64_t rand_num = 0;
5208 if ((write_mode == UNIQUE_RANDOM) && (p > 0.0)) {
5209 if ((inserted_key_window.size() > 0) &&
5210 overwrite_decider(overwrite_gen)) {
5211 num_overwrites++;
5212 rand_num = inserted_key_window[reservoir_id_gen.Next() %
5213 inserted_key_window.size()];
5214 } else {
5215 num_unique_keys++;
5216 rand_num = key_gens[id]->Next();
5217 if (inserted_key_window.size() < FLAGS_overwrite_window_size) {
5218 inserted_key_window.push_back(rand_num);
5219 } else {
5220 inserted_key_window.pop_front();
5221 inserted_key_window.push_back(rand_num);
5222 }
5223 }
5224 } else if (kNumDispAndPersEntries > 0) {
5225 // Check if queue is non-empty and if we need to insert
5226 // 'persistent' KV entries (KV entries that are never deleted)
5227 // and delete disposable entries previously inserted.
5228 if (!disposable_entries_q[id].empty() &&
5229 (disposable_entries_q[id].front().first <
5230 FLAGS_env->NowMicros())) {
5231 // If we need to perform a "merge op" pattern,
5232 // we first write all the persistent KV entries not targeted
5233 // by deletes, and then we write the disposable entries deletes.
5234 if (persistent_ent_and_del_index[id] <
5235 FLAGS_persistent_entries_batch_size) {
5236 // Generate key to insert.
5237 rand_num =
5238 key_gens[id]->Fetch(disposable_entries_q[id].front().second +
5239 FLAGS_disposable_entries_batch_size +
5240 persistent_ent_and_del_index[id]);
5241 persistent_ent_and_del_index[id]++;
5242 is_disposable_entry = false;
5243 skip_for_loop = false;
5244 } else if (persistent_ent_and_del_index[id] <
5245 kNumDispAndPersEntries) {
5246 // Find key of the entry to delete.
5247 rand_num =
5248 key_gens[id]->Fetch(disposable_entries_q[id].front().second +
5249 (persistent_ent_and_del_index[id] -
5250 FLAGS_persistent_entries_batch_size));
5251 persistent_ent_and_del_index[id]++;
5252 GenerateKeyFromInt(rand_num, FLAGS_num, &key);
5253 // For the delete operation, everything happens here and we
5254 // skip the rest of the for-loop, which is designed for
5255 // inserts.
5256 if (FLAGS_num_column_families <= 1) {
5257 batch.Delete(key);
5258 } else {
5259 // We use same rand_num as seed for key and column family so
5260 // that we can deterministically find the cfh corresponding to a
5261 // particular key while reading the key.
5262 batch.Delete(db_with_cfh->GetCfh(rand_num), key);
5263 }
5264 // A delete only includes Key+Timestamp (no value).
5265 batch_bytes += key_size_ + user_timestamp_size_;
5266 bytes += key_size_ + user_timestamp_size_;
5267 num_selective_deletes++;
5268 // Skip rest of the for-loop (j=0, j<entries_per_batch_,j++).
5269 skip_for_loop = true;
5270 } else {
5271 assert(false); // should never reach this point.
5272 }
5273 // If disposable_entries_q needs to be updated (ie: when a selective
5274 // insert+delete was successfully completed, pop the job out of the
5275 // queue).
5276 if (!disposable_entries_q[id].empty() &&
5277 (disposable_entries_q[id].front().first <
5278 FLAGS_env->NowMicros()) &&
5279 persistent_ent_and_del_index[id] == kNumDispAndPersEntries) {
5280 disposable_entries_q[id].pop();
5281 persistent_ent_and_del_index[id] = 0;
5282 }
5283
5284 // If we are deleting disposable entries, skip the rest of the
5285 // for-loop since there is no key-value inserts at this moment in
5286 // time.
5287 if (skip_for_loop) {
5288 continue;
5289 }
5290
5291 }
5292 // If no job is in the queue, then we keep inserting disposable KV
5293 // entries that will be deleted later by a series of deletes.
5294 else {
5295 rand_num = key_gens[id]->Fetch(disposable_entries_index[id]);
5296 disposable_entries_index[id]++;
5297 is_disposable_entry = true;
5298 if ((disposable_entries_index[id] %
5299 FLAGS_disposable_entries_batch_size) == 0) {
5300 // Skip the persistent KV entries inserts for now
5301 disposable_entries_index[id] +=
5302 FLAGS_persistent_entries_batch_size;
5303 }
5304 }
5305 } else {
5306 rand_num = key_gens[id]->Next();
5307 }
5308 GenerateKeyFromInt(rand_num, FLAGS_num, &key);
5309 Slice val;
5310 if (kNumDispAndPersEntries > 0) {
5311 random_value = rnd_disposable_entry.RandomString(
5312 is_disposable_entry ? FLAGS_disposable_entries_value_size
5313 : FLAGS_persistent_entries_value_size);
5314 val = Slice(random_value);
5315 num_unique_keys++;
5316 } else {
5317 val = gen.Generate();
5318 }
5319 if (use_blob_db_) {
5320 #ifndef ROCKSDB_LITE
5321 // Stacked BlobDB
5322 blob_db::BlobDB* blobdb =
5323 static_cast<blob_db::BlobDB*>(db_with_cfh->db);
5324 if (FLAGS_blob_db_max_ttl_range > 0) {
5325 int ttl = rand() % FLAGS_blob_db_max_ttl_range;
5326 s = blobdb->PutWithTTL(write_options_, key, val, ttl);
5327 } else {
5328 s = blobdb->Put(write_options_, key, val);
5329 }
5330 #endif // ROCKSDB_LITE
5331 } else if (FLAGS_num_column_families <= 1) {
5332 batch.Put(key, val);
5333 } else {
5334 // We use same rand_num as seed for key and column family so that we
5335 // can deterministically find the cfh corresponding to a particular
5336 // key while reading the key.
5337 batch.Put(db_with_cfh->GetCfh(rand_num), key, val);
5338 }
5339 batch_bytes += val.size() + key_size_ + user_timestamp_size_;
5340 bytes += val.size() + key_size_ + user_timestamp_size_;
5341 ++num_written;
5342
5343 // If all disposable entries have been inserted, then we need to
5344 // add in the job queue a call for 'persistent entry insertions +
5345 // disposable entry deletions'.
5346 if (kNumDispAndPersEntries > 0 && is_disposable_entry &&
5347 ((disposable_entries_index[id] % kNumDispAndPersEntries) == 0)) {
5348 // Queue contains [timestamp, starting_idx],
5349 // timestamp = current_time + delay (minimum aboslute time when to
5350 // start inserting the selective deletes) starting_idx = index in the
5351 // keygen of the rand_num to generate the key of the first KV entry to
5352 // delete (= key of the first selective delete).
5353 disposable_entries_q[id].push(std::make_pair(
5354 FLAGS_env->NowMicros() +
5355 FLAGS_disposable_entries_delete_delay /* timestamp */,
5356 disposable_entries_index[id] - kNumDispAndPersEntries
5357 /*starting idx*/));
5358 }
5359 if (writes_per_range_tombstone_ > 0 &&
5360 num_written > writes_before_delete_range_ &&
5361 (num_written - writes_before_delete_range_) /
5362 writes_per_range_tombstone_ <=
5363 max_num_range_tombstones_ &&
5364 (num_written - writes_before_delete_range_) %
5365 writes_per_range_tombstone_ ==
5366 0) {
5367 num_range_deletions++;
5368 int64_t begin_num = key_gens[id]->Next();
5369 if (FLAGS_expand_range_tombstones) {
5370 for (int64_t offset = 0; offset < range_tombstone_width_;
5371 ++offset) {
5372 GenerateKeyFromInt(begin_num + offset, FLAGS_num,
5373 &expanded_keys[offset]);
5374 if (use_blob_db_) {
5375 #ifndef ROCKSDB_LITE
5376 // Stacked BlobDB
5377 s = db_with_cfh->db->Delete(write_options_,
5378 expanded_keys[offset]);
5379 #endif // ROCKSDB_LITE
5380 } else if (FLAGS_num_column_families <= 1) {
5381 batch.Delete(expanded_keys[offset]);
5382 } else {
5383 batch.Delete(db_with_cfh->GetCfh(rand_num),
5384 expanded_keys[offset]);
5385 }
5386 }
5387 } else {
5388 GenerateKeyFromInt(begin_num, FLAGS_num, &begin_key);
5389 GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num,
5390 &end_key);
5391 if (use_blob_db_) {
5392 #ifndef ROCKSDB_LITE
5393 // Stacked BlobDB
5394 s = db_with_cfh->db->DeleteRange(
5395 write_options_, db_with_cfh->db->DefaultColumnFamily(),
5396 begin_key, end_key);
5397 #endif // ROCKSDB_LITE
5398 } else if (FLAGS_num_column_families <= 1) {
5399 batch.DeleteRange(begin_key, end_key);
5400 } else {
5401 batch.DeleteRange(db_with_cfh->GetCfh(rand_num), begin_key,
5402 end_key);
5403 }
5404 }
5405 }
5406 }
5407 if (thread->shared->write_rate_limiter.get() != nullptr) {
5408 thread->shared->write_rate_limiter->Request(
5409 batch_bytes, Env::IO_HIGH, nullptr /* stats */,
5410 RateLimiter::OpType::kWrite);
5411 // Set time at which last op finished to Now() to hide latency and
5412 // sleep from rate limiter. Also, do the check once per batch, not
5413 // once per write.
5414 thread->stats.ResetLastOpTime();
5415 }
5416 if (user_timestamp_size_ > 0) {
5417 Slice user_ts = mock_app_clock_->Allocate(ts_guard.get());
5418 s = batch.UpdateTimestamps(
5419 user_ts, [this](uint32_t) { return user_timestamp_size_; });
5420 if (!s.ok()) {
5421 fprintf(stderr, "assign timestamp to write batch: %s\n",
5422 s.ToString().c_str());
5423 ErrorExit();
5424 }
5425 }
5426 if (!use_blob_db_) {
5427 // Not stacked BlobDB
5428 s = db_with_cfh->db->Write(write_options_, &batch);
5429 }
5430 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db,
5431 entries_per_batch_, kWrite);
5432 if (FLAGS_sine_write_rate) {
5433 uint64_t now = FLAGS_env->NowMicros();
5434
5435 uint64_t usecs_since_last;
5436 if (now > thread->stats.GetSineInterval()) {
5437 usecs_since_last = now - thread->stats.GetSineInterval();
5438 } else {
5439 usecs_since_last = 0;
5440 }
5441
5442 if (usecs_since_last >
5443 (FLAGS_sine_write_rate_interval_milliseconds * uint64_t{1000})) {
5444 double usecs_since_start =
5445 static_cast<double>(now - thread->stats.GetStart());
5446 thread->stats.ResetSineInterval();
5447 uint64_t write_rate =
5448 static_cast<uint64_t>(SineRate(usecs_since_start / 1000000.0));
5449 thread->shared->write_rate_limiter.reset(
5450 NewGenericRateLimiter(write_rate));
5451 }
5452 }
5453 if (!s.ok()) {
5454 s = listener_->WaitForRecovery(600000000) ? Status::OK() : s;
5455 }
5456
5457 if (!s.ok()) {
5458 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
5459 ErrorExit();
5460 }
5461 }
5462 if ((write_mode == UNIQUE_RANDOM) && (p > 0.0)) {
5463 fprintf(stdout,
5464 "Number of unique keys inserted: %" PRIu64
5465 ".\nNumber of overwrites: %" PRIu64 "\n",
5466 num_unique_keys, num_overwrites);
5467 } else if (kNumDispAndPersEntries > 0) {
5468 fprintf(stdout,
5469 "Number of unique keys inserted (disposable+persistent): %" PRIu64
5470 ".\nNumber of 'disposable entry delete': %" PRIu64 "\n",
5471 num_written, num_selective_deletes);
5472 }
5473 if (num_range_deletions > 0) {
5474 std::cout << "Number of range deletions: " << num_range_deletions
5475 << std::endl;
5476 }
5477 thread->stats.AddBytes(bytes);
5478 }
5479
5480 Status DoDeterministicCompact(ThreadState* thread,
5481 CompactionStyle compaction_style,
5482 WriteMode write_mode) {
5483 #ifndef ROCKSDB_LITE
5484 ColumnFamilyMetaData meta;
5485 std::vector<DB*> db_list;
5486 if (db_.db != nullptr) {
5487 db_list.push_back(db_.db);
5488 } else {
5489 for (auto& db : multi_dbs_) {
5490 db_list.push_back(db.db);
5491 }
5492 }
5493 std::vector<Options> options_list;
5494 for (auto db : db_list) {
5495 options_list.push_back(db->GetOptions());
5496 if (compaction_style != kCompactionStyleFIFO) {
5497 db->SetOptions({{"disable_auto_compactions", "1"},
5498 {"level0_slowdown_writes_trigger", "400000000"},
5499 {"level0_stop_writes_trigger", "400000000"}});
5500 } else {
5501 db->SetOptions({{"disable_auto_compactions", "1"}});
5502 }
5503 }
5504
5505 assert(!db_list.empty());
5506 auto num_db = db_list.size();
5507 size_t num_levels = static_cast<size_t>(open_options_.num_levels);
5508 size_t output_level = open_options_.num_levels - 1;
5509 std::vector<std::vector<std::vector<SstFileMetaData>>> sorted_runs(num_db);
5510 std::vector<size_t> num_files_at_level0(num_db, 0);
5511 if (compaction_style == kCompactionStyleLevel) {
5512 if (num_levels == 0) {
5513 return Status::InvalidArgument("num_levels should be larger than 1");
5514 }
5515 bool should_stop = false;
5516 while (!should_stop) {
5517 if (sorted_runs[0].empty()) {
5518 DoWrite(thread, write_mode);
5519 } else {
5520 DoWrite(thread, UNIQUE_RANDOM);
5521 }
5522 for (size_t i = 0; i < num_db; i++) {
5523 auto db = db_list[i];
5524 db->Flush(FlushOptions());
5525 db->GetColumnFamilyMetaData(&meta);
5526 if (num_files_at_level0[i] == meta.levels[0].files.size() ||
5527 writes_ == 0) {
5528 should_stop = true;
5529 continue;
5530 }
5531 sorted_runs[i].emplace_back(
5532 meta.levels[0].files.begin(),
5533 meta.levels[0].files.end() - num_files_at_level0[i]);
5534 num_files_at_level0[i] = meta.levels[0].files.size();
5535 if (sorted_runs[i].back().size() == 1) {
5536 should_stop = true;
5537 continue;
5538 }
5539 if (sorted_runs[i].size() == output_level) {
5540 auto& L1 = sorted_runs[i].back();
5541 L1.erase(L1.begin(), L1.begin() + L1.size() / 3);
5542 should_stop = true;
5543 continue;
5544 }
5545 }
5546 writes_ /=
5547 static_cast<int64_t>(open_options_.max_bytes_for_level_multiplier);
5548 }
5549 for (size_t i = 0; i < num_db; i++) {
5550 if (sorted_runs[i].size() < num_levels - 1) {
5551 fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
5552 num_levels);
5553 exit(1);
5554 }
5555 }
5556 for (size_t i = 0; i < num_db; i++) {
5557 auto db = db_list[i];
5558 auto compactionOptions = CompactionOptions();
5559 compactionOptions.compression = FLAGS_compression_type_e;
5560 auto options = db->GetOptions();
5561 MutableCFOptions mutable_cf_options(options);
5562 for (size_t j = 0; j < sorted_runs[i].size(); j++) {
5563 compactionOptions.output_file_size_limit = MaxFileSizeForLevel(
5564 mutable_cf_options, static_cast<int>(output_level),
5565 compaction_style);
5566 std::cout << sorted_runs[i][j].size() << std::endl;
5567 db->CompactFiles(
5568 compactionOptions,
5569 {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name},
5570 static_cast<int>(output_level - j) /*level*/);
5571 }
5572 }
5573 } else if (compaction_style == kCompactionStyleUniversal) {
5574 auto ratio = open_options_.compaction_options_universal.size_ratio;
5575 bool should_stop = false;
5576 while (!should_stop) {
5577 if (sorted_runs[0].empty()) {
5578 DoWrite(thread, write_mode);
5579 } else {
5580 DoWrite(thread, UNIQUE_RANDOM);
5581 }
5582 for (size_t i = 0; i < num_db; i++) {
5583 auto db = db_list[i];
5584 db->Flush(FlushOptions());
5585 db->GetColumnFamilyMetaData(&meta);
5586 if (num_files_at_level0[i] == meta.levels[0].files.size() ||
5587 writes_ == 0) {
5588 should_stop = true;
5589 continue;
5590 }
5591 sorted_runs[i].emplace_back(
5592 meta.levels[0].files.begin(),
5593 meta.levels[0].files.end() - num_files_at_level0[i]);
5594 num_files_at_level0[i] = meta.levels[0].files.size();
5595 if (sorted_runs[i].back().size() == 1) {
5596 should_stop = true;
5597 continue;
5598 }
5599 num_files_at_level0[i] = meta.levels[0].files.size();
5600 }
5601 writes_ = static_cast<int64_t>(writes_ * static_cast<double>(100) /
5602 (ratio + 200));
5603 }
5604 for (size_t i = 0; i < num_db; i++) {
5605 if (sorted_runs[i].size() < num_levels) {
5606 fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
5607 num_levels);
5608 exit(1);
5609 }
5610 }
5611 for (size_t i = 0; i < num_db; i++) {
5612 auto db = db_list[i];
5613 auto compactionOptions = CompactionOptions();
5614 compactionOptions.compression = FLAGS_compression_type_e;
5615 auto options = db->GetOptions();
5616 MutableCFOptions mutable_cf_options(options);
5617 for (size_t j = 0; j < sorted_runs[i].size(); j++) {
5618 compactionOptions.output_file_size_limit = MaxFileSizeForLevel(
5619 mutable_cf_options, static_cast<int>(output_level),
5620 compaction_style);
5621 db->CompactFiles(
5622 compactionOptions,
5623 {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name},
5624 (output_level > j ? static_cast<int>(output_level - j)
5625 : 0) /*level*/);
5626 }
5627 }
5628 } else if (compaction_style == kCompactionStyleFIFO) {
5629 if (num_levels != 1) {
5630 return Status::InvalidArgument(
5631 "num_levels should be 1 for FIFO compaction");
5632 }
5633 if (FLAGS_num_multi_db != 0) {
5634 return Status::InvalidArgument("Doesn't support multiDB");
5635 }
5636 auto db = db_list[0];
5637 std::vector<std::string> file_names;
5638 while (true) {
5639 if (sorted_runs[0].empty()) {
5640 DoWrite(thread, write_mode);
5641 } else {
5642 DoWrite(thread, UNIQUE_RANDOM);
5643 }
5644 db->Flush(FlushOptions());
5645 db->GetColumnFamilyMetaData(&meta);
5646 auto total_size = meta.levels[0].size;
5647 if (total_size >=
5648 db->GetOptions().compaction_options_fifo.max_table_files_size) {
5649 for (auto file_meta : meta.levels[0].files) {
5650 file_names.emplace_back(file_meta.name);
5651 }
5652 break;
5653 }
5654 }
5655 // TODO(shuzhang1989): Investigate why CompactFiles not working
5656 // auto compactionOptions = CompactionOptions();
5657 // db->CompactFiles(compactionOptions, file_names, 0);
5658 auto compactionOptions = CompactRangeOptions();
5659 db->CompactRange(compactionOptions, nullptr, nullptr);
5660 } else {
5661 fprintf(stdout,
5662 "%-12s : skipped (-compaction_stype=kCompactionStyleNone)\n",
5663 "filldeterministic");
5664 return Status::InvalidArgument("None compaction is not supported");
5665 }
5666
5667 // Verify seqno and key range
5668 // Note: the seqno get changed at the max level by implementation
5669 // optimization, so skip the check of the max level.
5670 #ifndef NDEBUG
5671 for (size_t k = 0; k < num_db; k++) {
5672 auto db = db_list[k];
5673 db->GetColumnFamilyMetaData(&meta);
5674 // verify the number of sorted runs
5675 if (compaction_style == kCompactionStyleLevel) {
5676 assert(num_levels - 1 == sorted_runs[k].size());
5677 } else if (compaction_style == kCompactionStyleUniversal) {
5678 assert(meta.levels[0].files.size() + num_levels - 1 ==
5679 sorted_runs[k].size());
5680 } else if (compaction_style == kCompactionStyleFIFO) {
5681 // TODO(gzh): FIFO compaction
5682 db->GetColumnFamilyMetaData(&meta);
5683 auto total_size = meta.levels[0].size;
5684 assert(total_size <=
5685 db->GetOptions().compaction_options_fifo.max_table_files_size);
5686 break;
5687 }
5688
5689 // verify smallest/largest seqno and key range of each sorted run
5690 auto max_level = num_levels - 1;
5691 int level;
5692 for (size_t i = 0; i < sorted_runs[k].size(); i++) {
5693 level = static_cast<int>(max_level - i);
5694 SequenceNumber sorted_run_smallest_seqno = kMaxSequenceNumber;
5695 SequenceNumber sorted_run_largest_seqno = 0;
5696 std::string sorted_run_smallest_key, sorted_run_largest_key;
5697 bool first_key = true;
5698 for (auto fileMeta : sorted_runs[k][i]) {
5699 sorted_run_smallest_seqno =
5700 std::min(sorted_run_smallest_seqno, fileMeta.smallest_seqno);
5701 sorted_run_largest_seqno =
5702 std::max(sorted_run_largest_seqno, fileMeta.largest_seqno);
5703 if (first_key ||
5704 db->DefaultColumnFamily()->GetComparator()->Compare(
5705 fileMeta.smallestkey, sorted_run_smallest_key) < 0) {
5706 sorted_run_smallest_key = fileMeta.smallestkey;
5707 }
5708 if (first_key ||
5709 db->DefaultColumnFamily()->GetComparator()->Compare(
5710 fileMeta.largestkey, sorted_run_largest_key) > 0) {
5711 sorted_run_largest_key = fileMeta.largestkey;
5712 }
5713 first_key = false;
5714 }
5715 if (compaction_style == kCompactionStyleLevel ||
5716 (compaction_style == kCompactionStyleUniversal && level > 0)) {
5717 SequenceNumber level_smallest_seqno = kMaxSequenceNumber;
5718 SequenceNumber level_largest_seqno = 0;
5719 for (auto fileMeta : meta.levels[level].files) {
5720 level_smallest_seqno =
5721 std::min(level_smallest_seqno, fileMeta.smallest_seqno);
5722 level_largest_seqno =
5723 std::max(level_largest_seqno, fileMeta.largest_seqno);
5724 }
5725 assert(sorted_run_smallest_key ==
5726 meta.levels[level].files.front().smallestkey);
5727 assert(sorted_run_largest_key ==
5728 meta.levels[level].files.back().largestkey);
5729 if (level != static_cast<int>(max_level)) {
5730 // compaction at max_level would change sequence number
5731 assert(sorted_run_smallest_seqno == level_smallest_seqno);
5732 assert(sorted_run_largest_seqno == level_largest_seqno);
5733 }
5734 } else if (compaction_style == kCompactionStyleUniversal) {
5735 // level <= 0 means sorted runs on level 0
5736 auto level0_file =
5737 meta.levels[0].files[sorted_runs[k].size() - 1 - i];
5738 assert(sorted_run_smallest_key == level0_file.smallestkey);
5739 assert(sorted_run_largest_key == level0_file.largestkey);
5740 if (level != static_cast<int>(max_level)) {
5741 assert(sorted_run_smallest_seqno == level0_file.smallest_seqno);
5742 assert(sorted_run_largest_seqno == level0_file.largest_seqno);
5743 }
5744 }
5745 }
5746 }
5747 #endif
5748 // print the size of each sorted_run
5749 for (size_t k = 0; k < num_db; k++) {
5750 auto db = db_list[k];
5751 fprintf(stdout,
5752 "---------------------- DB %" ROCKSDB_PRIszt
5753 " LSM ---------------------\n",
5754 k);
5755 db->GetColumnFamilyMetaData(&meta);
5756 for (auto& levelMeta : meta.levels) {
5757 if (levelMeta.files.empty()) {
5758 continue;
5759 }
5760 if (levelMeta.level == 0) {
5761 for (auto& fileMeta : levelMeta.files) {
5762 fprintf(stdout, "Level[%d]: %s(size: %" PRIi64 " bytes)\n",
5763 levelMeta.level, fileMeta.name.c_str(), fileMeta.size);
5764 }
5765 } else {
5766 fprintf(stdout, "Level[%d]: %s - %s(total size: %" PRIi64 " bytes)\n",
5767 levelMeta.level, levelMeta.files.front().name.c_str(),
5768 levelMeta.files.back().name.c_str(), levelMeta.size);
5769 }
5770 }
5771 }
5772 for (size_t i = 0; i < num_db; i++) {
5773 db_list[i]->SetOptions(
5774 {{"disable_auto_compactions",
5775 std::to_string(options_list[i].disable_auto_compactions)},
5776 {"level0_slowdown_writes_trigger",
5777 std::to_string(options_list[i].level0_slowdown_writes_trigger)},
5778 {"level0_stop_writes_trigger",
5779 std::to_string(options_list[i].level0_stop_writes_trigger)}});
5780 }
5781 return Status::OK();
5782 #else
5783 (void)thread;
5784 (void)compaction_style;
5785 (void)write_mode;
5786 fprintf(stderr, "Rocksdb Lite doesn't support filldeterministic\n");
5787 return Status::NotSupported(
5788 "Rocksdb Lite doesn't support filldeterministic");
5789 #endif // ROCKSDB_LITE
5790 }
5791
5792 void ReadSequential(ThreadState* thread) {
5793 if (db_.db != nullptr) {
5794 ReadSequential(thread, db_.db);
5795 } else {
5796 for (const auto& db_with_cfh : multi_dbs_) {
5797 ReadSequential(thread, db_with_cfh.db);
5798 }
5799 }
5800 }
5801
5802 void ReadSequential(ThreadState* thread, DB* db) {
5803 ReadOptions options = read_options_;
5804 std::unique_ptr<char[]> ts_guard;
5805 Slice ts;
5806 if (user_timestamp_size_ > 0) {
5807 ts_guard.reset(new char[user_timestamp_size_]);
5808 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
5809 options.timestamp = &ts;
5810 }
5811
5812 options.adaptive_readahead = FLAGS_adaptive_readahead;
5813 options.async_io = FLAGS_async_io;
5814
5815 Iterator* iter = db->NewIterator(options);
5816 int64_t i = 0;
5817 int64_t bytes = 0;
5818 for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
5819 bytes += iter->key().size() + iter->value().size();
5820 thread->stats.FinishedOps(nullptr, db, 1, kRead);
5821 ++i;
5822
5823 if (thread->shared->read_rate_limiter.get() != nullptr &&
5824 i % 1024 == 1023) {
5825 thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
5826 nullptr /* stats */,
5827 RateLimiter::OpType::kRead);
5828 }
5829 }
5830
5831 delete iter;
5832 thread->stats.AddBytes(bytes);
5833 }
5834
5835 void ReadToRowCache(ThreadState* thread) {
5836 int64_t read = 0;
5837 int64_t found = 0;
5838 int64_t bytes = 0;
5839 int64_t key_rand = 0;
5840 std::unique_ptr<const char[]> key_guard;
5841 Slice key = AllocateKey(&key_guard);
5842 PinnableSlice pinnable_val;
5843
5844 while (key_rand < FLAGS_num) {
5845 DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
5846 // We use same key_rand as seed for key and column family so that we can
5847 // deterministically find the cfh corresponding to a particular key, as it
5848 // is done in DoWrite method.
5849 GenerateKeyFromInt(key_rand, FLAGS_num, &key);
5850 key_rand++;
5851 read++;
5852 Status s;
5853 if (FLAGS_num_column_families > 1) {
5854 s = db_with_cfh->db->Get(read_options_, db_with_cfh->GetCfh(key_rand),
5855 key, &pinnable_val);
5856 } else {
5857 pinnable_val.Reset();
5858 s = db_with_cfh->db->Get(read_options_,
5859 db_with_cfh->db->DefaultColumnFamily(), key,
5860 &pinnable_val);
5861 }
5862
5863 if (s.ok()) {
5864 found++;
5865 bytes += key.size() + pinnable_val.size();
5866 } else if (!s.IsNotFound()) {
5867 fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
5868 abort();
5869 }
5870
5871 if (thread->shared->read_rate_limiter.get() != nullptr &&
5872 read % 256 == 255) {
5873 thread->shared->read_rate_limiter->Request(
5874 256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
5875 }
5876
5877 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
5878 }
5879
5880 char msg[100];
5881 snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
5882 read);
5883
5884 thread->stats.AddBytes(bytes);
5885 thread->stats.AddMessage(msg);
5886 }
5887
5888 void ReadReverse(ThreadState* thread) {
5889 if (db_.db != nullptr) {
5890 ReadReverse(thread, db_.db);
5891 } else {
5892 for (const auto& db_with_cfh : multi_dbs_) {
5893 ReadReverse(thread, db_with_cfh.db);
5894 }
5895 }
5896 }
5897
5898 void ReadReverse(ThreadState* thread, DB* db) {
5899 Iterator* iter = db->NewIterator(read_options_);
5900 int64_t i = 0;
5901 int64_t bytes = 0;
5902 for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
5903 bytes += iter->key().size() + iter->value().size();
5904 thread->stats.FinishedOps(nullptr, db, 1, kRead);
5905 ++i;
5906 if (thread->shared->read_rate_limiter.get() != nullptr &&
5907 i % 1024 == 1023) {
5908 thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
5909 nullptr /* stats */,
5910 RateLimiter::OpType::kRead);
5911 }
5912 }
5913 delete iter;
5914 thread->stats.AddBytes(bytes);
5915 }
5916
5917 void ReadRandomFast(ThreadState* thread) {
5918 int64_t read = 0;
5919 int64_t found = 0;
5920 int64_t nonexist = 0;
5921 ReadOptions options = read_options_;
5922 std::unique_ptr<const char[]> key_guard;
5923 Slice key = AllocateKey(&key_guard);
5924 std::string value;
5925 Slice ts;
5926 std::unique_ptr<char[]> ts_guard;
5927 if (user_timestamp_size_ > 0) {
5928 ts_guard.reset(new char[user_timestamp_size_]);
5929 }
5930 DB* db = SelectDBWithCfh(thread)->db;
5931
5932 int64_t pot = 1;
5933 while (pot < FLAGS_num) {
5934 pot <<= 1;
5935 }
5936
5937 Duration duration(FLAGS_duration, reads_);
5938 do {
5939 for (int i = 0; i < 100; ++i) {
5940 int64_t key_rand = thread->rand.Next() & (pot - 1);
5941 GenerateKeyFromInt(key_rand, FLAGS_num, &key);
5942 ++read;
5943 std::string ts_ret;
5944 std::string* ts_ptr = nullptr;
5945 if (user_timestamp_size_ > 0) {
5946 ts = mock_app_clock_->GetTimestampForRead(thread->rand,
5947 ts_guard.get());
5948 options.timestamp = &ts;
5949 ts_ptr = &ts_ret;
5950 }
5951 auto status = db->Get(options, key, &value, ts_ptr);
5952 if (status.ok()) {
5953 ++found;
5954 } else if (!status.IsNotFound()) {
5955 fprintf(stderr, "Get returned an error: %s\n",
5956 status.ToString().c_str());
5957 abort();
5958 }
5959 if (key_rand >= FLAGS_num) {
5960 ++nonexist;
5961 }
5962 }
5963 if (thread->shared->read_rate_limiter.get() != nullptr) {
5964 thread->shared->read_rate_limiter->Request(
5965 100, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
5966 }
5967
5968 thread->stats.FinishedOps(nullptr, db, 100, kRead);
5969 } while (!duration.Done(100));
5970
5971 char msg[100];
5972 snprintf(msg, sizeof(msg),
5973 "(%" PRIu64 " of %" PRIu64
5974 " found, "
5975 "issued %" PRIu64 " non-exist keys)\n",
5976 found, read, nonexist);
5977
5978 thread->stats.AddMessage(msg);
5979 }
5980
5981 int64_t GetRandomKey(Random64* rand) {
5982 uint64_t rand_int = rand->Next();
5983 int64_t key_rand;
5984 if (read_random_exp_range_ == 0) {
5985 key_rand = rand_int % FLAGS_num;
5986 } else {
5987 const uint64_t kBigInt = static_cast<uint64_t>(1U) << 62;
5988 long double order = -static_cast<long double>(rand_int % kBigInt) /
5989 static_cast<long double>(kBigInt) *
5990 read_random_exp_range_;
5991 long double exp_ran = std::exp(order);
5992 uint64_t rand_num =
5993 static_cast<int64_t>(exp_ran * static_cast<long double>(FLAGS_num));
5994 // Map to a different number to avoid locality.
5995 const uint64_t kBigPrime = 0x5bd1e995;
5996 // Overflow is like %(2^64). Will have little impact of results.
5997 key_rand = static_cast<int64_t>((rand_num * kBigPrime) % FLAGS_num);
5998 }
5999 return key_rand;
6000 }
6001
6002 void ReadRandom(ThreadState* thread) {
6003 int64_t read = 0;
6004 int64_t found = 0;
6005 int64_t bytes = 0;
6006 int num_keys = 0;
6007 int64_t key_rand = 0;
6008 ReadOptions options = read_options_;
6009 std::unique_ptr<const char[]> key_guard;
6010 Slice key = AllocateKey(&key_guard);
6011 PinnableSlice pinnable_val;
6012 std::vector<PinnableSlice> pinnable_vals;
6013 if (read_operands_) {
6014 // Start off with a small-ish value that'll be increased later if
6015 // `GetMergeOperands()` tells us it is not large enough.
6016 pinnable_vals.resize(8);
6017 }
6018 std::unique_ptr<char[]> ts_guard;
6019 Slice ts;
6020 if (user_timestamp_size_ > 0) {
6021 ts_guard.reset(new char[user_timestamp_size_]);
6022 }
6023
6024 Duration duration(FLAGS_duration, reads_);
6025 while (!duration.Done(1)) {
6026 DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
6027 // We use same key_rand as seed for key and column family so that we can
6028 // deterministically find the cfh corresponding to a particular key, as it
6029 // is done in DoWrite method.
6030 if (entries_per_batch_ > 1 && FLAGS_multiread_stride) {
6031 if (++num_keys == entries_per_batch_) {
6032 num_keys = 0;
6033 key_rand = GetRandomKey(&thread->rand);
6034 if ((key_rand + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
6035 FLAGS_num) {
6036 key_rand = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
6037 }
6038 } else {
6039 key_rand += FLAGS_multiread_stride;
6040 }
6041 } else {
6042 key_rand = GetRandomKey(&thread->rand);
6043 }
6044 GenerateKeyFromInt(key_rand, FLAGS_num, &key);
6045 read++;
6046 std::string ts_ret;
6047 std::string* ts_ptr = nullptr;
6048 if (user_timestamp_size_ > 0) {
6049 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
6050 options.timestamp = &ts;
6051 ts_ptr = &ts_ret;
6052 }
6053 Status s;
6054 pinnable_val.Reset();
6055 for (size_t i = 0; i < pinnable_vals.size(); ++i) {
6056 pinnable_vals[i].Reset();
6057 }
6058 ColumnFamilyHandle* cfh;
6059 if (FLAGS_num_column_families > 1) {
6060 cfh = db_with_cfh->GetCfh(key_rand);
6061 } else {
6062 cfh = db_with_cfh->db->DefaultColumnFamily();
6063 }
6064 if (read_operands_) {
6065 GetMergeOperandsOptions get_merge_operands_options;
6066 get_merge_operands_options.expected_max_number_of_operands =
6067 static_cast<int>(pinnable_vals.size());
6068 int number_of_operands;
6069 s = db_with_cfh->db->GetMergeOperands(
6070 options, cfh, key, pinnable_vals.data(),
6071 &get_merge_operands_options, &number_of_operands);
6072 if (s.IsIncomplete()) {
6073 // Should only happen a few times when we encounter a key that had
6074 // more merge operands than any key seen so far. Production use case
6075 // would typically retry in such event to get all the operands so do
6076 // that here.
6077 pinnable_vals.resize(number_of_operands);
6078 get_merge_operands_options.expected_max_number_of_operands =
6079 static_cast<int>(pinnable_vals.size());
6080 s = db_with_cfh->db->GetMergeOperands(
6081 options, cfh, key, pinnable_vals.data(),
6082 &get_merge_operands_options, &number_of_operands);
6083 }
6084 } else {
6085 s = db_with_cfh->db->Get(options, cfh, key, &pinnable_val, ts_ptr);
6086 }
6087
6088 if (s.ok()) {
6089 found++;
6090 bytes += key.size() + pinnable_val.size() + user_timestamp_size_;
6091 for (size_t i = 0; i < pinnable_vals.size(); ++i) {
6092 bytes += pinnable_vals[i].size();
6093 pinnable_vals[i].Reset();
6094 }
6095 } else if (!s.IsNotFound()) {
6096 fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
6097 abort();
6098 }
6099
6100 if (thread->shared->read_rate_limiter.get() != nullptr &&
6101 read % 256 == 255) {
6102 thread->shared->read_rate_limiter->Request(
6103 256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
6104 }
6105
6106 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
6107 }
6108
6109 char msg[100];
6110 snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
6111 read);
6112
6113 thread->stats.AddBytes(bytes);
6114 thread->stats.AddMessage(msg);
6115 }
6116
6117 // Calls MultiGet over a list of keys from a random distribution.
6118 // Returns the total number of keys found.
6119 void MultiReadRandom(ThreadState* thread) {
6120 int64_t read = 0;
6121 int64_t bytes = 0;
6122 int64_t num_multireads = 0;
6123 int64_t found = 0;
6124 ReadOptions options = read_options_;
6125 std::vector<Slice> keys;
6126 std::vector<std::unique_ptr<const char[]>> key_guards;
6127 std::vector<std::string> values(entries_per_batch_);
6128 PinnableSlice* pin_values = new PinnableSlice[entries_per_batch_];
6129 std::unique_ptr<PinnableSlice[]> pin_values_guard(pin_values);
6130 std::vector<Status> stat_list(entries_per_batch_);
6131 while (static_cast<int64_t>(keys.size()) < entries_per_batch_) {
6132 key_guards.push_back(std::unique_ptr<const char[]>());
6133 keys.push_back(AllocateKey(&key_guards.back()));
6134 }
6135
6136 std::unique_ptr<char[]> ts_guard;
6137 if (user_timestamp_size_ > 0) {
6138 ts_guard.reset(new char[user_timestamp_size_]);
6139 }
6140
6141 Duration duration(FLAGS_duration, reads_);
6142 while (!duration.Done(entries_per_batch_)) {
6143 DB* db = SelectDB(thread);
6144 if (FLAGS_multiread_stride) {
6145 int64_t key = GetRandomKey(&thread->rand);
6146 if ((key + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
6147 static_cast<int64_t>(FLAGS_num)) {
6148 key = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
6149 }
6150 for (int64_t i = 0; i < entries_per_batch_; ++i) {
6151 GenerateKeyFromInt(key, FLAGS_num, &keys[i]);
6152 key += FLAGS_multiread_stride;
6153 }
6154 } else {
6155 for (int64_t i = 0; i < entries_per_batch_; ++i) {
6156 GenerateKeyFromInt(GetRandomKey(&thread->rand), FLAGS_num, &keys[i]);
6157 }
6158 }
6159 Slice ts;
6160 if (user_timestamp_size_ > 0) {
6161 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
6162 options.timestamp = &ts;
6163 }
6164 if (!FLAGS_multiread_batched) {
6165 std::vector<Status> statuses = db->MultiGet(options, keys, &values);
6166 assert(static_cast<int64_t>(statuses.size()) == entries_per_batch_);
6167
6168 read += entries_per_batch_;
6169 num_multireads++;
6170 for (int64_t i = 0; i < entries_per_batch_; ++i) {
6171 if (statuses[i].ok()) {
6172 bytes += keys[i].size() + values[i].size() + user_timestamp_size_;
6173 ++found;
6174 } else if (!statuses[i].IsNotFound()) {
6175 fprintf(stderr, "MultiGet returned an error: %s\n",
6176 statuses[i].ToString().c_str());
6177 abort();
6178 }
6179 }
6180 } else {
6181 db->MultiGet(options, db->DefaultColumnFamily(), keys.size(),
6182 keys.data(), pin_values, stat_list.data());
6183
6184 read += entries_per_batch_;
6185 num_multireads++;
6186 for (int64_t i = 0; i < entries_per_batch_; ++i) {
6187 if (stat_list[i].ok()) {
6188 bytes +=
6189 keys[i].size() + pin_values[i].size() + user_timestamp_size_;
6190 ++found;
6191 } else if (!stat_list[i].IsNotFound()) {
6192 fprintf(stderr, "MultiGet returned an error: %s\n",
6193 stat_list[i].ToString().c_str());
6194 abort();
6195 }
6196 stat_list[i] = Status::OK();
6197 pin_values[i].Reset();
6198 }
6199 }
6200 if (thread->shared->read_rate_limiter.get() != nullptr &&
6201 num_multireads % 256 == 255) {
6202 thread->shared->read_rate_limiter->Request(
6203 256 * entries_per_batch_, Env::IO_HIGH, nullptr /* stats */,
6204 RateLimiter::OpType::kRead);
6205 }
6206 thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kRead);
6207 }
6208
6209 char msg[100];
6210 snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found,
6211 read);
6212 thread->stats.AddBytes(bytes);
6213 thread->stats.AddMessage(msg);
6214 }
6215
6216 // Calls ApproximateSize over random key ranges.
6217 void ApproximateSizeRandom(ThreadState* thread) {
6218 int64_t size_sum = 0;
6219 int64_t num_sizes = 0;
6220 const size_t batch_size = entries_per_batch_;
6221 std::vector<Range> ranges;
6222 std::vector<Slice> lkeys;
6223 std::vector<std::unique_ptr<const char[]>> lkey_guards;
6224 std::vector<Slice> rkeys;
6225 std::vector<std::unique_ptr<const char[]>> rkey_guards;
6226 std::vector<uint64_t> sizes;
6227 while (ranges.size() < batch_size) {
6228 // Ugly without C++17 return from emplace_back
6229 lkey_guards.emplace_back();
6230 rkey_guards.emplace_back();
6231 lkeys.emplace_back(AllocateKey(&lkey_guards.back()));
6232 rkeys.emplace_back(AllocateKey(&rkey_guards.back()));
6233 ranges.emplace_back(lkeys.back(), rkeys.back());
6234 sizes.push_back(0);
6235 }
6236 Duration duration(FLAGS_duration, reads_);
6237 while (!duration.Done(1)) {
6238 DB* db = SelectDB(thread);
6239 for (size_t i = 0; i < batch_size; ++i) {
6240 int64_t lkey = GetRandomKey(&thread->rand);
6241 int64_t rkey = GetRandomKey(&thread->rand);
6242 if (lkey > rkey) {
6243 std::swap(lkey, rkey);
6244 }
6245 GenerateKeyFromInt(lkey, FLAGS_num, &lkeys[i]);
6246 GenerateKeyFromInt(rkey, FLAGS_num, &rkeys[i]);
6247 }
6248 db->GetApproximateSizes(&ranges[0], static_cast<int>(entries_per_batch_),
6249 &sizes[0]);
6250 num_sizes += entries_per_batch_;
6251 for (int64_t size : sizes) {
6252 size_sum += size;
6253 }
6254 thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kOthers);
6255 }
6256
6257 char msg[100];
6258 snprintf(msg, sizeof(msg), "(Avg approx size=%g)",
6259 static_cast<double>(size_sum) / static_cast<double>(num_sizes));
6260 thread->stats.AddMessage(msg);
6261 }
6262
6263 // The inverse function of Pareto distribution
6264 int64_t ParetoCdfInversion(double u, double theta, double k, double sigma) {
6265 double ret;
6266 if (k == 0.0) {
6267 ret = theta - sigma * std::log(u);
6268 } else {
6269 ret = theta + sigma * (std::pow(u, -1 * k) - 1) / k;
6270 }
6271 return static_cast<int64_t>(ceil(ret));
6272 }
6273 // The inverse function of power distribution (y=ax^b)
6274 int64_t PowerCdfInversion(double u, double a, double b) {
6275 double ret;
6276 ret = std::pow((u / a), (1 / b));
6277 return static_cast<int64_t>(ceil(ret));
6278 }
6279
6280 // Add the noice to the QPS
6281 double AddNoise(double origin, double noise_ratio) {
6282 if (noise_ratio < 0.0 || noise_ratio > 1.0) {
6283 return origin;
6284 }
6285 int band_int = static_cast<int>(FLAGS_sine_a);
6286 double delta = (rand() % band_int - band_int / 2) * noise_ratio;
6287 if (origin + delta < 0) {
6288 return origin;
6289 } else {
6290 return (origin + delta);
6291 }
6292 }
6293
6294 // Decide the ratio of different query types
6295 // 0 Get, 1 Put, 2 Seek, 3 SeekForPrev, 4 Delete, 5 SingleDelete, 6 merge
6296 class QueryDecider {
6297 public:
6298 std::vector<int> type_;
6299 std::vector<double> ratio_;
6300 int range_;
6301
6302 QueryDecider() {}
6303 ~QueryDecider() {}
6304
6305 Status Initiate(std::vector<double> ratio_input) {
6306 int range_max = 1000;
6307 double sum = 0.0;
6308 for (auto& ratio : ratio_input) {
6309 sum += ratio;
6310 }
6311 range_ = 0;
6312 for (auto& ratio : ratio_input) {
6313 range_ += static_cast<int>(ceil(range_max * (ratio / sum)));
6314 type_.push_back(range_);
6315 ratio_.push_back(ratio / sum);
6316 }
6317 return Status::OK();
6318 }
6319
6320 int GetType(int64_t rand_num) {
6321 if (rand_num < 0) {
6322 rand_num = rand_num * (-1);
6323 }
6324 assert(range_ != 0);
6325 int pos = static_cast<int>(rand_num % range_);
6326 for (int i = 0; i < static_cast<int>(type_.size()); i++) {
6327 if (pos < type_[i]) {
6328 return i;
6329 }
6330 }
6331 return 0;
6332 }
6333 };
6334
6335 // KeyrangeUnit is the struct of a keyrange. It is used in a keyrange vector
6336 // to transfer a random value to one keyrange based on the hotness.
6337 struct KeyrangeUnit {
6338 int64_t keyrange_start;
6339 int64_t keyrange_access;
6340 int64_t keyrange_keys;
6341 };
6342
6343 // From our observations, the prefix hotness (key-range hotness) follows
6344 // the two-term-exponential distribution: f(x) = a*exp(b*x) + c*exp(d*x).
6345 // However, we cannot directly use the inverse function to decide a
6346 // key-range from a random distribution. To achieve it, we create a list of
6347 // KeyrangeUnit, each KeyrangeUnit occupies a range of integers whose size is
6348 // decided based on the hotness of the key-range. When a random value is
6349 // generated based on uniform distribution, we map it to the KeyrangeUnit Vec
6350 // and one KeyrangeUnit is selected. The probability of a KeyrangeUnit being
6351 // selected is the same as the hotness of this KeyrangeUnit. After that, the
6352 // key can be randomly allocated to the key-range of this KeyrangeUnit, or we
6353 // can based on the power distribution (y=ax^b) to generate the offset of
6354 // the key in the selected key-range. In this way, we generate the keyID
6355 // based on the hotness of the prefix and also the key hotness distribution.
6356 class GenerateTwoTermExpKeys {
6357 public:
6358 // Avoid uninitialized warning-as-error in some compilers
6359 int64_t keyrange_rand_max_ = 0;
6360 int64_t keyrange_size_ = 0;
6361 int64_t keyrange_num_ = 0;
6362 std::vector<KeyrangeUnit> keyrange_set_;
6363
6364 // Initiate the KeyrangeUnit vector and calculate the size of each
6365 // KeyrangeUnit.
6366 Status InitiateExpDistribution(int64_t total_keys, double prefix_a,
6367 double prefix_b, double prefix_c,
6368 double prefix_d) {
6369 int64_t amplify = 0;
6370 int64_t keyrange_start = 0;
6371 if (FLAGS_keyrange_num <= 0) {
6372 keyrange_num_ = 1;
6373 } else {
6374 keyrange_num_ = FLAGS_keyrange_num;
6375 }
6376 keyrange_size_ = total_keys / keyrange_num_;
6377
6378 // Calculate the key-range shares size based on the input parameters
6379 for (int64_t pfx = keyrange_num_; pfx >= 1; pfx--) {
6380 // Step 1. Calculate the probability that this key range will be
6381 // accessed in a query. It is based on the two-term expoential
6382 // distribution
6383 double keyrange_p = prefix_a * std::exp(prefix_b * pfx) +
6384 prefix_c * std::exp(prefix_d * pfx);
6385 if (keyrange_p < std::pow(10.0, -16.0)) {
6386 keyrange_p = 0.0;
6387 }
6388 // Step 2. Calculate the amplify
6389 // In order to allocate a query to a key-range based on the random
6390 // number generated for this query, we need to extend the probability
6391 // of each key range from [0,1] to [0, amplify]. Amplify is calculated
6392 // by 1/(smallest key-range probability). In this way, we ensure that
6393 // all key-ranges are assigned with an Integer that >=0
6394 if (amplify == 0 && keyrange_p > 0) {
6395 amplify = static_cast<int64_t>(std::floor(1 / keyrange_p)) + 1;
6396 }
6397
6398 // Step 3. For each key-range, we calculate its position in the
6399 // [0, amplify] range, including the start, the size (keyrange_access)
6400 KeyrangeUnit p_unit;
6401 p_unit.keyrange_start = keyrange_start;
6402 if (0.0 >= keyrange_p) {
6403 p_unit.keyrange_access = 0;
6404 } else {
6405 p_unit.keyrange_access =
6406 static_cast<int64_t>(std::floor(amplify * keyrange_p));
6407 }
6408 p_unit.keyrange_keys = keyrange_size_;
6409 keyrange_set_.push_back(p_unit);
6410 keyrange_start += p_unit.keyrange_access;
6411 }
6412 keyrange_rand_max_ = keyrange_start;
6413
6414 // Step 4. Shuffle the key-ranges randomly
6415 // Since the access probability is calculated from small to large,
6416 // If we do not re-allocate them, hot key-ranges are always at the end
6417 // and cold key-ranges are at the begin of the key space. Therefore, the
6418 // key-ranges are shuffled and the rand seed is only decide by the
6419 // key-range hotness distribution. With the same distribution parameters
6420 // the shuffle results are the same.
6421 Random64 rand_loca(keyrange_rand_max_);
6422 for (int64_t i = 0; i < FLAGS_keyrange_num; i++) {
6423 int64_t pos = rand_loca.Next() % FLAGS_keyrange_num;
6424 assert(i >= 0 && i < static_cast<int64_t>(keyrange_set_.size()) &&
6425 pos >= 0 && pos < static_cast<int64_t>(keyrange_set_.size()));
6426 std::swap(keyrange_set_[i], keyrange_set_[pos]);
6427 }
6428
6429 // Step 5. Recalculate the prefix start postion after shuffling
6430 int64_t offset = 0;
6431 for (auto& p_unit : keyrange_set_) {
6432 p_unit.keyrange_start = offset;
6433 offset += p_unit.keyrange_access;
6434 }
6435
6436 return Status::OK();
6437 }
6438
6439 // Generate the Key ID according to the input ini_rand and key distribution
6440 int64_t DistGetKeyID(int64_t ini_rand, double key_dist_a,
6441 double key_dist_b) {
6442 int64_t keyrange_rand = ini_rand % keyrange_rand_max_;
6443
6444 // Calculate and select one key-range that contains the new key
6445 int64_t start = 0, end = static_cast<int64_t>(keyrange_set_.size());
6446 while (start + 1 < end) {
6447 int64_t mid = start + (end - start) / 2;
6448 assert(mid >= 0 && mid < static_cast<int64_t>(keyrange_set_.size()));
6449 if (keyrange_rand < keyrange_set_[mid].keyrange_start) {
6450 end = mid;
6451 } else {
6452 start = mid;
6453 }
6454 }
6455 int64_t keyrange_id = start;
6456
6457 // Select one key in the key-range and compose the keyID
6458 int64_t key_offset = 0, key_seed;
6459 if (key_dist_a == 0.0 || key_dist_b == 0.0) {
6460 key_offset = ini_rand % keyrange_size_;
6461 } else {
6462 double u =
6463 static_cast<double>(ini_rand % keyrange_size_) / keyrange_size_;
6464 key_seed = static_cast<int64_t>(
6465 ceil(std::pow((u / key_dist_a), (1 / key_dist_b))));
6466 Random64 rand_key(key_seed);
6467 key_offset = rand_key.Next() % keyrange_size_;
6468 }
6469 return keyrange_size_ * keyrange_id + key_offset;
6470 }
6471 };
6472
6473 // The social graph workload mixed with Get, Put, Iterator queries.
6474 // The value size and iterator length follow Pareto distribution.
6475 // The overall key access follow power distribution. If user models the
6476 // workload based on different key-ranges (or different prefixes), user
6477 // can use two-term-exponential distribution to fit the workload. User
6478 // needs to decide the ratio between Get, Put, Iterator queries before
6479 // starting the benchmark.
6480 void MixGraph(ThreadState* thread) {
6481 int64_t gets = 0;
6482 int64_t puts = 0;
6483 int64_t get_found = 0;
6484 int64_t seek = 0;
6485 int64_t seek_found = 0;
6486 int64_t bytes = 0;
6487 double total_scan_length = 0;
6488 double total_val_size = 0;
6489 const int64_t default_value_max = 1 * 1024 * 1024;
6490 int64_t value_max = default_value_max;
6491 int64_t scan_len_max = FLAGS_mix_max_scan_len;
6492 double write_rate = 1000000.0;
6493 double read_rate = 1000000.0;
6494 bool use_prefix_modeling = false;
6495 bool use_random_modeling = false;
6496 GenerateTwoTermExpKeys gen_exp;
6497 std::vector<double> ratio{FLAGS_mix_get_ratio, FLAGS_mix_put_ratio,
6498 FLAGS_mix_seek_ratio};
6499 char value_buffer[default_value_max];
6500 QueryDecider query;
6501 RandomGenerator gen;
6502 Status s;
6503 if (value_max > FLAGS_mix_max_value_size) {
6504 value_max = FLAGS_mix_max_value_size;
6505 }
6506
6507 std::unique_ptr<const char[]> key_guard;
6508 Slice key = AllocateKey(&key_guard);
6509 PinnableSlice pinnable_val;
6510 query.Initiate(ratio);
6511
6512 // the limit of qps initiation
6513 if (FLAGS_sine_mix_rate) {
6514 thread->shared->read_rate_limiter.reset(
6515 NewGenericRateLimiter(static_cast<int64_t>(read_rate)));
6516 thread->shared->write_rate_limiter.reset(
6517 NewGenericRateLimiter(static_cast<int64_t>(write_rate)));
6518 }
6519
6520 // Decide if user wants to use prefix based key generation
6521 if (FLAGS_keyrange_dist_a != 0.0 || FLAGS_keyrange_dist_b != 0.0 ||
6522 FLAGS_keyrange_dist_c != 0.0 || FLAGS_keyrange_dist_d != 0.0) {
6523 use_prefix_modeling = true;
6524 gen_exp.InitiateExpDistribution(
6525 FLAGS_num, FLAGS_keyrange_dist_a, FLAGS_keyrange_dist_b,
6526 FLAGS_keyrange_dist_c, FLAGS_keyrange_dist_d);
6527 }
6528 if (FLAGS_key_dist_a == 0 || FLAGS_key_dist_b == 0) {
6529 use_random_modeling = true;
6530 }
6531
6532 Duration duration(FLAGS_duration, reads_);
6533 while (!duration.Done(1)) {
6534 DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
6535 int64_t ini_rand, rand_v, key_rand, key_seed;
6536 ini_rand = GetRandomKey(&thread->rand);
6537 rand_v = ini_rand % FLAGS_num;
6538 double u = static_cast<double>(rand_v) / FLAGS_num;
6539
6540 // Generate the keyID based on the key hotness and prefix hotness
6541 if (use_random_modeling) {
6542 key_rand = ini_rand;
6543 } else if (use_prefix_modeling) {
6544 key_rand =
6545 gen_exp.DistGetKeyID(ini_rand, FLAGS_key_dist_a, FLAGS_key_dist_b);
6546 } else {
6547 key_seed = PowerCdfInversion(u, FLAGS_key_dist_a, FLAGS_key_dist_b);
6548 Random64 rand(key_seed);
6549 key_rand = static_cast<int64_t>(rand.Next()) % FLAGS_num;
6550 }
6551 GenerateKeyFromInt(key_rand, FLAGS_num, &key);
6552 int query_type = query.GetType(rand_v);
6553
6554 // change the qps
6555 uint64_t now = FLAGS_env->NowMicros();
6556 uint64_t usecs_since_last;
6557 if (now > thread->stats.GetSineInterval()) {
6558 usecs_since_last = now - thread->stats.GetSineInterval();
6559 } else {
6560 usecs_since_last = 0;
6561 }
6562
6563 if (FLAGS_sine_mix_rate &&
6564 usecs_since_last >
6565 (FLAGS_sine_mix_rate_interval_milliseconds * uint64_t{1000})) {
6566 double usecs_since_start =
6567 static_cast<double>(now - thread->stats.GetStart());
6568 thread->stats.ResetSineInterval();
6569 double mix_rate_with_noise = AddNoise(
6570 SineRate(usecs_since_start / 1000000.0), FLAGS_sine_mix_rate_noise);
6571 read_rate = mix_rate_with_noise * (query.ratio_[0] + query.ratio_[2]);
6572 write_rate = mix_rate_with_noise * query.ratio_[1];
6573
6574 if (read_rate > 0) {
6575 thread->shared->read_rate_limiter->SetBytesPerSecond(
6576 static_cast<int64_t>(read_rate));
6577 }
6578 if (write_rate > 0) {
6579 thread->shared->write_rate_limiter->SetBytesPerSecond(
6580 static_cast<int64_t>(write_rate));
6581 }
6582 }
6583 // Start the query
6584 if (query_type == 0) {
6585 // the Get query
6586 gets++;
6587 if (FLAGS_num_column_families > 1) {
6588 s = db_with_cfh->db->Get(read_options_, db_with_cfh->GetCfh(key_rand),
6589 key, &pinnable_val);
6590 } else {
6591 pinnable_val.Reset();
6592 s = db_with_cfh->db->Get(read_options_,
6593 db_with_cfh->db->DefaultColumnFamily(), key,
6594 &pinnable_val);
6595 }
6596
6597 if (s.ok()) {
6598 get_found++;
6599 bytes += key.size() + pinnable_val.size();
6600 } else if (!s.IsNotFound()) {
6601 fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
6602 abort();
6603 }
6604
6605 if (thread->shared->read_rate_limiter && (gets + seek) % 100 == 0) {
6606 thread->shared->read_rate_limiter->Request(100, Env::IO_HIGH,
6607 nullptr /*stats*/);
6608 }
6609 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
6610 } else if (query_type == 1) {
6611 // the Put query
6612 puts++;
6613 int64_t val_size = ParetoCdfInversion(u, FLAGS_value_theta,
6614 FLAGS_value_k, FLAGS_value_sigma);
6615 if (val_size < 10) {
6616 val_size = 10;
6617 } else if (val_size > value_max) {
6618 val_size = val_size % value_max;
6619 }
6620 total_val_size += val_size;
6621
6622 s = db_with_cfh->db->Put(
6623 write_options_, key,
6624 gen.Generate(static_cast<unsigned int>(val_size)));
6625 if (!s.ok()) {
6626 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
6627 ErrorExit();
6628 }
6629
6630 if (thread->shared->write_rate_limiter && puts % 100 == 0) {
6631 thread->shared->write_rate_limiter->Request(100, Env::IO_HIGH,
6632 nullptr /*stats*/);
6633 }
6634 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kWrite);
6635 } else if (query_type == 2) {
6636 // Seek query
6637 if (db_with_cfh->db != nullptr) {
6638 Iterator* single_iter = nullptr;
6639 single_iter = db_with_cfh->db->NewIterator(read_options_);
6640 if (single_iter != nullptr) {
6641 single_iter->Seek(key);
6642 seek++;
6643 if (single_iter->Valid() && single_iter->key().compare(key) == 0) {
6644 seek_found++;
6645 }
6646 int64_t scan_length =
6647 ParetoCdfInversion(u, FLAGS_iter_theta, FLAGS_iter_k,
6648 FLAGS_iter_sigma) %
6649 scan_len_max;
6650 for (int64_t j = 0; j < scan_length && single_iter->Valid(); j++) {
6651 Slice value = single_iter->value();
6652 memcpy(value_buffer, value.data(),
6653 std::min(value.size(), sizeof(value_buffer)));
6654 bytes += single_iter->key().size() + single_iter->value().size();
6655 single_iter->Next();
6656 assert(single_iter->status().ok());
6657 total_scan_length++;
6658 }
6659 }
6660 delete single_iter;
6661 }
6662 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kSeek);
6663 }
6664 }
6665 char msg[256];
6666 snprintf(msg, sizeof(msg),
6667 "( Gets:%" PRIu64 " Puts:%" PRIu64 " Seek:%" PRIu64
6668 ", reads %" PRIu64 " in %" PRIu64
6669 " found, "
6670 "avg size: %.1f value, %.1f scan)\n",
6671 gets, puts, seek, get_found + seek_found, gets + seek,
6672 total_val_size / puts, total_scan_length / seek);
6673
6674 thread->stats.AddBytes(bytes);
6675 thread->stats.AddMessage(msg);
6676 }
6677
6678 void IteratorCreation(ThreadState* thread) {
6679 Duration duration(FLAGS_duration, reads_);
6680 ReadOptions options = read_options_;
6681 std::unique_ptr<char[]> ts_guard;
6682 if (user_timestamp_size_ > 0) {
6683 ts_guard.reset(new char[user_timestamp_size_]);
6684 }
6685 while (!duration.Done(1)) {
6686 DB* db = SelectDB(thread);
6687 Slice ts;
6688 if (user_timestamp_size_ > 0) {
6689 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
6690 options.timestamp = &ts;
6691 }
6692 Iterator* iter = db->NewIterator(options);
6693 delete iter;
6694 thread->stats.FinishedOps(nullptr, db, 1, kOthers);
6695 }
6696 }
6697
6698 void IteratorCreationWhileWriting(ThreadState* thread) {
6699 if (thread->tid > 0) {
6700 IteratorCreation(thread);
6701 } else {
6702 BGWriter(thread, kWrite);
6703 }
6704 }
6705
6706 void SeekRandom(ThreadState* thread) {
6707 int64_t read = 0;
6708 int64_t found = 0;
6709 int64_t bytes = 0;
6710 ReadOptions options = read_options_;
6711 std::unique_ptr<char[]> ts_guard;
6712 Slice ts;
6713 if (user_timestamp_size_ > 0) {
6714 ts_guard.reset(new char[user_timestamp_size_]);
6715 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
6716 options.timestamp = &ts;
6717 }
6718
6719 std::vector<Iterator*> tailing_iters;
6720 if (FLAGS_use_tailing_iterator) {
6721 if (db_.db != nullptr) {
6722 tailing_iters.push_back(db_.db->NewIterator(options));
6723 } else {
6724 for (const auto& db_with_cfh : multi_dbs_) {
6725 tailing_iters.push_back(db_with_cfh.db->NewIterator(options));
6726 }
6727 }
6728 }
6729 options.auto_prefix_mode = FLAGS_auto_prefix_mode;
6730
6731 std::unique_ptr<const char[]> key_guard;
6732 Slice key = AllocateKey(&key_guard);
6733
6734 std::unique_ptr<const char[]> upper_bound_key_guard;
6735 Slice upper_bound = AllocateKey(&upper_bound_key_guard);
6736 std::unique_ptr<const char[]> lower_bound_key_guard;
6737 Slice lower_bound = AllocateKey(&lower_bound_key_guard);
6738
6739 Duration duration(FLAGS_duration, reads_);
6740 char value_buffer[256];
6741 while (!duration.Done(1)) {
6742 int64_t seek_pos = thread->rand.Next() % FLAGS_num;
6743 GenerateKeyFromIntForSeek(static_cast<uint64_t>(seek_pos), FLAGS_num,
6744 &key);
6745 if (FLAGS_max_scan_distance != 0) {
6746 if (FLAGS_reverse_iterator) {
6747 GenerateKeyFromInt(
6748 static_cast<uint64_t>(std::max(
6749 static_cast<int64_t>(0), seek_pos - FLAGS_max_scan_distance)),
6750 FLAGS_num, &lower_bound);
6751 options.iterate_lower_bound = &lower_bound;
6752 } else {
6753 auto min_num =
6754 std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance);
6755 GenerateKeyFromInt(static_cast<uint64_t>(min_num), FLAGS_num,
6756 &upper_bound);
6757 options.iterate_upper_bound = &upper_bound;
6758 }
6759 } else if (FLAGS_auto_prefix_mode && prefix_extractor_ &&
6760 !FLAGS_reverse_iterator) {
6761 // Set upper bound to next prefix
6762 auto mutable_upper_bound = const_cast<char*>(upper_bound.data());
6763 std::memcpy(mutable_upper_bound, key.data(), prefix_size_);
6764 mutable_upper_bound[prefix_size_ - 1]++;
6765 upper_bound = Slice(upper_bound.data(), prefix_size_);
6766 options.iterate_upper_bound = &upper_bound;
6767 }
6768
6769 // Pick a Iterator to use
6770 uint64_t db_idx_to_use =
6771 (db_.db == nullptr)
6772 ? (uint64_t{thread->rand.Next()} % multi_dbs_.size())
6773 : 0;
6774 std::unique_ptr<Iterator> single_iter;
6775 Iterator* iter_to_use;
6776 if (FLAGS_use_tailing_iterator) {
6777 iter_to_use = tailing_iters[db_idx_to_use];
6778 } else {
6779 if (db_.db != nullptr) {
6780 single_iter.reset(db_.db->NewIterator(options));
6781 } else {
6782 single_iter.reset(multi_dbs_[db_idx_to_use].db->NewIterator(options));
6783 }
6784 iter_to_use = single_iter.get();
6785 }
6786
6787 iter_to_use->Seek(key);
6788 read++;
6789 if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) {
6790 found++;
6791 }
6792
6793 for (int j = 0; j < FLAGS_seek_nexts && iter_to_use->Valid(); ++j) {
6794 // Copy out iterator's value to make sure we read them.
6795 Slice value = iter_to_use->value();
6796 memcpy(value_buffer, value.data(),
6797 std::min(value.size(), sizeof(value_buffer)));
6798 bytes += iter_to_use->key().size() + iter_to_use->value().size();
6799
6800 if (!FLAGS_reverse_iterator) {
6801 iter_to_use->Next();
6802 } else {
6803 iter_to_use->Prev();
6804 }
6805 assert(iter_to_use->status().ok());
6806 }
6807
6808 if (thread->shared->read_rate_limiter.get() != nullptr &&
6809 read % 256 == 255) {
6810 thread->shared->read_rate_limiter->Request(
6811 256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
6812 }
6813
6814 thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
6815 }
6816 for (auto iter : tailing_iters) {
6817 delete iter;
6818 }
6819
6820 char msg[100];
6821 snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
6822 read);
6823 thread->stats.AddBytes(bytes);
6824 thread->stats.AddMessage(msg);
6825 }
6826
6827 void SeekRandomWhileWriting(ThreadState* thread) {
6828 if (thread->tid > 0) {
6829 SeekRandom(thread);
6830 } else {
6831 BGWriter(thread, kWrite);
6832 }
6833 }
6834
6835 void SeekRandomWhileMerging(ThreadState* thread) {
6836 if (thread->tid > 0) {
6837 SeekRandom(thread);
6838 } else {
6839 BGWriter(thread, kMerge);
6840 }
6841 }
6842
6843 void DoDelete(ThreadState* thread, bool seq) {
6844 WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
6845 FLAGS_write_batch_protection_bytes_per_key,
6846 user_timestamp_size_);
6847 Duration duration(seq ? 0 : FLAGS_duration, deletes_);
6848 int64_t i = 0;
6849 std::unique_ptr<const char[]> key_guard;
6850 Slice key = AllocateKey(&key_guard);
6851 std::unique_ptr<char[]> ts_guard;
6852 Slice ts;
6853 if (user_timestamp_size_ > 0) {
6854 ts_guard.reset(new char[user_timestamp_size_]);
6855 }
6856
6857 while (!duration.Done(entries_per_batch_)) {
6858 DB* db = SelectDB(thread);
6859 batch.Clear();
6860 for (int64_t j = 0; j < entries_per_batch_; ++j) {
6861 const int64_t k = seq ? i + j : (thread->rand.Next() % FLAGS_num);
6862 GenerateKeyFromInt(k, FLAGS_num, &key);
6863 batch.Delete(key);
6864 }
6865 Status s;
6866 if (user_timestamp_size_ > 0) {
6867 ts = mock_app_clock_->Allocate(ts_guard.get());
6868 s = batch.UpdateTimestamps(
6869 ts, [this](uint32_t) { return user_timestamp_size_; });
6870 if (!s.ok()) {
6871 fprintf(stderr, "assign timestamp: %s\n", s.ToString().c_str());
6872 ErrorExit();
6873 }
6874 }
6875 s = db->Write(write_options_, &batch);
6876 thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kDelete);
6877 if (!s.ok()) {
6878 fprintf(stderr, "del error: %s\n", s.ToString().c_str());
6879 exit(1);
6880 }
6881 i += entries_per_batch_;
6882 }
6883 }
6884
6885 void DeleteSeq(ThreadState* thread) { DoDelete(thread, true); }
6886
6887 void DeleteRandom(ThreadState* thread) { DoDelete(thread, false); }
6888
6889 void ReadWhileWriting(ThreadState* thread) {
6890 if (thread->tid > 0) {
6891 ReadRandom(thread);
6892 } else {
6893 BGWriter(thread, kWrite);
6894 }
6895 }
6896
6897 void MultiReadWhileWriting(ThreadState* thread) {
6898 if (thread->tid > 0) {
6899 MultiReadRandom(thread);
6900 } else {
6901 BGWriter(thread, kWrite);
6902 }
6903 }
6904
6905 void ReadWhileMerging(ThreadState* thread) {
6906 if (thread->tid > 0) {
6907 ReadRandom(thread);
6908 } else {
6909 BGWriter(thread, kMerge);
6910 }
6911 }
6912
6913 void BGWriter(ThreadState* thread, enum OperationType write_merge) {
6914 // Special thread that keeps writing until other threads are done.
6915 RandomGenerator gen;
6916 int64_t bytes = 0;
6917
6918 std::unique_ptr<RateLimiter> write_rate_limiter;
6919 if (FLAGS_benchmark_write_rate_limit > 0) {
6920 write_rate_limiter.reset(
6921 NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
6922 }
6923
6924 // Don't merge stats from this thread with the readers.
6925 thread->stats.SetExcludeFromMerge();
6926
6927 std::unique_ptr<const char[]> key_guard;
6928 Slice key = AllocateKey(&key_guard);
6929 std::unique_ptr<char[]> ts_guard;
6930 std::unique_ptr<const char[]> begin_key_guard;
6931 Slice begin_key = AllocateKey(&begin_key_guard);
6932 std::unique_ptr<const char[]> end_key_guard;
6933 Slice end_key = AllocateKey(&end_key_guard);
6934 uint64_t num_range_deletions = 0;
6935 std::vector<std::unique_ptr<const char[]>> expanded_key_guards;
6936 std::vector<Slice> expanded_keys;
6937 if (FLAGS_expand_range_tombstones) {
6938 expanded_key_guards.resize(range_tombstone_width_);
6939 for (auto& expanded_key_guard : expanded_key_guards) {
6940 expanded_keys.emplace_back(AllocateKey(&expanded_key_guard));
6941 }
6942 }
6943 if (user_timestamp_size_ > 0) {
6944 ts_guard.reset(new char[user_timestamp_size_]);
6945 }
6946 uint32_t written = 0;
6947 bool hint_printed = false;
6948
6949 while (true) {
6950 DB* db = SelectDB(thread);
6951 {
6952 MutexLock l(&thread->shared->mu);
6953 if (FLAGS_finish_after_writes && written == writes_) {
6954 fprintf(stderr, "Exiting the writer after %u writes...\n", written);
6955 break;
6956 }
6957 if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
6958 // Other threads have finished
6959 if (FLAGS_finish_after_writes) {
6960 // Wait for the writes to be finished
6961 if (!hint_printed) {
6962 fprintf(stderr, "Reads are finished. Have %d more writes to do\n",
6963 static_cast<int>(writes_) - written);
6964 hint_printed = true;
6965 }
6966 } else {
6967 // Finish the write immediately
6968 break;
6969 }
6970 }
6971 }
6972
6973 GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
6974 Status s;
6975
6976 Slice val = gen.Generate();
6977 Slice ts;
6978 if (user_timestamp_size_ > 0) {
6979 ts = mock_app_clock_->Allocate(ts_guard.get());
6980 }
6981 if (write_merge == kWrite) {
6982 if (user_timestamp_size_ == 0) {
6983 s = db->Put(write_options_, key, val);
6984 } else {
6985 s = db->Put(write_options_, key, ts, val);
6986 }
6987 } else {
6988 s = db->Merge(write_options_, key, val);
6989 }
6990 // Restore write_options_
6991 written++;
6992
6993 if (!s.ok()) {
6994 fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
6995 exit(1);
6996 }
6997 bytes += key.size() + val.size() + user_timestamp_size_;
6998 thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
6999
7000 if (FLAGS_benchmark_write_rate_limit > 0) {
7001 write_rate_limiter->Request(key.size() + val.size(), Env::IO_HIGH,
7002 nullptr /* stats */,
7003 RateLimiter::OpType::kWrite);
7004 }
7005
7006 if (writes_per_range_tombstone_ > 0 &&
7007 written > writes_before_delete_range_ &&
7008 (written - writes_before_delete_range_) /
7009 writes_per_range_tombstone_ <=
7010 max_num_range_tombstones_ &&
7011 (written - writes_before_delete_range_) %
7012 writes_per_range_tombstone_ ==
7013 0) {
7014 num_range_deletions++;
7015 int64_t begin_num = thread->rand.Next() % FLAGS_num;
7016 if (FLAGS_expand_range_tombstones) {
7017 for (int64_t offset = 0; offset < range_tombstone_width_; ++offset) {
7018 GenerateKeyFromInt(begin_num + offset, FLAGS_num,
7019 &expanded_keys[offset]);
7020 if (!db->Delete(write_options_, expanded_keys[offset]).ok()) {
7021 fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
7022 exit(1);
7023 }
7024 }
7025 } else {
7026 GenerateKeyFromInt(begin_num, FLAGS_num, &begin_key);
7027 GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num,
7028 &end_key);
7029 if (!db->DeleteRange(write_options_, db->DefaultColumnFamily(),
7030 begin_key, end_key)
7031 .ok()) {
7032 fprintf(stderr, "deleterange error: %s\n", s.ToString().c_str());
7033 exit(1);
7034 }
7035 }
7036 thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
7037 // TODO: DeleteRange is not included in calculcation of bytes/rate
7038 // limiter request
7039 }
7040 }
7041 if (num_range_deletions > 0) {
7042 std::cout << "Number of range deletions: " << num_range_deletions
7043 << std::endl;
7044 }
7045 thread->stats.AddBytes(bytes);
7046 }
7047
7048 void ReadWhileScanning(ThreadState* thread) {
7049 if (thread->tid > 0) {
7050 ReadRandom(thread);
7051 } else {
7052 BGScan(thread);
7053 }
7054 }
7055
7056 void BGScan(ThreadState* thread) {
7057 if (FLAGS_num_multi_db > 0) {
7058 fprintf(stderr, "Not supporting multiple DBs.\n");
7059 abort();
7060 }
7061 assert(db_.db != nullptr);
7062 ReadOptions read_options = read_options_;
7063 std::unique_ptr<char[]> ts_guard;
7064 Slice ts;
7065 if (user_timestamp_size_ > 0) {
7066 ts_guard.reset(new char[user_timestamp_size_]);
7067 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
7068 read_options.timestamp = &ts;
7069 }
7070 Iterator* iter = db_.db->NewIterator(read_options);
7071
7072 fprintf(stderr, "num reads to do %" PRIu64 "\n", reads_);
7073 Duration duration(FLAGS_duration, reads_);
7074 uint64_t num_seek_to_first = 0;
7075 uint64_t num_next = 0;
7076 while (!duration.Done(1)) {
7077 if (!iter->Valid()) {
7078 iter->SeekToFirst();
7079 num_seek_to_first++;
7080 } else if (!iter->status().ok()) {
7081 fprintf(stderr, "Iterator error: %s\n",
7082 iter->status().ToString().c_str());
7083 abort();
7084 } else {
7085 iter->Next();
7086 num_next++;
7087 }
7088
7089 thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
7090 }
7091 delete iter;
7092 }
7093
7094 // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
7095 // in DB atomically i.e in a single batch. Also refer GetMany.
7096 Status PutMany(DB* db, const WriteOptions& writeoptions, const Slice& key,
7097 const Slice& value) {
7098 std::string suffixes[3] = {"2", "1", "0"};
7099 std::string keys[3];
7100
7101 WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
7102 FLAGS_write_batch_protection_bytes_per_key,
7103 user_timestamp_size_);
7104 Status s;
7105 for (int i = 0; i < 3; i++) {
7106 keys[i] = key.ToString() + suffixes[i];
7107 batch.Put(keys[i], value);
7108 }
7109
7110 std::unique_ptr<char[]> ts_guard;
7111 if (user_timestamp_size_ > 0) {
7112 ts_guard.reset(new char[user_timestamp_size_]);
7113 Slice ts = mock_app_clock_->Allocate(ts_guard.get());
7114 s = batch.UpdateTimestamps(
7115 ts, [this](uint32_t) { return user_timestamp_size_; });
7116 if (!s.ok()) {
7117 fprintf(stderr, "assign timestamp to batch: %s\n",
7118 s.ToString().c_str());
7119 ErrorExit();
7120 }
7121 }
7122
7123 s = db->Write(writeoptions, &batch);
7124 return s;
7125 }
7126
7127 // Given a key K, this deletes (K+"0", V), (K+"1", V), (K+"2", V)
7128 // in DB atomically i.e in a single batch. Also refer GetMany.
7129 Status DeleteMany(DB* db, const WriteOptions& writeoptions,
7130 const Slice& key) {
7131 std::string suffixes[3] = {"1", "2", "0"};
7132 std::string keys[3];
7133
7134 WriteBatch batch(0, 0, FLAGS_write_batch_protection_bytes_per_key,
7135 user_timestamp_size_);
7136 Status s;
7137 for (int i = 0; i < 3; i++) {
7138 keys[i] = key.ToString() + suffixes[i];
7139 batch.Delete(keys[i]);
7140 }
7141
7142 std::unique_ptr<char[]> ts_guard;
7143 if (user_timestamp_size_ > 0) {
7144 ts_guard.reset(new char[user_timestamp_size_]);
7145 Slice ts = mock_app_clock_->Allocate(ts_guard.get());
7146 s = batch.UpdateTimestamps(
7147 ts, [this](uint32_t) { return user_timestamp_size_; });
7148 if (!s.ok()) {
7149 fprintf(stderr, "assign timestamp to batch: %s\n",
7150 s.ToString().c_str());
7151 ErrorExit();
7152 }
7153 }
7154
7155 s = db->Write(writeoptions, &batch);
7156 return s;
7157 }
7158
7159 // Given a key K and value V, this gets values for K+"0", K+"1" and K+"2"
7160 // in the same snapshot, and verifies that all the values are identical.
7161 // ASSUMES that PutMany was used to put (K, V) into the DB.
7162 Status GetMany(DB* db, const Slice& key, std::string* value) {
7163 std::string suffixes[3] = {"0", "1", "2"};
7164 std::string keys[3];
7165 Slice key_slices[3];
7166 std::string values[3];
7167 ReadOptions readoptionscopy = read_options_;
7168
7169 std::unique_ptr<char[]> ts_guard;
7170 Slice ts;
7171 if (user_timestamp_size_ > 0) {
7172 ts_guard.reset(new char[user_timestamp_size_]);
7173 ts = mock_app_clock_->Allocate(ts_guard.get());
7174 readoptionscopy.timestamp = &ts;
7175 }
7176
7177 readoptionscopy.snapshot = db->GetSnapshot();
7178 Status s;
7179 for (int i = 0; i < 3; i++) {
7180 keys[i] = key.ToString() + suffixes[i];
7181 key_slices[i] = keys[i];
7182 s = db->Get(readoptionscopy, key_slices[i], value);
7183 if (!s.ok() && !s.IsNotFound()) {
7184 fprintf(stderr, "get error: %s\n", s.ToString().c_str());
7185 values[i] = "";
7186 // we continue after error rather than exiting so that we can
7187 // find more errors if any
7188 } else if (s.IsNotFound()) {
7189 values[i] = "";
7190 } else {
7191 values[i] = *value;
7192 }
7193 }
7194 db->ReleaseSnapshot(readoptionscopy.snapshot);
7195
7196 if ((values[0] != values[1]) || (values[1] != values[2])) {
7197 fprintf(stderr, "inconsistent values for key %s: %s, %s, %s\n",
7198 key.ToString().c_str(), values[0].c_str(), values[1].c_str(),
7199 values[2].c_str());
7200 // we continue after error rather than exiting so that we can
7201 // find more errors if any
7202 }
7203
7204 return s;
7205 }
7206
7207 // Differs from readrandomwriterandom in the following ways:
7208 // (a) Uses GetMany/PutMany to read/write key values. Refer to those funcs.
7209 // (b) Does deletes as well (per FLAGS_deletepercent)
7210 // (c) In order to achieve high % of 'found' during lookups, and to do
7211 // multiple writes (including puts and deletes) it uses upto
7212 // FLAGS_numdistinct distinct keys instead of FLAGS_num distinct keys.
7213 // (d) Does not have a MultiGet option.
7214 void RandomWithVerify(ThreadState* thread) {
7215 RandomGenerator gen;
7216 std::string value;
7217 int64_t found = 0;
7218 int get_weight = 0;
7219 int put_weight = 0;
7220 int delete_weight = 0;
7221 int64_t gets_done = 0;
7222 int64_t puts_done = 0;
7223 int64_t deletes_done = 0;
7224
7225 std::unique_ptr<const char[]> key_guard;
7226 Slice key = AllocateKey(&key_guard);
7227
7228 // the number of iterations is the larger of read_ or write_
7229 for (int64_t i = 0; i < readwrites_; i++) {
7230 DB* db = SelectDB(thread);
7231 if (get_weight == 0 && put_weight == 0 && delete_weight == 0) {
7232 // one batch completed, reinitialize for next batch
7233 get_weight = FLAGS_readwritepercent;
7234 delete_weight = FLAGS_deletepercent;
7235 put_weight = 100 - get_weight - delete_weight;
7236 }
7237 GenerateKeyFromInt(thread->rand.Next() % FLAGS_numdistinct,
7238 FLAGS_numdistinct, &key);
7239 if (get_weight > 0) {
7240 // do all the gets first
7241 Status s = GetMany(db, key, &value);
7242 if (!s.ok() && !s.IsNotFound()) {
7243 fprintf(stderr, "getmany error: %s\n", s.ToString().c_str());
7244 // we continue after error rather than exiting so that we can
7245 // find more errors if any
7246 } else if (!s.IsNotFound()) {
7247 found++;
7248 }
7249 get_weight--;
7250 gets_done++;
7251 thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
7252 } else if (put_weight > 0) {
7253 // then do all the corresponding number of puts
7254 // for all the gets we have done earlier
7255 Status s = PutMany(db, write_options_, key, gen.Generate());
7256 if (!s.ok()) {
7257 fprintf(stderr, "putmany error: %s\n", s.ToString().c_str());
7258 exit(1);
7259 }
7260 put_weight--;
7261 puts_done++;
7262 thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
7263 } else if (delete_weight > 0) {
7264 Status s = DeleteMany(db, write_options_, key);
7265 if (!s.ok()) {
7266 fprintf(stderr, "deletemany error: %s\n", s.ToString().c_str());
7267 exit(1);
7268 }
7269 delete_weight--;
7270 deletes_done++;
7271 thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
7272 }
7273 }
7274 char msg[128];
7275 snprintf(msg, sizeof(msg),
7276 "( get:%" PRIu64 " put:%" PRIu64 " del:%" PRIu64 " total:%" PRIu64
7277 " found:%" PRIu64 ")",
7278 gets_done, puts_done, deletes_done, readwrites_, found);
7279 thread->stats.AddMessage(msg);
7280 }
7281
7282 // This is different from ReadWhileWriting because it does not use
7283 // an extra thread.
7284 void ReadRandomWriteRandom(ThreadState* thread) {
7285 ReadOptions options = read_options_;
7286 RandomGenerator gen;
7287 std::string value;
7288 int64_t found = 0;
7289 int get_weight = 0;
7290 int put_weight = 0;
7291 int64_t reads_done = 0;
7292 int64_t writes_done = 0;
7293 Duration duration(FLAGS_duration, readwrites_);
7294
7295 std::unique_ptr<const char[]> key_guard;
7296 Slice key = AllocateKey(&key_guard);
7297
7298 std::unique_ptr<char[]> ts_guard;
7299 if (user_timestamp_size_ > 0) {
7300 ts_guard.reset(new char[user_timestamp_size_]);
7301 }
7302
7303 // the number of iterations is the larger of read_ or write_
7304 while (!duration.Done(1)) {
7305 DB* db = SelectDB(thread);
7306 GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
7307 if (get_weight == 0 && put_weight == 0) {
7308 // one batch completed, reinitialize for next batch
7309 get_weight = FLAGS_readwritepercent;
7310 put_weight = 100 - get_weight;
7311 }
7312 if (get_weight > 0) {
7313 // do all the gets first
7314 Slice ts;
7315 if (user_timestamp_size_ > 0) {
7316 ts = mock_app_clock_->GetTimestampForRead(thread->rand,
7317 ts_guard.get());
7318 options.timestamp = &ts;
7319 }
7320 Status s = db->Get(options, key, &value);
7321 if (!s.ok() && !s.IsNotFound()) {
7322 fprintf(stderr, "get error: %s\n", s.ToString().c_str());
7323 // we continue after error rather than exiting so that we can
7324 // find more errors if any
7325 } else if (!s.IsNotFound()) {
7326 found++;
7327 }
7328 get_weight--;
7329 reads_done++;
7330 thread->stats.FinishedOps(nullptr, db, 1, kRead);
7331 } else if (put_weight > 0) {
7332 // then do all the corresponding number of puts
7333 // for all the gets we have done earlier
7334 Status s;
7335 if (user_timestamp_size_ > 0) {
7336 Slice ts = mock_app_clock_->Allocate(ts_guard.get());
7337 s = db->Put(write_options_, key, ts, gen.Generate());
7338 } else {
7339 s = db->Put(write_options_, key, gen.Generate());
7340 }
7341 if (!s.ok()) {
7342 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
7343 ErrorExit();
7344 }
7345 put_weight--;
7346 writes_done++;
7347 thread->stats.FinishedOps(nullptr, db, 1, kWrite);
7348 }
7349 }
7350 char msg[100];
7351 snprintf(msg, sizeof(msg),
7352 "( reads:%" PRIu64 " writes:%" PRIu64 " total:%" PRIu64
7353 " found:%" PRIu64 ")",
7354 reads_done, writes_done, readwrites_, found);
7355 thread->stats.AddMessage(msg);
7356 }
7357
7358 //
7359 // Read-modify-write for random keys
7360 void UpdateRandom(ThreadState* thread) {
7361 ReadOptions options = read_options_;
7362 RandomGenerator gen;
7363 std::string value;
7364 int64_t found = 0;
7365 int64_t bytes = 0;
7366 Duration duration(FLAGS_duration, readwrites_);
7367
7368 std::unique_ptr<const char[]> key_guard;
7369 Slice key = AllocateKey(&key_guard);
7370 std::unique_ptr<char[]> ts_guard;
7371 if (user_timestamp_size_ > 0) {
7372 ts_guard.reset(new char[user_timestamp_size_]);
7373 }
7374 // the number of iterations is the larger of read_ or write_
7375 while (!duration.Done(1)) {
7376 DB* db = SelectDB(thread);
7377 GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
7378 Slice ts;
7379 if (user_timestamp_size_ > 0) {
7380 // Read with newest timestamp because we are doing rmw.
7381 ts = mock_app_clock_->Allocate(ts_guard.get());
7382 options.timestamp = &ts;
7383 }
7384
7385 auto status = db->Get(options, key, &value);
7386 if (status.ok()) {
7387 ++found;
7388 bytes += key.size() + value.size() + user_timestamp_size_;
7389 } else if (!status.IsNotFound()) {
7390 fprintf(stderr, "Get returned an error: %s\n",
7391 status.ToString().c_str());
7392 abort();
7393 }
7394
7395 if (thread->shared->write_rate_limiter) {
7396 thread->shared->write_rate_limiter->Request(
7397 key.size() + value.size(), Env::IO_HIGH, nullptr /*stats*/,
7398 RateLimiter::OpType::kWrite);
7399 }
7400
7401 Slice val = gen.Generate();
7402 Status s;
7403 if (user_timestamp_size_ > 0) {
7404 ts = mock_app_clock_->Allocate(ts_guard.get());
7405 s = db->Put(write_options_, key, ts, val);
7406 } else {
7407 s = db->Put(write_options_, key, val);
7408 }
7409 if (!s.ok()) {
7410 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
7411 exit(1);
7412 }
7413 bytes += key.size() + val.size() + user_timestamp_size_;
7414 thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
7415 }
7416 char msg[100];
7417 snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
7418 readwrites_, found);
7419 thread->stats.AddBytes(bytes);
7420 thread->stats.AddMessage(msg);
7421 }
7422
7423 // Read-XOR-write for random keys. Xors the existing value with a randomly
7424 // generated value, and stores the result. Assuming A in the array of bytes
7425 // representing the existing value, we generate an array B of the same size,
7426 // then compute C = A^B as C[i]=A[i]^B[i], and store C
7427 void XORUpdateRandom(ThreadState* thread) {
7428 ReadOptions options = read_options_;
7429 RandomGenerator gen;
7430 std::string existing_value;
7431 int64_t found = 0;
7432 Duration duration(FLAGS_duration, readwrites_);
7433
7434 BytesXOROperator xor_operator;
7435
7436 std::unique_ptr<const char[]> key_guard;
7437 Slice key = AllocateKey(&key_guard);
7438 std::unique_ptr<char[]> ts_guard;
7439 if (user_timestamp_size_ > 0) {
7440 ts_guard.reset(new char[user_timestamp_size_]);
7441 }
7442 // the number of iterations is the larger of read_ or write_
7443 while (!duration.Done(1)) {
7444 DB* db = SelectDB(thread);
7445 GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
7446 Slice ts;
7447 if (user_timestamp_size_ > 0) {
7448 ts = mock_app_clock_->Allocate(ts_guard.get());
7449 options.timestamp = &ts;
7450 }
7451
7452 auto status = db->Get(options, key, &existing_value);
7453 if (status.ok()) {
7454 ++found;
7455 } else if (!status.IsNotFound()) {
7456 fprintf(stderr, "Get returned an error: %s\n",
7457 status.ToString().c_str());
7458 exit(1);
7459 }
7460
7461 Slice value =
7462 gen.Generate(static_cast<unsigned int>(existing_value.size()));
7463 std::string new_value;
7464
7465 if (status.ok()) {
7466 Slice existing_value_slice = Slice(existing_value);
7467 xor_operator.XOR(&existing_value_slice, value, &new_value);
7468 } else {
7469 xor_operator.XOR(nullptr, value, &new_value);
7470 }
7471
7472 Status s;
7473 if (user_timestamp_size_ > 0) {
7474 ts = mock_app_clock_->Allocate(ts_guard.get());
7475 s = db->Put(write_options_, key, ts, Slice(new_value));
7476 } else {
7477 s = db->Put(write_options_, key, Slice(new_value));
7478 }
7479 if (!s.ok()) {
7480 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
7481 ErrorExit();
7482 }
7483 thread->stats.FinishedOps(nullptr, db, 1);
7484 }
7485 char msg[100];
7486 snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
7487 readwrites_, found);
7488 thread->stats.AddMessage(msg);
7489 }
7490
7491 // Read-modify-write for random keys.
7492 // Each operation causes the key grow by value_size (simulating an append).
7493 // Generally used for benchmarking against merges of similar type
7494 void AppendRandom(ThreadState* thread) {
7495 ReadOptions options = read_options_;
7496 RandomGenerator gen;
7497 std::string value;
7498 int64_t found = 0;
7499 int64_t bytes = 0;
7500
7501 std::unique_ptr<const char[]> key_guard;
7502 Slice key = AllocateKey(&key_guard);
7503 std::unique_ptr<char[]> ts_guard;
7504 if (user_timestamp_size_ > 0) {
7505 ts_guard.reset(new char[user_timestamp_size_]);
7506 }
7507 // The number of iterations is the larger of read_ or write_
7508 Duration duration(FLAGS_duration, readwrites_);
7509 while (!duration.Done(1)) {
7510 DB* db = SelectDB(thread);
7511 GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
7512 Slice ts;
7513 if (user_timestamp_size_ > 0) {
7514 ts = mock_app_clock_->Allocate(ts_guard.get());
7515 options.timestamp = &ts;
7516 }
7517
7518 auto status = db->Get(options, key, &value);
7519 if (status.ok()) {
7520 ++found;
7521 bytes += key.size() + value.size() + user_timestamp_size_;
7522 } else if (!status.IsNotFound()) {
7523 fprintf(stderr, "Get returned an error: %s\n",
7524 status.ToString().c_str());
7525 abort();
7526 } else {
7527 // If not existing, then just assume an empty string of data
7528 value.clear();
7529 }
7530
7531 // Update the value (by appending data)
7532 Slice operand = gen.Generate();
7533 if (value.size() > 0) {
7534 // Use a delimiter to match the semantics for StringAppendOperator
7535 value.append(1, ',');
7536 }
7537 value.append(operand.data(), operand.size());
7538
7539 Status s;
7540 if (user_timestamp_size_ > 0) {
7541 ts = mock_app_clock_->Allocate(ts_guard.get());
7542 s = db->Put(write_options_, key, ts, value);
7543 } else {
7544 // Write back to the database
7545 s = db->Put(write_options_, key, value);
7546 }
7547 if (!s.ok()) {
7548 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
7549 ErrorExit();
7550 }
7551 bytes += key.size() + value.size() + user_timestamp_size_;
7552 thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
7553 }
7554
7555 char msg[100];
7556 snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
7557 readwrites_, found);
7558 thread->stats.AddBytes(bytes);
7559 thread->stats.AddMessage(msg);
7560 }
7561
7562 // Read-modify-write for random keys (using MergeOperator)
7563 // The merge operator to use should be defined by FLAGS_merge_operator
7564 // Adjust FLAGS_value_size so that the keys are reasonable for this operator
7565 // Assumes that the merge operator is non-null (i.e.: is well-defined)
7566 //
7567 // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8
7568 // to simulate random additions over 64-bit integers using merge.
7569 //
7570 // The number of merges on the same key can be controlled by adjusting
7571 // FLAGS_merge_keys.
7572 void MergeRandom(ThreadState* thread) {
7573 RandomGenerator gen;
7574 int64_t bytes = 0;
7575 std::unique_ptr<const char[]> key_guard;
7576 Slice key = AllocateKey(&key_guard);
7577 // The number of iterations is the larger of read_ or write_
7578 Duration duration(FLAGS_duration, readwrites_);
7579 while (!duration.Done(1)) {
7580 DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
7581 int64_t key_rand = thread->rand.Next() % merge_keys_;
7582 GenerateKeyFromInt(key_rand, merge_keys_, &key);
7583
7584 Status s;
7585 Slice val = gen.Generate();
7586 if (FLAGS_num_column_families > 1) {
7587 s = db_with_cfh->db->Merge(write_options_,
7588 db_with_cfh->GetCfh(key_rand), key, val);
7589 } else {
7590 s = db_with_cfh->db->Merge(
7591 write_options_, db_with_cfh->db->DefaultColumnFamily(), key, val);
7592 }
7593
7594 if (!s.ok()) {
7595 fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
7596 exit(1);
7597 }
7598 bytes += key.size() + val.size();
7599 thread->stats.FinishedOps(nullptr, db_with_cfh->db, 1, kMerge);
7600 }
7601
7602 // Print some statistics
7603 char msg[100];
7604 snprintf(msg, sizeof(msg), "( updates:%" PRIu64 ")", readwrites_);
7605 thread->stats.AddBytes(bytes);
7606 thread->stats.AddMessage(msg);
7607 }
7608
7609 // Read and merge random keys. The amount of reads and merges are controlled
7610 // by adjusting FLAGS_num and FLAGS_mergereadpercent. The number of distinct
7611 // keys (and thus also the number of reads and merges on the same key) can be
7612 // adjusted with FLAGS_merge_keys.
7613 //
7614 // As with MergeRandom, the merge operator to use should be defined by
7615 // FLAGS_merge_operator.
7616 void ReadRandomMergeRandom(ThreadState* thread) {
7617 RandomGenerator gen;
7618 std::string value;
7619 int64_t num_hits = 0;
7620 int64_t num_gets = 0;
7621 int64_t num_merges = 0;
7622 size_t max_length = 0;
7623
7624 std::unique_ptr<const char[]> key_guard;
7625 Slice key = AllocateKey(&key_guard);
7626 // the number of iterations is the larger of read_ or write_
7627 Duration duration(FLAGS_duration, readwrites_);
7628 while (!duration.Done(1)) {
7629 DB* db = SelectDB(thread);
7630 GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);
7631
7632 bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent;
7633
7634 if (do_merge) {
7635 Status s = db->Merge(write_options_, key, gen.Generate());
7636 if (!s.ok()) {
7637 fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
7638 exit(1);
7639 }
7640 num_merges++;
7641 thread->stats.FinishedOps(nullptr, db, 1, kMerge);
7642 } else {
7643 Status s = db->Get(read_options_, key, &value);
7644 if (value.length() > max_length) max_length = value.length();
7645
7646 if (!s.ok() && !s.IsNotFound()) {
7647 fprintf(stderr, "get error: %s\n", s.ToString().c_str());
7648 // we continue after error rather than exiting so that we can
7649 // find more errors if any
7650 } else if (!s.IsNotFound()) {
7651 num_hits++;
7652 }
7653 num_gets++;
7654 thread->stats.FinishedOps(nullptr, db, 1, kRead);
7655 }
7656 }
7657
7658 char msg[100];
7659 snprintf(msg, sizeof(msg),
7660 "(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64
7661 " hits:%" PRIu64 " maxlength:%" ROCKSDB_PRIszt ")",
7662 num_gets, num_merges, readwrites_, num_hits, max_length);
7663 thread->stats.AddMessage(msg);
7664 }
7665
7666 void WriteSeqSeekSeq(ThreadState* thread) {
7667 writes_ = FLAGS_num;
7668 DoWrite(thread, SEQUENTIAL);
7669 // exclude writes from the ops/sec calculation
7670 thread->stats.Start(thread->tid);
7671
7672 DB* db = SelectDB(thread);
7673 ReadOptions read_opts = read_options_;
7674 std::unique_ptr<char[]> ts_guard;
7675 Slice ts;
7676 if (user_timestamp_size_ > 0) {
7677 ts_guard.reset(new char[user_timestamp_size_]);
7678 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
7679 read_opts.timestamp = &ts;
7680 }
7681 std::unique_ptr<Iterator> iter(db->NewIterator(read_opts));
7682
7683 std::unique_ptr<const char[]> key_guard;
7684 Slice key = AllocateKey(&key_guard);
7685 for (int64_t i = 0; i < FLAGS_num; ++i) {
7686 GenerateKeyFromInt(i, FLAGS_num, &key);
7687 iter->Seek(key);
7688 assert(iter->Valid() && iter->key() == key);
7689 thread->stats.FinishedOps(nullptr, db, 1, kSeek);
7690
7691 for (int j = 0; j < FLAGS_seek_nexts && i + 1 < FLAGS_num; ++j) {
7692 if (!FLAGS_reverse_iterator) {
7693 iter->Next();
7694 } else {
7695 iter->Prev();
7696 }
7697 GenerateKeyFromInt(++i, FLAGS_num, &key);
7698 assert(iter->Valid() && iter->key() == key);
7699 thread->stats.FinishedOps(nullptr, db, 1, kSeek);
7700 }
7701
7702 iter->Seek(key);
7703 assert(iter->Valid() && iter->key() == key);
7704 thread->stats.FinishedOps(nullptr, db, 1, kSeek);
7705 }
7706 }
7707
7708 bool binary_search(std::vector<int>& data, int start, int end, int key) {
7709 if (data.empty()) return false;
7710 if (start > end) return false;
7711 int mid = start + (end - start) / 2;
7712 if (mid > static_cast<int>(data.size()) - 1) return false;
7713 if (data[mid] == key) {
7714 return true;
7715 } else if (data[mid] > key) {
7716 return binary_search(data, start, mid - 1, key);
7717 } else {
7718 return binary_search(data, mid + 1, end, key);
7719 }
7720 }
7721
7722 // Does a bunch of merge operations for a key(key1) where the merge operand
7723 // is a sorted list. Next performance comparison is done between doing a Get
7724 // for key1 followed by searching for another key(key2) in the large sorted
7725 // list vs calling GetMergeOperands for key1 and then searching for the key2
7726 // in all the sorted sub-lists. Later case is expected to be a lot faster.
7727 void GetMergeOperands(ThreadState* thread) {
7728 DB* db = SelectDB(thread);
7729 const int kTotalValues = 100000;
7730 const int kListSize = 100;
7731 std::string key = "my_key";
7732 std::string value;
7733
7734 for (int i = 1; i < kTotalValues; i++) {
7735 if (i % kListSize == 0) {
7736 // Remove trailing ','
7737 value.pop_back();
7738 db->Merge(WriteOptions(), key, value);
7739 value.clear();
7740 } else {
7741 value.append(std::to_string(i)).append(",");
7742 }
7743 }
7744
7745 SortList s;
7746 std::vector<int> data;
7747 // This value can be experimented with and it will demonstrate the
7748 // perf difference between doing a Get and searching for lookup_key in the
7749 // resultant large sorted list vs doing GetMergeOperands and searching
7750 // for lookup_key within this resultant sorted sub-lists.
7751 int lookup_key = 1;
7752
7753 // Get API call
7754 std::cout << "--- Get API call --- \n";
7755 PinnableSlice p_slice;
7756 uint64_t st = FLAGS_env->NowNanos();
7757 db->Get(ReadOptions(), db->DefaultColumnFamily(), key, &p_slice);
7758 s.MakeVector(data, p_slice);
7759 bool found =
7760 binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
7761 std::cout << "Found key? " << std::to_string(found) << "\n";
7762 uint64_t sp = FLAGS_env->NowNanos();
7763 std::cout << "Get: " << (sp - st) / 1000000000.0 << " seconds\n";
7764 std::string* dat_ = p_slice.GetSelf();
7765 std::cout << "Sample data from Get API call: " << dat_->substr(0, 10)
7766 << "\n";
7767 data.clear();
7768
7769 // GetMergeOperands API call
7770 std::cout << "--- GetMergeOperands API --- \n";
7771 std::vector<PinnableSlice> a_slice((kTotalValues / kListSize) + 1);
7772 st = FLAGS_env->NowNanos();
7773 int number_of_operands = 0;
7774 GetMergeOperandsOptions get_merge_operands_options;
7775 get_merge_operands_options.expected_max_number_of_operands =
7776 (kTotalValues / 100) + 1;
7777 db->GetMergeOperands(ReadOptions(), db->DefaultColumnFamily(), key,
7778 a_slice.data(), &get_merge_operands_options,
7779 &number_of_operands);
7780 for (PinnableSlice& psl : a_slice) {
7781 s.MakeVector(data, psl);
7782 found =
7783 binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
7784 data.clear();
7785 if (found) break;
7786 }
7787 std::cout << "Found key? " << std::to_string(found) << "\n";
7788 sp = FLAGS_env->NowNanos();
7789 std::cout << "Get Merge operands: " << (sp - st) / 1000000000.0
7790 << " seconds \n";
7791 int to_print = 0;
7792 std::cout << "Sample data from GetMergeOperands API call: ";
7793 for (PinnableSlice& psl : a_slice) {
7794 std::cout << "List: " << to_print << " : " << *psl.GetSelf() << "\n";
7795 if (to_print++ > 2) break;
7796 }
7797 }
7798
7799 #ifndef ROCKSDB_LITE
7800 void VerifyChecksum(ThreadState* thread) {
7801 DB* db = SelectDB(thread);
7802 ReadOptions ro;
7803 ro.adaptive_readahead = FLAGS_adaptive_readahead;
7804 ro.async_io = FLAGS_async_io;
7805 ro.rate_limiter_priority =
7806 FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
7807 ro.readahead_size = FLAGS_readahead_size;
7808 Status s = db->VerifyChecksum(ro);
7809 if (!s.ok()) {
7810 fprintf(stderr, "VerifyChecksum() failed: %s\n", s.ToString().c_str());
7811 exit(1);
7812 }
7813 }
7814
7815 void VerifyFileChecksums(ThreadState* thread) {
7816 DB* db = SelectDB(thread);
7817 ReadOptions ro;
7818 ro.adaptive_readahead = FLAGS_adaptive_readahead;
7819 ro.async_io = FLAGS_async_io;
7820 ro.rate_limiter_priority =
7821 FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
7822 ro.readahead_size = FLAGS_readahead_size;
7823 Status s = db->VerifyFileChecksums(ro);
7824 if (!s.ok()) {
7825 fprintf(stderr, "VerifyFileChecksums() failed: %s\n",
7826 s.ToString().c_str());
7827 exit(1);
7828 }
7829 }
7830
7831 // This benchmark stress tests Transactions. For a given --duration (or
7832 // total number of --writes, a Transaction will perform a read-modify-write
7833 // to increment the value of a key in each of N(--transaction-sets) sets of
7834 // keys (where each set has --num keys). If --threads is set, this will be
7835 // done in parallel.
7836 //
7837 // To test transactions, use --transaction_db=true. Not setting this
7838 // parameter
7839 // will run the same benchmark without transactions.
7840 //
7841 // RandomTransactionVerify() will then validate the correctness of the results
7842 // by checking if the sum of all keys in each set is the same.
7843 void RandomTransaction(ThreadState* thread) {
7844 Duration duration(FLAGS_duration, readwrites_);
7845 uint16_t num_prefix_ranges = static_cast<uint16_t>(FLAGS_transaction_sets);
7846 uint64_t transactions_done = 0;
7847
7848 if (num_prefix_ranges == 0 || num_prefix_ranges > 9999) {
7849 fprintf(stderr, "invalid value for transaction_sets\n");
7850 abort();
7851 }
7852
7853 TransactionOptions txn_options;
7854 txn_options.lock_timeout = FLAGS_transaction_lock_timeout;
7855 txn_options.set_snapshot = FLAGS_transaction_set_snapshot;
7856
7857 RandomTransactionInserter inserter(&thread->rand, write_options_,
7858 read_options_, FLAGS_num,
7859 num_prefix_ranges);
7860
7861 if (FLAGS_num_multi_db > 1) {
7862 fprintf(stderr,
7863 "Cannot run RandomTransaction benchmark with "
7864 "FLAGS_multi_db > 1.");
7865 abort();
7866 }
7867
7868 while (!duration.Done(1)) {
7869 bool success;
7870
7871 // RandomTransactionInserter will attempt to insert a key for each
7872 // # of FLAGS_transaction_sets
7873 if (FLAGS_optimistic_transaction_db) {
7874 success = inserter.OptimisticTransactionDBInsert(db_.opt_txn_db);
7875 } else if (FLAGS_transaction_db) {
7876 TransactionDB* txn_db = reinterpret_cast<TransactionDB*>(db_.db);
7877 success = inserter.TransactionDBInsert(txn_db, txn_options);
7878 } else {
7879 success = inserter.DBInsert(db_.db);
7880 }
7881
7882 if (!success) {
7883 fprintf(stderr, "Unexpected error: %s\n",
7884 inserter.GetLastStatus().ToString().c_str());
7885 abort();
7886 }
7887
7888 thread->stats.FinishedOps(nullptr, db_.db, 1, kOthers);
7889 transactions_done++;
7890 }
7891
7892 char msg[100];
7893 if (FLAGS_optimistic_transaction_db || FLAGS_transaction_db) {
7894 snprintf(msg, sizeof(msg),
7895 "( transactions:%" PRIu64 " aborts:%" PRIu64 ")",
7896 transactions_done, inserter.GetFailureCount());
7897 } else {
7898 snprintf(msg, sizeof(msg), "( batches:%" PRIu64 " )", transactions_done);
7899 }
7900 thread->stats.AddMessage(msg);
7901 thread->stats.AddBytes(static_cast<int64_t>(inserter.GetBytesInserted()));
7902 }
7903
7904 // Verifies consistency of data after RandomTransaction() has been run.
7905 // Since each iteration of RandomTransaction() incremented a key in each set
7906 // by the same value, the sum of the keys in each set should be the same.
7907 void RandomTransactionVerify() {
7908 if (!FLAGS_transaction_db && !FLAGS_optimistic_transaction_db) {
7909 // transactions not used, nothing to verify.
7910 return;
7911 }
7912
7913 Status s = RandomTransactionInserter::Verify(
7914 db_.db, static_cast<uint16_t>(FLAGS_transaction_sets));
7915
7916 if (s.ok()) {
7917 fprintf(stdout, "RandomTransactionVerify Success.\n");
7918 } else {
7919 fprintf(stdout, "RandomTransactionVerify FAILED!!\n");
7920 }
7921 }
7922 #endif // ROCKSDB_LITE
7923
7924 // Writes and deletes random keys without overwriting keys.
7925 //
7926 // This benchmark is intended to partially replicate the behavior of MyRocks
7927 // secondary indices: All data is stored in keys and updates happen by
7928 // deleting the old version of the key and inserting the new version.
7929 void RandomReplaceKeys(ThreadState* thread) {
7930 std::unique_ptr<const char[]> key_guard;
7931 Slice key = AllocateKey(&key_guard);
7932 std::unique_ptr<char[]> ts_guard;
7933 if (user_timestamp_size_ > 0) {
7934 ts_guard.reset(new char[user_timestamp_size_]);
7935 }
7936 std::vector<uint32_t> counters(FLAGS_numdistinct, 0);
7937 size_t max_counter = 50;
7938 RandomGenerator gen;
7939
7940 Status s;
7941 DB* db = SelectDB(thread);
7942 for (int64_t i = 0; i < FLAGS_numdistinct; i++) {
7943 GenerateKeyFromInt(i * max_counter, FLAGS_num, &key);
7944 if (user_timestamp_size_ > 0) {
7945 Slice ts = mock_app_clock_->Allocate(ts_guard.get());
7946 s = db->Put(write_options_, key, ts, gen.Generate());
7947 } else {
7948 s = db->Put(write_options_, key, gen.Generate());
7949 }
7950 if (!s.ok()) {
7951 fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
7952 exit(1);
7953 }
7954 }
7955
7956 db->GetSnapshot();
7957
7958 std::default_random_engine generator;
7959 std::normal_distribution<double> distribution(FLAGS_numdistinct / 2.0,
7960 FLAGS_stddev);
7961 Duration duration(FLAGS_duration, FLAGS_num);
7962 while (!duration.Done(1)) {
7963 int64_t rnd_id = static_cast<int64_t>(distribution(generator));
7964 int64_t key_id = std::max(std::min(FLAGS_numdistinct - 1, rnd_id),
7965 static_cast<int64_t>(0));
7966 GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
7967 &key);
7968 if (user_timestamp_size_ > 0) {
7969 Slice ts = mock_app_clock_->Allocate(ts_guard.get());
7970 s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key, ts)
7971 : db->Delete(write_options_, key, ts);
7972 } else {
7973 s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key)
7974 : db->Delete(write_options_, key);
7975 }
7976 if (s.ok()) {
7977 counters[key_id] = (counters[key_id] + 1) % max_counter;
7978 GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
7979 &key);
7980 if (user_timestamp_size_ > 0) {
7981 Slice ts = mock_app_clock_->Allocate(ts_guard.get());
7982 s = db->Put(write_options_, key, ts, Slice());
7983 } else {
7984 s = db->Put(write_options_, key, Slice());
7985 }
7986 }
7987
7988 if (!s.ok()) {
7989 fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
7990 exit(1);
7991 }
7992
7993 thread->stats.FinishedOps(nullptr, db, 1, kOthers);
7994 }
7995
7996 char msg[200];
7997 snprintf(msg, sizeof(msg),
7998 "use single deletes: %d, "
7999 "standard deviation: %lf\n",
8000 FLAGS_use_single_deletes, FLAGS_stddev);
8001 thread->stats.AddMessage(msg);
8002 }
8003
8004 void TimeSeriesReadOrDelete(ThreadState* thread, bool do_deletion) {
8005 int64_t read = 0;
8006 int64_t found = 0;
8007 int64_t bytes = 0;
8008
8009 Iterator* iter = nullptr;
8010 // Only work on single database
8011 assert(db_.db != nullptr);
8012 iter = db_.db->NewIterator(read_options_);
8013
8014 std::unique_ptr<const char[]> key_guard;
8015 Slice key = AllocateKey(&key_guard);
8016
8017 char value_buffer[256];
8018 while (true) {
8019 {
8020 MutexLock l(&thread->shared->mu);
8021 if (thread->shared->num_done >= 1) {
8022 // Write thread have finished
8023 break;
8024 }
8025 }
8026 if (!FLAGS_use_tailing_iterator) {
8027 delete iter;
8028 iter = db_.db->NewIterator(read_options_);
8029 }
8030 // Pick a Iterator to use
8031
8032 int64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
8033 GenerateKeyFromInt(key_id, FLAGS_num, &key);
8034 // Reset last 8 bytes to 0
8035 char* start = const_cast<char*>(key.data());
8036 start += key.size() - 8;
8037 memset(start, 0, 8);
8038 ++read;
8039
8040 bool key_found = false;
8041 // Seek the prefix
8042 for (iter->Seek(key); iter->Valid() && iter->key().starts_with(key);
8043 iter->Next()) {
8044 key_found = true;
8045 // Copy out iterator's value to make sure we read them.
8046 if (do_deletion) {
8047 bytes += iter->key().size();
8048 if (KeyExpired(timestamp_emulator_.get(), iter->key())) {
8049 thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
8050 db_.db->Delete(write_options_, iter->key());
8051 } else {
8052 break;
8053 }
8054 } else {
8055 bytes += iter->key().size() + iter->value().size();
8056 thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
8057 Slice value = iter->value();
8058 memcpy(value_buffer, value.data(),
8059 std::min(value.size(), sizeof(value_buffer)));
8060
8061 assert(iter->status().ok());
8062 }
8063 }
8064 found += key_found;
8065
8066 if (thread->shared->read_rate_limiter.get() != nullptr) {
8067 thread->shared->read_rate_limiter->Request(
8068 1, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
8069 }
8070 }
8071 delete iter;
8072
8073 char msg[100];
8074 snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found,
8075 read);
8076 thread->stats.AddBytes(bytes);
8077 thread->stats.AddMessage(msg);
8078 }
8079
8080 void TimeSeriesWrite(ThreadState* thread) {
8081 // Special thread that keeps writing until other threads are done.
8082 RandomGenerator gen;
8083 int64_t bytes = 0;
8084
8085 // Don't merge stats from this thread with the readers.
8086 thread->stats.SetExcludeFromMerge();
8087
8088 std::unique_ptr<RateLimiter> write_rate_limiter;
8089 if (FLAGS_benchmark_write_rate_limit > 0) {
8090 write_rate_limiter.reset(
8091 NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
8092 }
8093
8094 std::unique_ptr<const char[]> key_guard;
8095 Slice key = AllocateKey(&key_guard);
8096
8097 Duration duration(FLAGS_duration, writes_);
8098 while (!duration.Done(1)) {
8099 DB* db = SelectDB(thread);
8100
8101 uint64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
8102 // Write key id
8103 GenerateKeyFromInt(key_id, FLAGS_num, &key);
8104 // Write timestamp
8105
8106 char* start = const_cast<char*>(key.data());
8107 char* pos = start + 8;
8108 int bytes_to_fill =
8109 std::min(key_size_ - static_cast<int>(pos - start), 8);
8110 uint64_t timestamp_value = timestamp_emulator_->Get();
8111 if (port::kLittleEndian) {
8112 for (int i = 0; i < bytes_to_fill; ++i) {
8113 pos[i] = (timestamp_value >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
8114 }
8115 } else {
8116 memcpy(pos, static_cast<void*>(&timestamp_value), bytes_to_fill);
8117 }
8118
8119 timestamp_emulator_->Inc();
8120
8121 Status s;
8122 Slice val = gen.Generate();
8123 s = db->Put(write_options_, key, val);
8124
8125 if (!s.ok()) {
8126 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
8127 ErrorExit();
8128 }
8129 bytes = key.size() + val.size();
8130 thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
8131 thread->stats.AddBytes(bytes);
8132
8133 if (FLAGS_benchmark_write_rate_limit > 0) {
8134 write_rate_limiter->Request(key.size() + val.size(), Env::IO_HIGH,
8135 nullptr /* stats */,
8136 RateLimiter::OpType::kWrite);
8137 }
8138 }
8139 }
8140
8141 void TimeSeries(ThreadState* thread) {
8142 if (thread->tid > 0) {
8143 bool do_deletion = FLAGS_expire_style == "delete" &&
8144 thread->tid <= FLAGS_num_deletion_threads;
8145 TimeSeriesReadOrDelete(thread, do_deletion);
8146 } else {
8147 TimeSeriesWrite(thread);
8148 thread->stats.Stop();
8149 thread->stats.Report("timeseries write");
8150 }
8151 }
8152
8153 void Compact(ThreadState* thread) {
8154 DB* db = SelectDB(thread);
8155 CompactRangeOptions cro;
8156 cro.bottommost_level_compaction =
8157 BottommostLevelCompaction::kForceOptimized;
8158 db->CompactRange(cro, nullptr, nullptr);
8159 }
8160
8161 void CompactAll() {
8162 if (db_.db != nullptr) {
8163 db_.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
8164 }
8165 for (const auto& db_with_cfh : multi_dbs_) {
8166 db_with_cfh.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
8167 }
8168 }
8169
8170 #ifndef ROCKSDB_LITE
8171 void WaitForCompactionHelper(DBWithColumnFamilies& db) {
8172 // This is an imperfect way of waiting for compaction. The loop and sleep
8173 // is done because a thread that finishes a compaction job should get a
8174 // chance to pickup a new compaction job.
8175
8176 std::vector<std::string> keys = {DB::Properties::kMemTableFlushPending,
8177 DB::Properties::kNumRunningFlushes,
8178 DB::Properties::kCompactionPending,
8179 DB::Properties::kNumRunningCompactions};
8180
8181 fprintf(stdout, "waitforcompaction(%s): started\n",
8182 db.db->GetName().c_str());
8183
8184 while (true) {
8185 bool retry = false;
8186
8187 for (const auto& k : keys) {
8188 uint64_t v;
8189 if (!db.db->GetIntProperty(k, &v)) {
8190 fprintf(stderr, "waitforcompaction(%s): GetIntProperty(%s) failed\n",
8191 db.db->GetName().c_str(), k.c_str());
8192 exit(1);
8193 } else if (v > 0) {
8194 fprintf(stdout,
8195 "waitforcompaction(%s): active(%s). Sleep 10 seconds\n",
8196 db.db->GetName().c_str(), k.c_str());
8197 FLAGS_env->SleepForMicroseconds(10 * 1000000);
8198 retry = true;
8199 break;
8200 }
8201 }
8202
8203 if (!retry) {
8204 fprintf(stdout, "waitforcompaction(%s): finished\n",
8205 db.db->GetName().c_str());
8206 return;
8207 }
8208 }
8209 }
8210
8211 void WaitForCompaction() {
8212 // Give background threads a chance to wake
8213 FLAGS_env->SleepForMicroseconds(5 * 1000000);
8214
8215 // I am skeptical that this check race free. I hope that checking twice
8216 // reduces the chance.
8217 if (db_.db != nullptr) {
8218 WaitForCompactionHelper(db_);
8219 WaitForCompactionHelper(db_);
8220 } else {
8221 for (auto& db_with_cfh : multi_dbs_) {
8222 WaitForCompactionHelper(db_with_cfh);
8223 WaitForCompactionHelper(db_with_cfh);
8224 }
8225 }
8226 }
8227
8228 bool CompactLevelHelper(DBWithColumnFamilies& db_with_cfh, int from_level) {
8229 std::vector<LiveFileMetaData> files;
8230 db_with_cfh.db->GetLiveFilesMetaData(&files);
8231
8232 assert(from_level == 0 || from_level == 1);
8233
8234 int real_from_level = from_level;
8235 if (real_from_level > 0) {
8236 // With dynamic leveled compaction the first level with data beyond L0
8237 // might not be L1.
8238 real_from_level = std::numeric_limits<int>::max();
8239
8240 for (auto& f : files) {
8241 if (f.level > 0 && f.level < real_from_level) real_from_level = f.level;
8242 }
8243
8244 if (real_from_level == std::numeric_limits<int>::max()) {
8245 fprintf(stdout, "compact%d found 0 files to compact\n", from_level);
8246 return true;
8247 }
8248 }
8249
8250 // The goal is to compact from from_level to the level that follows it,
8251 // and with dynamic leveled compaction the next level might not be
8252 // real_from_level+1
8253 int next_level = std::numeric_limits<int>::max();
8254
8255 std::vector<std::string> files_to_compact;
8256 for (auto& f : files) {
8257 if (f.level == real_from_level)
8258 files_to_compact.push_back(f.name);
8259 else if (f.level > real_from_level && f.level < next_level)
8260 next_level = f.level;
8261 }
8262
8263 if (files_to_compact.empty()) {
8264 fprintf(stdout, "compact%d found 0 files to compact\n", from_level);
8265 return true;
8266 } else if (next_level == std::numeric_limits<int>::max()) {
8267 // There is no data beyond real_from_level. So we are done.
8268 fprintf(stdout, "compact%d found no data beyond L%d\n", from_level,
8269 real_from_level);
8270 return true;
8271 }
8272
8273 fprintf(stdout, "compact%d found %d files to compact from L%d to L%d\n",
8274 from_level, static_cast<int>(files_to_compact.size()),
8275 real_from_level, next_level);
8276
8277 ROCKSDB_NAMESPACE::CompactionOptions options;
8278 // Lets RocksDB use the configured compression for this level
8279 options.compression = ROCKSDB_NAMESPACE::kDisableCompressionOption;
8280
8281 ROCKSDB_NAMESPACE::ColumnFamilyDescriptor cfDesc;
8282 db_with_cfh.db->DefaultColumnFamily()->GetDescriptor(&cfDesc);
8283 options.output_file_size_limit = cfDesc.options.target_file_size_base;
8284
8285 Status status =
8286 db_with_cfh.db->CompactFiles(options, files_to_compact, next_level);
8287 if (!status.ok()) {
8288 // This can fail for valid reasons including the operation was aborted
8289 // or a filename is invalid because background compaction removed it.
8290 // Having read the current cases for which an error is raised I prefer
8291 // not to figure out whether an exception should be thrown here.
8292 fprintf(stderr, "compact%d CompactFiles failed: %s\n", from_level,
8293 status.ToString().c_str());
8294 return false;
8295 }
8296 return true;
8297 }
8298
8299 void CompactLevel(int from_level) {
8300 if (db_.db != nullptr) {
8301 while (!CompactLevelHelper(db_, from_level)) WaitForCompaction();
8302 }
8303 for (auto& db_with_cfh : multi_dbs_) {
8304 while (!CompactLevelHelper(db_with_cfh, from_level)) WaitForCompaction();
8305 }
8306 }
8307 #endif
8308
8309 void Flush() {
8310 FlushOptions flush_opt;
8311 flush_opt.wait = true;
8312
8313 if (db_.db != nullptr) {
8314 Status s;
8315 if (FLAGS_num_column_families > 1) {
8316 s = db_.db->Flush(flush_opt, db_.cfh);
8317 } else {
8318 s = db_.db->Flush(flush_opt, db_.db->DefaultColumnFamily());
8319 }
8320
8321 if (!s.ok()) {
8322 fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str());
8323 exit(1);
8324 }
8325 } else {
8326 for (const auto& db_with_cfh : multi_dbs_) {
8327 Status s;
8328 if (FLAGS_num_column_families > 1) {
8329 s = db_with_cfh.db->Flush(flush_opt, db_with_cfh.cfh);
8330 } else {
8331 s = db_with_cfh.db->Flush(flush_opt,
8332 db_with_cfh.db->DefaultColumnFamily());
8333 }
8334
8335 if (!s.ok()) {
8336 fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str());
8337 exit(1);
8338 }
8339 }
8340 }
8341 fprintf(stdout, "flush memtable\n");
8342 }
8343
8344 void ResetStats() {
8345 if (db_.db != nullptr) {
8346 db_.db->ResetStats();
8347 }
8348 for (const auto& db_with_cfh : multi_dbs_) {
8349 db_with_cfh.db->ResetStats();
8350 }
8351 }
8352
8353 void PrintStatsHistory() {
8354 if (db_.db != nullptr) {
8355 PrintStatsHistoryImpl(db_.db, false);
8356 }
8357 for (const auto& db_with_cfh : multi_dbs_) {
8358 PrintStatsHistoryImpl(db_with_cfh.db, true);
8359 }
8360 }
8361
8362 void PrintStatsHistoryImpl(DB* db, bool print_header) {
8363 if (print_header) {
8364 fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
8365 }
8366
8367 std::unique_ptr<StatsHistoryIterator> shi;
8368 Status s =
8369 db->GetStatsHistory(0, std::numeric_limits<uint64_t>::max(), &shi);
8370 if (!s.ok()) {
8371 fprintf(stdout, "%s\n", s.ToString().c_str());
8372 return;
8373 }
8374 assert(shi);
8375 while (shi->Valid()) {
8376 uint64_t stats_time = shi->GetStatsTime();
8377 fprintf(stdout, "------ %s ------\n",
8378 TimeToHumanString(static_cast<int>(stats_time)).c_str());
8379 for (auto& entry : shi->GetStatsMap()) {
8380 fprintf(stdout, " %" PRIu64 " %s %" PRIu64 "\n", stats_time,
8381 entry.first.c_str(), entry.second);
8382 }
8383 shi->Next();
8384 }
8385 }
8386
8387 void PrintStats(const char* key) {
8388 if (db_.db != nullptr) {
8389 PrintStats(db_.db, key, false);
8390 }
8391 for (const auto& db_with_cfh : multi_dbs_) {
8392 PrintStats(db_with_cfh.db, key, true);
8393 }
8394 }
8395
8396 void PrintStats(DB* db, const char* key, bool print_header = false) {
8397 if (print_header) {
8398 fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
8399 }
8400 std::string stats;
8401 if (!db->GetProperty(key, &stats)) {
8402 stats = "(failed)";
8403 }
8404 fprintf(stdout, "\n%s\n", stats.c_str());
8405 }
8406
8407 void PrintStats(const std::vector<std::string>& keys) {
8408 if (db_.db != nullptr) {
8409 PrintStats(db_.db, keys);
8410 }
8411 for (const auto& db_with_cfh : multi_dbs_) {
8412 PrintStats(db_with_cfh.db, keys, true);
8413 }
8414 }
8415
8416 void PrintStats(DB* db, const std::vector<std::string>& keys,
8417 bool print_header = false) {
8418 if (print_header) {
8419 fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
8420 }
8421
8422 for (const auto& key : keys) {
8423 std::string stats;
8424 if (!db->GetProperty(key, &stats)) {
8425 stats = "(failed)";
8426 }
8427 fprintf(stdout, "%s: %s\n", key.c_str(), stats.c_str());
8428 }
8429 }
8430
8431 #ifndef ROCKSDB_LITE
8432
8433 void Replay(ThreadState* thread) {
8434 if (db_.db != nullptr) {
8435 Replay(thread, &db_);
8436 }
8437 }
8438
8439 void Replay(ThreadState* /*thread*/, DBWithColumnFamilies* db_with_cfh) {
8440 Status s;
8441 std::unique_ptr<TraceReader> trace_reader;
8442 s = NewFileTraceReader(FLAGS_env, EnvOptions(), FLAGS_trace_file,
8443 &trace_reader);
8444 if (!s.ok()) {
8445 fprintf(
8446 stderr,
8447 "Encountered an error creating a TraceReader from the trace file. "
8448 "Error: %s\n",
8449 s.ToString().c_str());
8450 exit(1);
8451 }
8452 std::unique_ptr<Replayer> replayer;
8453 s = db_with_cfh->db->NewDefaultReplayer(db_with_cfh->cfh,
8454 std::move(trace_reader), &replayer);
8455 if (!s.ok()) {
8456 fprintf(stderr,
8457 "Encountered an error creating a default Replayer. "
8458 "Error: %s\n",
8459 s.ToString().c_str());
8460 exit(1);
8461 }
8462 s = replayer->Prepare();
8463 if (!s.ok()) {
8464 fprintf(stderr, "Prepare for replay failed. Error: %s\n",
8465 s.ToString().c_str());
8466 }
8467 s = replayer->Replay(
8468 ReplayOptions(static_cast<uint32_t>(FLAGS_trace_replay_threads),
8469 FLAGS_trace_replay_fast_forward),
8470 nullptr);
8471 replayer.reset();
8472 if (s.ok()) {
8473 fprintf(stdout, "Replay completed from trace_file: %s\n",
8474 FLAGS_trace_file.c_str());
8475 } else {
8476 fprintf(stderr, "Replay failed. Error: %s\n", s.ToString().c_str());
8477 }
8478 }
8479
8480 void Backup(ThreadState* thread) {
8481 DB* db = SelectDB(thread);
8482 std::unique_ptr<BackupEngineOptions> engine_options(
8483 new BackupEngineOptions(FLAGS_backup_dir));
8484 Status s;
8485 BackupEngine* backup_engine;
8486 if (FLAGS_backup_rate_limit > 0) {
8487 engine_options->backup_rate_limiter.reset(NewGenericRateLimiter(
8488 FLAGS_backup_rate_limit, 100000 /* refill_period_us */,
8489 10 /* fairness */, RateLimiter::Mode::kAllIo));
8490 }
8491 // Build new backup of the entire DB
8492 engine_options->destroy_old_data = true;
8493 s = BackupEngine::Open(FLAGS_env, *engine_options, &backup_engine);
8494 assert(s.ok());
8495 s = backup_engine->CreateNewBackup(db);
8496 assert(s.ok());
8497 std::vector<BackupInfo> backup_info;
8498 backup_engine->GetBackupInfo(&backup_info);
8499 // Verify that a new backup is created
8500 assert(backup_info.size() == 1);
8501 }
8502
8503 void Restore(ThreadState* /* thread */) {
8504 std::unique_ptr<BackupEngineOptions> engine_options(
8505 new BackupEngineOptions(FLAGS_backup_dir));
8506 if (FLAGS_restore_rate_limit > 0) {
8507 engine_options->restore_rate_limiter.reset(NewGenericRateLimiter(
8508 FLAGS_restore_rate_limit, 100000 /* refill_period_us */,
8509 10 /* fairness */, RateLimiter::Mode::kAllIo));
8510 }
8511 BackupEngineReadOnly* backup_engine;
8512 Status s =
8513 BackupEngineReadOnly::Open(FLAGS_env, *engine_options, &backup_engine);
8514 assert(s.ok());
8515 s = backup_engine->RestoreDBFromLatestBackup(FLAGS_restore_dir,
8516 FLAGS_restore_dir);
8517 assert(s.ok());
8518 delete backup_engine;
8519 }
8520
8521 #endif // ROCKSDB_LITE
8522 };
8523
8524 int db_bench_tool(int argc, char** argv) {
8525 ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
8526 ConfigOptions config_options;
8527 static bool initialized = false;
8528 if (!initialized) {
8529 SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
8530 " [OPTIONS]...");
8531 SetVersionString(GetRocksVersionAsString(true));
8532 initialized = true;
8533 }
8534 ParseCommandLineFlags(&argc, &argv, true);
8535 FLAGS_compaction_style_e =
8536 (ROCKSDB_NAMESPACE::CompactionStyle)FLAGS_compaction_style;
8537 #ifndef ROCKSDB_LITE
8538 if (FLAGS_statistics && !FLAGS_statistics_string.empty()) {
8539 fprintf(stderr,
8540 "Cannot provide both --statistics and --statistics_string.\n");
8541 exit(1);
8542 }
8543 if (!FLAGS_statistics_string.empty()) {
8544 Status s = Statistics::CreateFromString(config_options,
8545 FLAGS_statistics_string, &dbstats);
8546 if (dbstats == nullptr) {
8547 fprintf(stderr,
8548 "No Statistics registered matching string: %s status=%s\n",
8549 FLAGS_statistics_string.c_str(), s.ToString().c_str());
8550 exit(1);
8551 }
8552 }
8553 #endif // ROCKSDB_LITE
8554 if (FLAGS_statistics) {
8555 dbstats = ROCKSDB_NAMESPACE::CreateDBStatistics();
8556 }
8557 if (dbstats) {
8558 dbstats->set_stats_level(static_cast<StatsLevel>(FLAGS_stats_level));
8559 }
8560 FLAGS_compaction_pri_e =
8561 (ROCKSDB_NAMESPACE::CompactionPri)FLAGS_compaction_pri;
8562
8563 std::vector<std::string> fanout = ROCKSDB_NAMESPACE::StringSplit(
8564 FLAGS_max_bytes_for_level_multiplier_additional, ',');
8565 for (size_t j = 0; j < fanout.size(); j++) {
8566 FLAGS_max_bytes_for_level_multiplier_additional_v.push_back(
8567 #ifndef CYGWIN
8568 std::stoi(fanout[j]));
8569 #else
8570 stoi(fanout[j]));
8571 #endif
8572 }
8573
8574 FLAGS_compression_type_e =
8575 StringToCompressionType(FLAGS_compression_type.c_str());
8576
8577 FLAGS_wal_compression_e =
8578 StringToCompressionType(FLAGS_wal_compression.c_str());
8579
8580 FLAGS_compressed_secondary_cache_compression_type_e = StringToCompressionType(
8581 FLAGS_compressed_secondary_cache_compression_type.c_str());
8582
8583 #ifndef ROCKSDB_LITE
8584 // Stacked BlobDB
8585 FLAGS_blob_db_compression_type_e =
8586 StringToCompressionType(FLAGS_blob_db_compression_type.c_str());
8587
8588 int env_opts = !FLAGS_env_uri.empty() + !FLAGS_fs_uri.empty();
8589 if (env_opts > 1) {
8590 fprintf(stderr, "Error: --env_uri and --fs_uri are mutually exclusive\n");
8591 exit(1);
8592 }
8593
8594 if (env_opts == 1) {
8595 Status s = Env::CreateFromUri(config_options, FLAGS_env_uri, FLAGS_fs_uri,
8596 &FLAGS_env, &env_guard);
8597 if (!s.ok()) {
8598 fprintf(stderr, "Failed creating env: %s\n", s.ToString().c_str());
8599 exit(1);
8600 }
8601 } else if (FLAGS_simulate_hdd || FLAGS_simulate_hybrid_fs_file != "") {
8602 //**TODO: Make the simulate fs something that can be loaded
8603 // from the ObjectRegistry...
8604 static std::shared_ptr<ROCKSDB_NAMESPACE::Env> composite_env =
8605 NewCompositeEnv(std::make_shared<SimulatedHybridFileSystem>(
8606 FileSystem::Default(), FLAGS_simulate_hybrid_fs_file,
8607 /*throughput_multiplier=*/
8608 int{FLAGS_simulate_hybrid_hdd_multipliers},
8609 /*is_full_fs_warm=*/FLAGS_simulate_hdd));
8610 FLAGS_env = composite_env.get();
8611 }
8612
8613 // Let -readonly imply -use_existing_db
8614 FLAGS_use_existing_db |= FLAGS_readonly;
8615 #endif // ROCKSDB_LITE
8616
8617 if (FLAGS_build_info) {
8618 std::string build_info;
8619 std::cout << GetRocksBuildInfoAsString(build_info, true) << std::endl;
8620 // Similar to --version, nothing else will be done when this flag is set
8621 exit(0);
8622 }
8623
8624 if (!FLAGS_seed) {
8625 uint64_t now = FLAGS_env->GetSystemClock()->NowMicros();
8626 seed_base = static_cast<int64_t>(now);
8627 fprintf(stdout, "Set seed to %" PRIu64 " because --seed was 0\n",
8628 seed_base);
8629 } else {
8630 seed_base = FLAGS_seed;
8631 }
8632
8633 if (FLAGS_use_existing_keys && !FLAGS_use_existing_db) {
8634 fprintf(stderr,
8635 "`-use_existing_db` must be true for `-use_existing_keys` to be "
8636 "settable\n");
8637 exit(1);
8638 }
8639
8640 if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NONE"))
8641 FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NONE;
8642 else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NORMAL"))
8643 FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NORMAL;
8644 else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "SEQUENTIAL"))
8645 FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::SEQUENTIAL;
8646 else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "WILLNEED"))
8647 FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::WILLNEED;
8648 else {
8649 fprintf(stdout, "Unknown compaction fadvice:%s\n",
8650 FLAGS_compaction_fadvice.c_str());
8651 exit(1);
8652 }
8653
8654 FLAGS_value_size_distribution_type_e =
8655 StringToDistributionType(FLAGS_value_size_distribution_type.c_str());
8656
8657 // Note options sanitization may increase thread pool sizes according to
8658 // max_background_flushes/max_background_compactions/max_background_jobs
8659 FLAGS_env->SetBackgroundThreads(FLAGS_num_high_pri_threads,
8660 ROCKSDB_NAMESPACE::Env::Priority::HIGH);
8661 FLAGS_env->SetBackgroundThreads(FLAGS_num_bottom_pri_threads,
8662 ROCKSDB_NAMESPACE::Env::Priority::BOTTOM);
8663 FLAGS_env->SetBackgroundThreads(FLAGS_num_low_pri_threads,
8664 ROCKSDB_NAMESPACE::Env::Priority::LOW);
8665
8666 // Choose a location for the test database if none given with --db=<path>
8667 if (FLAGS_db.empty()) {
8668 std::string default_db_path;
8669 FLAGS_env->GetTestDirectory(&default_db_path);
8670 default_db_path += "/dbbench";
8671 FLAGS_db = default_db_path;
8672 }
8673
8674 if (FLAGS_backup_dir.empty()) {
8675 FLAGS_backup_dir = FLAGS_db + "/backup";
8676 }
8677
8678 if (FLAGS_restore_dir.empty()) {
8679 FLAGS_restore_dir = FLAGS_db + "/restore";
8680 }
8681
8682 if (FLAGS_stats_interval_seconds > 0) {
8683 // When both are set then FLAGS_stats_interval determines the frequency
8684 // at which the timer is checked for FLAGS_stats_interval_seconds
8685 FLAGS_stats_interval = 1000;
8686 }
8687
8688 if (FLAGS_seek_missing_prefix && FLAGS_prefix_size <= 8) {
8689 fprintf(stderr, "prefix_size > 8 required by --seek_missing_prefix\n");
8690 exit(1);
8691 }
8692
8693 ROCKSDB_NAMESPACE::Benchmark benchmark;
8694 benchmark.Run();
8695
8696 #ifndef ROCKSDB_LITE
8697 if (FLAGS_print_malloc_stats) {
8698 std::string stats_string;
8699 ROCKSDB_NAMESPACE::DumpMallocStats(&stats_string);
8700 fprintf(stdout, "Malloc stats:\n%s\n", stats_string.c_str());
8701 }
8702 #endif // ROCKSDB_LITE
8703
8704 return 0;
8705 }
8706 } // namespace ROCKSDB_NAMESPACE
8707 #endif