]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/tools/db_bench_tool.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / tools / db_bench_tool.cc
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
7c673cae
FG
10#ifdef GFLAGS
11#ifdef NUMA
12#include <numa.h>
7c673cae
FG
13#endif
14#ifndef OS_WIN
15#include <unistd.h>
16#endif
17#include <fcntl.h>
7c673cae
FG
18#include <stdio.h>
19#include <stdlib.h>
20#include <sys/types.h>
1e59de90
TL
21#ifdef __APPLE__
22#include <mach/host_info.h>
23#include <mach/mach_host.h>
24#include <sys/sysctl.h>
25#endif
26#ifdef __FreeBSD__
27#include <sys/sysctl.h>
28#endif
7c673cae 29#include <atomic>
f67539c2 30#include <cinttypes>
7c673cae
FG
31#include <condition_variable>
32#include <cstddef>
1e59de90 33#include <iostream>
11fdf7f2 34#include <memory>
7c673cae 35#include <mutex>
1e59de90 36#include <queue>
7c673cae
FG
37#include <thread>
38#include <unordered_map>
39
f67539c2 40#include "db/db_impl/db_impl.h"
494da23a 41#include "db/malloc_stats.h"
7c673cae 42#include "db/version_set.h"
7c673cae
FG
43#include "monitoring/histogram.h"
44#include "monitoring/statistics.h"
494da23a 45#include "options/cf_options.h"
7c673cae
FG
46#include "port/port.h"
47#include "port/stack_trace.h"
48#include "rocksdb/cache.h"
1e59de90 49#include "rocksdb/convenience.h"
7c673cae
FG
50#include "rocksdb/db.h"
51#include "rocksdb/env.h"
52#include "rocksdb/filter_policy.h"
53#include "rocksdb/memtablerep.h"
54#include "rocksdb/options.h"
55#include "rocksdb/perf_context.h"
56#include "rocksdb/persistent_cache.h"
57#include "rocksdb/rate_limiter.h"
1e59de90 58#include "rocksdb/secondary_cache.h"
7c673cae
FG
59#include "rocksdb/slice.h"
60#include "rocksdb/slice_transform.h"
f67539c2 61#include "rocksdb/stats_history.h"
1e59de90
TL
62#include "rocksdb/table.h"
63#include "rocksdb/utilities/backup_engine.h"
7c673cae
FG
64#include "rocksdb/utilities/object_registry.h"
65#include "rocksdb/utilities/optimistic_transaction_db.h"
1e59de90 66#include "rocksdb/utilities/options_type.h"
7c673cae 67#include "rocksdb/utilities/options_util.h"
1e59de90
TL
68#ifndef ROCKSDB_LITE
69#include "rocksdb/utilities/replayer.h"
70#endif // ROCKSDB_LITE
7c673cae
FG
71#include "rocksdb/utilities/sim_cache.h"
72#include "rocksdb/utilities/transaction.h"
73#include "rocksdb/utilities/transaction_db.h"
74#include "rocksdb/write_batch.h"
f67539c2
TL
75#include "test_util/testutil.h"
76#include "test_util/transaction_test_util.h"
1e59de90 77#include "tools/simulated_hybrid_file_system.h"
11fdf7f2 78#include "util/cast_util.h"
7c673cae
FG
79#include "util/compression.h"
80#include "util/crc32c.h"
1e59de90 81#include "util/file_checksum_helper.h"
11fdf7f2 82#include "util/gflags_compat.h"
7c673cae
FG
83#include "util/mutexlock.h"
84#include "util/random.h"
85#include "util/stderr_logger.h"
86#include "util/string_util.h"
7c673cae
FG
87#include "util/xxhash.h"
88#include "utilities/blob_db/blob_db.h"
1e59de90 89#include "utilities/counted_fs.h"
7c673cae 90#include "utilities/merge_operators.h"
11fdf7f2 91#include "utilities/merge_operators/bytesxor.h"
f67539c2 92#include "utilities/merge_operators/sortlist.h"
7c673cae
FG
93#include "utilities/persistent_cache/block_cache_tier.h"
94
20effc67
TL
95#ifdef MEMKIND
96#include "memory/memkind_kmem_allocator.h"
97#endif
98
7c673cae
FG
99#ifdef OS_WIN
100#include <io.h> // open/close
101#endif
102
11fdf7f2
TL
103using GFLAGS_NAMESPACE::ParseCommandLineFlags;
104using GFLAGS_NAMESPACE::RegisterFlagValidator;
105using GFLAGS_NAMESPACE::SetUsageMessage;
1e59de90
TL
106using GFLAGS_NAMESPACE::SetVersionString;
107
108#ifdef ROCKSDB_LITE
109#define IF_ROCKSDB_LITE(Then, Else) Then
110#else
111#define IF_ROCKSDB_LITE(Then, Else) Else
112#endif
7c673cae
FG
113
114DEFINE_string(
115 benchmarks,
116 "fillseq,"
117 "fillseqdeterministic,"
118 "fillsync,"
119 "fillrandom,"
120 "filluniquerandomdeterministic,"
121 "overwrite,"
122 "readrandom,"
123 "newiterator,"
124 "newiteratorwhilewriting,"
125 "seekrandom,"
126 "seekrandomwhilewriting,"
127 "seekrandomwhilemerging,"
128 "readseq,"
129 "readreverse,"
130 "compact,"
11fdf7f2 131 "compactall,"
1e59de90
TL
132 "flush,"
133IF_ROCKSDB_LITE("",
134 "compact0,"
135 "compact1,"
136 "waitforcompaction,"
137)
7c673cae 138 "multireadrandom,"
494da23a 139 "mixgraph,"
7c673cae 140 "readseq,"
f67539c2 141 "readtorowcache,"
7c673cae
FG
142 "readtocache,"
143 "readreverse,"
144 "readwhilewriting,"
145 "readwhilemerging,"
11fdf7f2 146 "readwhilescanning,"
7c673cae
FG
147 "readrandomwriterandom,"
148 "updaterandom,"
11fdf7f2 149 "xorupdaterandom,"
20effc67 150 "approximatesizerandom,"
7c673cae
FG
151 "randomwithverify,"
152 "fill100K,"
153 "crc32c,"
154 "xxhash,"
1e59de90
TL
155 "xxhash64,"
156 "xxh3,"
7c673cae
FG
157 "compress,"
158 "uncompress,"
159 "acquireload,"
160 "fillseekseq,"
161 "randomtransaction,"
162 "randomreplacekeys,"
f67539c2 163 "timeseries,"
1e59de90
TL
164 "getmergeoperands,",
165 "readrandomoperands,"
166 "backup,"
167 "restore"
7c673cae
FG
168
169 "Comma-separated list of operations to run in the specified"
170 " order. Available benchmarks:\n"
171 "\tfillseq -- write N values in sequential key"
172 " order in async mode\n"
173 "\tfillseqdeterministic -- write N values in the specified"
174 " key order and keep the shape of the LSM tree\n"
175 "\tfillrandom -- write N values in random key order in async"
176 " mode\n"
177 "\tfilluniquerandomdeterministic -- write N values in a random"
178 " key order and keep the shape of the LSM tree\n"
1e59de90
TL
179 "\toverwrite -- overwrite N values in random key order in "
180 "async mode\n"
f67539c2 181 "\tfillsync -- write N/1000 values in random key order in "
7c673cae
FG
182 "sync mode\n"
183 "\tfill100K -- write N/1000 100K values in random order in"
184 " async mode\n"
185 "\tdeleteseq -- delete N keys in sequential order\n"
186 "\tdeleterandom -- delete N keys in random order\n"
187 "\treadseq -- read N times sequentially\n"
188 "\treadtocache -- 1 thread reading database sequentially\n"
189 "\treadreverse -- read N times in reverse order\n"
190 "\treadrandom -- read N times in random order\n"
191 "\treadmissing -- read N missing keys in random order\n"
192 "\treadwhilewriting -- 1 writer, N threads doing random "
193 "reads\n"
194 "\treadwhilemerging -- 1 merger, N threads doing random "
195 "reads\n"
11fdf7f2
TL
196 "\treadwhilescanning -- 1 thread doing full table scan, "
197 "N threads doing random reads\n"
7c673cae
FG
198 "\treadrandomwriterandom -- N threads doing random-read, "
199 "random-write\n"
7c673cae
FG
200 "\tupdaterandom -- N threads doing read-modify-write for random "
201 "keys\n"
11fdf7f2
TL
202 "\txorupdaterandom -- N threads doing read-XOR-write for "
203 "random keys\n"
7c673cae
FG
204 "\tappendrandom -- N threads doing read-modify-write with "
205 "growing values\n"
206 "\tmergerandom -- same as updaterandom/appendrandom using merge"
207 " operator. "
208 "Must be used with merge_operator\n"
209 "\treadrandommergerandom -- perform N random read-or-merge "
210 "operations. Must be used with merge_operator\n"
211 "\tnewiterator -- repeated iterator creation\n"
212 "\tseekrandom -- N random seeks, call Next seek_nexts times "
213 "per seek\n"
214 "\tseekrandomwhilewriting -- seekrandom and 1 thread doing "
215 "overwrite\n"
216 "\tseekrandomwhilemerging -- seekrandom and 1 thread doing "
217 "merge\n"
1e59de90
TL
218 "\tcrc32c -- repeated crc32c of <block size> data\n"
219 "\txxhash -- repeated xxHash of <block size> data\n"
220 "\txxhash64 -- repeated xxHash64 of <block size> data\n"
221 "\txxh3 -- repeated XXH3 of <block size> data\n"
7c673cae
FG
222 "\tacquireload -- load N*1000 times\n"
223 "\tfillseekseq -- write N values in sequential key, then read "
224 "them by seeking to each key\n"
225 "\trandomtransaction -- execute N random transactions and "
226 "verify correctness\n"
227 "\trandomreplacekeys -- randomly replaces N keys by deleting "
228 "the old version and putting the new version\n\n"
229 "\ttimeseries -- 1 writer generates time series data "
230 "and multiple readers doing random reads on id\n\n"
231 "Meta operations:\n"
11fdf7f2
TL
232 "\tcompact -- Compact the entire DB; If multiple, randomly choose one\n"
233 "\tcompactall -- Compact the entire DB\n"
1e59de90
TL
234IF_ROCKSDB_LITE("",
235 "\tcompact0 -- compact L0 into L1\n"
236 "\tcompact1 -- compact L1 into L2\n"
237 "\twaitforcompaction - pause until compaction is (probably) done\n"
238)
239 "\tflush - flush the memtable\n"
7c673cae
FG
240 "\tstats -- Print DB stats\n"
241 "\tresetstats -- Reset DB stats\n"
242 "\tlevelstats -- Print the number of files and bytes per level\n"
1e59de90 243 "\tmemstats -- Print memtable stats\n"
7c673cae 244 "\tsstables -- Print sstable info\n"
11fdf7f2 245 "\theapprofile -- Dump a heap profile (if supported by this port)\n"
1e59de90 246IF_ROCKSDB_LITE("",
f67539c2 247 "\treplay -- replay the trace file specified with trace_file\n"
1e59de90 248)
f67539c2
TL
249 "\tgetmergeoperands -- Insert lots of merge records which are a list of "
250 "sorted ints for a key and then compare performance of lookup for another "
1e59de90
TL
251 "key by doing a Get followed by binary searching in the large sorted list "
252 "vs doing a GetMergeOperands and binary searching in the operands which "
253 "are sorted sub-lists. The MergeOperator used is sortlist.h\n"
254 "\treadrandomoperands -- read random keys using `GetMergeOperands()`. An "
255 "operation includes a rare but possible retry in case it got "
256 "`Status::Incomplete()`. This happens upon encountering more keys than "
257 "have ever been seen by the thread (or eight initially)\n"
258 "\tbackup -- Create a backup of the current DB and verify that a new backup is corrected. "
259 "Rate limit can be specified through --backup_rate_limit\n"
260 "\trestore -- Restore the DB from the latest backup available, rate limit can be specified through --restore_rate_limit\n");
7c673cae
FG
261
262DEFINE_int64(num, 1000000, "Number of key/values to place in database");
263
264DEFINE_int64(numdistinct, 1000,
265 "Number of distinct keys to use. Used in RandomWithVerify to "
266 "read/write on fewer keys so that gets are more likely to find the"
267 " key and puts are more likely to update the same key");
268
269DEFINE_int64(merge_keys, -1,
270 "Number of distinct keys to use for MergeRandom and "
271 "ReadRandomMergeRandom. "
272 "If negative, there will be FLAGS_num keys.");
273DEFINE_int32(num_column_families, 1, "Number of Column Families to use.");
274
275DEFINE_int32(
276 num_hot_column_families, 0,
277 "Number of Hot Column Families. If more than 0, only write to this "
278 "number of column families. After finishing all the writes to them, "
279 "create new set of column families and insert to them. Only used "
280 "when num_column_families > 1.");
281
11fdf7f2
TL
282DEFINE_string(column_family_distribution, "",
283 "Comma-separated list of percentages, where the ith element "
284 "indicates the probability of an op using the ith column family. "
285 "The number of elements must be `num_hot_column_families` if "
286 "specified; otherwise, it must be `num_column_families`. The "
287 "sum of elements must be 100. E.g., if `num_column_families=4`, "
288 "and `num_hot_column_families=0`, a valid list could be "
289 "\"10,20,30,40\".");
290
1e59de90
TL
291DEFINE_int64(reads, -1,
292 "Number of read operations to do. "
7c673cae
FG
293 "If negative, do FLAGS_num reads.");
294
1e59de90
TL
295DEFINE_int64(deletes, -1,
296 "Number of delete operations to do. "
7c673cae
FG
297 "If negative, do FLAGS_num deletions.");
298
299DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality");
300
1e59de90
TL
301DEFINE_int64(seed, 0,
302 "Seed base for random number generators. "
303 "When 0 it is derived from the current time.");
304static int64_t seed_base;
7c673cae
FG
305
306DEFINE_int32(threads, 1, "Number of concurrent threads to run.");
307
1e59de90
TL
308DEFINE_int32(duration, 0,
309 "Time in seconds for the random-ops tests to run."
7c673cae
FG
310 " When 0 then num & reads determine the test duration");
311
f67539c2
TL
312DEFINE_string(value_size_distribution_type, "fixed",
313 "Value size distribution type: fixed, uniform, normal");
314
315DEFINE_int32(value_size, 100, "Size of each value in fixed distribution");
316static unsigned int value_size = 100;
317
318DEFINE_int32(value_size_min, 100, "Min size of random value");
319
320DEFINE_int32(value_size_max, 102400, "Max size of random value");
7c673cae
FG
321
322DEFINE_int32(seek_nexts, 0,
323 "How many times to call Next() after Seek() in "
324 "fillseekseq, seekrandom, seekrandomwhilewriting and "
325 "seekrandomwhilemerging");
326
327DEFINE_bool(reverse_iterator, false,
328 "When true use Prev rather than Next for iterators that do "
329 "Seek and then Next");
330
1e59de90
TL
331DEFINE_bool(auto_prefix_mode, false, "Set auto_prefix_mode for seek benchmark");
332
494da23a
TL
333DEFINE_int64(max_scan_distance, 0,
334 "Used to define iterate_upper_bound (or iterate_lower_bound "
335 "if FLAGS_reverse_iterator is set to true) when value is nonzero");
336
7c673cae
FG
337DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator");
338
7c673cae
FG
339DEFINE_int64(batch_size, 1, "Batch size");
340
11fdf7f2 341static bool ValidateKeySize(const char* /*flagname*/, int32_t /*value*/) {
7c673cae
FG
342 return true;
343}
344
345static bool ValidateUint32Range(const char* flagname, uint64_t value) {
346 if (value > std::numeric_limits<uint32_t>::max()) {
347 fprintf(stderr, "Invalid value for --%s: %lu, overflow\n", flagname,
348 (unsigned long)value);
349 return false;
350 }
351 return true;
352}
353
354DEFINE_int32(key_size, 16, "size of each key");
355
20effc67
TL
356DEFINE_int32(user_timestamp_size, 0,
357 "number of bytes in a user-defined timestamp");
358
7c673cae
FG
359DEFINE_int32(num_multi_db, 0,
360 "Number of DBs used in the benchmark. 0 means single DB.");
361
1e59de90
TL
362DEFINE_double(compression_ratio, 0.5,
363 "Arrange to generate values that shrink to this fraction of "
364 "their original size after compression");
365
366DEFINE_double(
367 overwrite_probability, 0.0,
368 "Used in 'filluniquerandom' benchmark: for each write operation, "
369 "we give a probability to perform an overwrite instead. The key used for "
370 "the overwrite is randomly chosen from the last 'overwrite_window_size' "
371 "keys previously inserted into the DB. "
372 "Valid overwrite_probability values: [0.0, 1.0].");
373
374DEFINE_uint32(overwrite_window_size, 1,
375 "Used in 'filluniquerandom' benchmark. For each write operation,"
376 " when the overwrite_probability flag is set by the user, the "
377 "key used to perform an overwrite is randomly chosen from the "
378 "last 'overwrite_window_size' keys previously inserted into DB. "
379 "Warning: large values can affect throughput. "
380 "Valid overwrite_window_size values: [1, kMaxUint32].");
381
382DEFINE_uint64(
383 disposable_entries_delete_delay, 0,
384 "Minimum delay in microseconds for the series of Deletes "
385 "to be issued. When 0 the insertion of the last disposable entry is "
386 "immediately followed by the issuance of the Deletes. "
387 "(only compatible with fillanddeleteuniquerandom benchmark).");
388
389DEFINE_uint64(disposable_entries_batch_size, 0,
390 "Number of consecutively inserted disposable KV entries "
391 "that will be deleted after 'delete_delay' microseconds. "
392 "A series of Deletes is always issued once all the "
393 "disposable KV entries it targets have been inserted "
394 "into the DB. When 0 no deletes are issued and a "
395 "regular 'filluniquerandom' benchmark occurs. "
396 "(only compatible with fillanddeleteuniquerandom benchmark)");
397
398DEFINE_int32(disposable_entries_value_size, 64,
399 "Size of the values (in bytes) of the entries targeted by "
400 "selective deletes. "
401 "(only compatible with fillanddeleteuniquerandom benchmark)");
402
403DEFINE_uint64(
404 persistent_entries_batch_size, 0,
405 "Number of KV entries being inserted right before the deletes "
406 "targeting the disposable KV entries are issued. These "
407 "persistent keys are not targeted by the deletes, and will always "
408 "remain valid in the DB. (only compatible with "
409 "--benchmarks='fillanddeleteuniquerandom' "
410 "and used when--disposable_entries_batch_size is > 0).");
411
412DEFINE_int32(persistent_entries_value_size, 64,
413 "Size of the values (in bytes) of the entries not targeted by "
414 "deletes. (only compatible with "
415 "--benchmarks='fillanddeleteuniquerandom' "
416 "and used when--disposable_entries_batch_size is > 0).");
7c673cae
FG
417
418DEFINE_double(read_random_exp_range, 0.0,
419 "Read random's key will be generated using distribution of "
420 "num * exp(-r) where r is uniform number from 0 to this value. "
421 "The larger the number is, the more skewed the reads are. "
422 "Only used in readrandom and multireadrandom benchmarks.");
423
424DEFINE_bool(histogram, false, "Print histogram of operation timings");
425
1e59de90
TL
426DEFINE_bool(confidence_interval_only, false,
427 "Print 95% confidence interval upper and lower bounds only for "
428 "aggregate stats.");
429
7c673cae
FG
430DEFINE_bool(enable_numa, false,
431 "Make operations aware of NUMA architecture and bind memory "
432 "and cpus corresponding to nodes together. In NUMA, memory "
433 "in same node as CPUs are closer when compared to memory in "
434 "other nodes. Reads can be faster when the process is bound to "
435 "CPU and memory of same node. Use \"$numactl --hardware\" command "
436 "to see NUMA memory architecture.");
437
f67539c2
TL
438DEFINE_int64(db_write_buffer_size,
439 ROCKSDB_NAMESPACE::Options().db_write_buffer_size,
7c673cae
FG
440 "Number of bytes to buffer in all memtables before compacting");
441
11fdf7f2
TL
442DEFINE_bool(cost_write_buffer_to_cache, false,
443 "The usage of memtable is costed to the block cache");
444
1e59de90
TL
445DEFINE_int64(arena_block_size, ROCKSDB_NAMESPACE::Options().arena_block_size,
446 "The size, in bytes, of one block in arena memory allocation.");
447
f67539c2 448DEFINE_int64(write_buffer_size, ROCKSDB_NAMESPACE::Options().write_buffer_size,
7c673cae
FG
449 "Number of bytes to buffer in memtable before compacting");
450
451DEFINE_int32(max_write_buffer_number,
f67539c2 452 ROCKSDB_NAMESPACE::Options().max_write_buffer_number,
7c673cae 453 "The number of in-memory memtables. Each memtable is of size"
11fdf7f2 454 " write_buffer_size bytes.");
7c673cae
FG
455
456DEFINE_int32(min_write_buffer_number_to_merge,
f67539c2 457 ROCKSDB_NAMESPACE::Options().min_write_buffer_number_to_merge,
7c673cae
FG
458 "The minimum number of write buffers that will be merged together"
459 "before writing to storage. This is cheap because it is an"
460 "in-memory merge. If this feature is not enabled, then all these"
461 "write buffers are flushed to L0 as separate files and this "
462 "increases read amplification because a get request has to check"
463 " in all of these files. Also, an in-memory merge may result in"
464 " writing less data to storage if there are duplicate records "
465 " in each of these individual write buffers.");
466
467DEFINE_int32(max_write_buffer_number_to_maintain,
f67539c2 468 ROCKSDB_NAMESPACE::Options().max_write_buffer_number_to_maintain,
7c673cae
FG
469 "The total maximum number of write buffers to maintain in memory "
470 "including copies of buffers that have already been flushed. "
471 "Unlike max_write_buffer_number, this parameter does not affect "
472 "flushing. This controls the minimum amount of write history "
473 "that will be available in memory for conflict checking when "
474 "Transactions are used. If this value is too low, some "
475 "transactions may fail at commit time due to not being able to "
476 "determine whether there were any write conflicts. Setting this "
477 "value to 0 will cause write buffers to be freed immediately "
478 "after they are flushed. If this value is set to -1, "
479 "'max_write_buffer_number' will be used.");
480
f67539c2
TL
481DEFINE_int64(max_write_buffer_size_to_maintain,
482 ROCKSDB_NAMESPACE::Options().max_write_buffer_size_to_maintain,
483 "The total maximum size of write buffers to maintain in memory "
484 "including copies of buffers that have already been flushed. "
485 "Unlike max_write_buffer_number, this parameter does not affect "
486 "flushing. This controls the minimum amount of write history "
487 "that will be available in memory for conflict checking when "
488 "Transactions are used. If this value is too low, some "
489 "transactions may fail at commit time due to not being able to "
490 "determine whether there were any write conflicts. Setting this "
491 "value to 0 will cause write buffers to be freed immediately "
492 "after they are flushed. If this value is set to -1, "
493 "'max_write_buffer_number' will be used.");
494
11fdf7f2 495DEFINE_int32(max_background_jobs,
f67539c2 496 ROCKSDB_NAMESPACE::Options().max_background_jobs,
11fdf7f2
TL
497 "The maximum number of concurrent background jobs that can occur "
498 "in parallel.");
499
500DEFINE_int32(num_bottom_pri_threads, 0,
501 "The number of threads in the bottom-priority thread pool (used "
502 "by universal compaction only).");
503
504DEFINE_int32(num_high_pri_threads, 0,
505 "The maximum number of concurrent background compactions"
506 " that can occur in parallel.");
507
508DEFINE_int32(num_low_pri_threads, 0,
509 "The maximum number of concurrent background compactions"
510 " that can occur in parallel.");
511
7c673cae 512DEFINE_int32(max_background_compactions,
f67539c2 513 ROCKSDB_NAMESPACE::Options().max_background_compactions,
7c673cae
FG
514 "The maximum number of concurrent background compactions"
515 " that can occur in parallel.");
516
7c673cae
FG
517DEFINE_uint64(subcompactions, 1,
518 "Maximum number of subcompactions to divide L0-L1 compactions "
519 "into.");
1e59de90
TL
520static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) =
521 RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range);
7c673cae
FG
522
523DEFINE_int32(max_background_flushes,
f67539c2 524 ROCKSDB_NAMESPACE::Options().max_background_flushes,
7c673cae
FG
525 "The maximum number of concurrent background flushes"
526 " that can occur in parallel.");
527
f67539c2
TL
528static ROCKSDB_NAMESPACE::CompactionStyle FLAGS_compaction_style_e;
529DEFINE_int32(compaction_style,
530 (int32_t)ROCKSDB_NAMESPACE::Options().compaction_style,
7c673cae
FG
531 "style of compaction: level-based, universal and fifo");
532
f67539c2
TL
533static ROCKSDB_NAMESPACE::CompactionPri FLAGS_compaction_pri_e;
534DEFINE_int32(compaction_pri,
535 (int32_t)ROCKSDB_NAMESPACE::Options().compaction_pri,
7c673cae
FG
536 "priority of files to compaction: by size or by data age");
537
538DEFINE_int32(universal_size_ratio, 0,
1e59de90
TL
539 "Percentage flexibility while comparing file size "
540 "(for universal compaction only).");
7c673cae 541
1e59de90
TL
542DEFINE_int32(universal_min_merge_width, 0,
543 "The minimum number of files in a single compaction run "
544 "(for universal compaction only).");
7c673cae 545
1e59de90
TL
546DEFINE_int32(universal_max_merge_width, 0,
547 "The max number of files to compact in universal style "
548 "compaction");
7c673cae
FG
549
550DEFINE_int32(universal_max_size_amplification_percent, 0,
551 "The max size amplification for universal style compaction");
552
553DEFINE_int32(universal_compression_size_percent, -1,
554 "The percentage of the database to compress for universal "
555 "compaction. -1 means compress everything.");
556
557DEFINE_bool(universal_allow_trivial_move, false,
558 "Allow trivial move in universal compaction.");
559
1e59de90
TL
560DEFINE_bool(universal_incremental, false,
561 "Enable incremental compactions in universal compaction.");
562
7c673cae
FG
563DEFINE_int64(cache_size, 8 << 20, // 8MB
564 "Number of bytes to use as a cache of uncompressed data");
565
1e59de90 566DEFINE_int32(cache_numshardbits, -1,
7c673cae
FG
567 "Number of shards for the block cache"
568 " is 2 ** cache_numshardbits. Negative means use default settings."
569 " This is applied only if FLAGS_cache_size is non-negative.");
570
571DEFINE_double(cache_high_pri_pool_ratio, 0.0,
572 "Ratio of block cache reserve for high pri blocks. "
573 "If > 0.0, we also enable "
574 "cache_index_and_filter_blocks_with_high_priority.");
575
1e59de90
TL
576DEFINE_double(cache_low_pri_pool_ratio, 0.0,
577 "Ratio of block cache reserve for low pri blocks.");
578
579DEFINE_string(cache_type, "lru_cache", "Type of block cache.");
580
581DEFINE_bool(use_compressed_secondary_cache, false,
582 "Use the CompressedSecondaryCache as the secondary cache.");
583
584DEFINE_int64(compressed_secondary_cache_size, 8 << 20, // 8MB
585 "Number of bytes to use as a cache of data");
586
587DEFINE_int32(compressed_secondary_cache_numshardbits, 6,
588 "Number of shards for the block cache"
589 " is 2 ** compressed_secondary_cache_numshardbits."
590 " Negative means use default settings."
591 " This is applied only if FLAGS_cache_size is non-negative.");
592
593DEFINE_double(compressed_secondary_cache_high_pri_pool_ratio, 0.0,
594 "Ratio of block cache reserve for high pri blocks. "
595 "If > 0.0, we also enable "
596 "cache_index_and_filter_blocks_with_high_priority.");
597
598DEFINE_double(compressed_secondary_cache_low_pri_pool_ratio, 0.0,
599 "Ratio of block cache reserve for low pri blocks.");
600
601DEFINE_string(compressed_secondary_cache_compression_type, "lz4",
602 "The compression algorithm to use for large "
603 "values stored in CompressedSecondaryCache.");
604static enum ROCKSDB_NAMESPACE::CompressionType
605 FLAGS_compressed_secondary_cache_compression_type_e =
606 ROCKSDB_NAMESPACE::kLZ4Compression;
607
608DEFINE_uint32(
609 compressed_secondary_cache_compress_format_version, 2,
610 "compress_format_version can have two values: "
611 "compress_format_version == 1 -- decompressed size is not included"
612 " in the block header."
613 "compress_format_version == 2 -- decompressed size is included"
614 " in the block header in varint32 format.");
7c673cae
FG
615
616DEFINE_int64(simcache_size, -1,
617 "Number of bytes to use as a simcache of "
618 "uncompressed data. Nagative value disables simcache.");
619
620DEFINE_bool(cache_index_and_filter_blocks, false,
621 "Cache index/filter blocks in block cache.");
622
1e59de90
TL
623DEFINE_bool(use_cache_jemalloc_no_dump_allocator, false,
624 "Use JemallocNodumpAllocator for block/blob cache.");
625
20effc67 626DEFINE_bool(use_cache_memkind_kmem_allocator, false,
1e59de90 627 "Use memkind kmem allocator for block/blob cache.");
20effc67 628
11fdf7f2
TL
629DEFINE_bool(partition_index_and_filters, false,
630 "Partition index and filter blocks.");
631
632DEFINE_bool(partition_index, false, "Partition index blocks");
633
20effc67
TL
634DEFINE_bool(index_with_first_key, false, "Include first key in the index");
635
636DEFINE_bool(
637 optimize_filters_for_memory,
638 ROCKSDB_NAMESPACE::BlockBasedTableOptions().optimize_filters_for_memory,
639 "Minimize memory footprint of filters");
640
641DEFINE_int64(
642 index_shortening_mode, 2,
643 "mode to shorten index: 0 for no shortening; 1 for only shortening "
644 "separaters; 2 for shortening shortening and successor");
645
11fdf7f2 646DEFINE_int64(metadata_block_size,
f67539c2 647 ROCKSDB_NAMESPACE::BlockBasedTableOptions().metadata_block_size,
11fdf7f2
TL
648 "Max partition size when partitioning index/filters");
649
650// The default reduces the overhead of reading time with flash. With HDD, which
651// offers much less throughput, however, this number better to be set to 1.
652DEFINE_int32(ops_between_duration_checks, 1000,
653 "Check duration limit every x ops");
654
7c673cae
FG
655DEFINE_bool(pin_l0_filter_and_index_blocks_in_cache, false,
656 "Pin index/filter blocks of L0 files in block cache.");
657
11fdf7f2
TL
658DEFINE_bool(
659 pin_top_level_index_and_filter, false,
660 "Pin top-level index of partitioned index/filter blocks in block cache.");
661
7c673cae 662DEFINE_int32(block_size,
f67539c2
TL
663 static_cast<int32_t>(
664 ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_size),
7c673cae
FG
665 "Number of bytes in a block.");
666
f67539c2
TL
667DEFINE_int32(format_version,
668 static_cast<int32_t>(
669 ROCKSDB_NAMESPACE::BlockBasedTableOptions().format_version),
670 "Format version of SST files.");
11fdf7f2 671
7c673cae 672DEFINE_int32(block_restart_interval,
f67539c2 673 ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_restart_interval,
7c673cae
FG
674 "Number of keys between restart points "
675 "for delta encoding of keys in data block.");
676
f67539c2
TL
677DEFINE_int32(
678 index_block_restart_interval,
679 ROCKSDB_NAMESPACE::BlockBasedTableOptions().index_block_restart_interval,
680 "Number of keys between restart points "
681 "for delta encoding of keys in index block.");
7c673cae
FG
682
683DEFINE_int32(read_amp_bytes_per_bit,
f67539c2 684 ROCKSDB_NAMESPACE::BlockBasedTableOptions().read_amp_bytes_per_bit,
7c673cae
FG
685 "Number of bytes per bit to be used in block read-amp bitmap");
686
f67539c2
TL
687DEFINE_bool(
688 enable_index_compression,
689 ROCKSDB_NAMESPACE::BlockBasedTableOptions().enable_index_compression,
690 "Compress the index block");
11fdf7f2 691
f67539c2
TL
692DEFINE_bool(block_align,
693 ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_align,
11fdf7f2
TL
694 "Align data blocks on page size");
695
1e59de90
TL
696DEFINE_int64(prepopulate_block_cache, 0,
697 "Pre-populate hot/warm blocks in block cache. 0 to disable and 1 "
698 "to insert during flush");
699
11fdf7f2
TL
700DEFINE_bool(use_data_block_hash_index, false,
701 "if use kDataBlockBinaryAndHash "
702 "instead of kDataBlockBinarySearch. "
703 "This is valid if only we use BlockTable");
704
705DEFINE_double(data_block_hash_table_util_ratio, 0.75,
706 "util ratio for data block hash index table. "
707 "This is only valid if use_data_block_hash_index is "
708 "set to true");
709
7c673cae
FG
710DEFINE_int64(compressed_cache_size, -1,
711 "Number of bytes to use as a cache of compressed data.");
712
713DEFINE_int64(row_cache_size, 0,
714 "Number of bytes to use as a cache of individual rows"
715 " (0 = disabled).");
716
f67539c2 717DEFINE_int32(open_files, ROCKSDB_NAMESPACE::Options().max_open_files,
7c673cae
FG
718 "Maximum number of files to keep open at the same time"
719 " (use default if == 0)");
720
f67539c2
TL
721DEFINE_int32(file_opening_threads,
722 ROCKSDB_NAMESPACE::Options().max_file_opening_threads,
7c673cae
FG
723 "If open_files is set to -1, this option set the number of "
724 "threads that will be used to open files during DB::Open()");
725
7c673cae
FG
726DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size");
727
f67539c2
TL
728DEFINE_int32(log_readahead_size, 0, "WAL and manifest readahead size");
729
7c673cae
FG
730DEFINE_int32(random_access_max_buffer_size, 1024 * 1024,
731 "Maximum windows randomaccess buffer size");
732
733DEFINE_int32(writable_file_max_buffer_size, 1024 * 1024,
734 "Maximum write buffer for Writable File");
735
1e59de90
TL
736DEFINE_int32(bloom_bits, -1,
737 "Bloom filter bits per key. Negative means use default."
738 "Zero disables.");
739
740DEFINE_bool(use_ribbon_filter, false, "Use Ribbon instead of Bloom filter");
741
7c673cae
FG
742DEFINE_double(memtable_bloom_size_ratio, 0,
743 "Ratio of memtable size used for bloom filter. 0 means no bloom "
744 "filter.");
494da23a
TL
745DEFINE_bool(memtable_whole_key_filtering, false,
746 "Try to use whole key bloom filter in memtables.");
7c673cae
FG
747DEFINE_bool(memtable_use_huge_page, false,
748 "Try to use huge page in memtables.");
749
1e59de90
TL
750DEFINE_bool(whole_key_filtering,
751 ROCKSDB_NAMESPACE::BlockBasedTableOptions().whole_key_filtering,
752 "Use whole keys (in addition to prefixes) in SST bloom filter.");
753
754DEFINE_bool(use_existing_db, false,
755 "If true, do not destroy the existing database. If you set this "
756 "flag and also specify a benchmark that wants a fresh database, "
757 "that benchmark will fail.");
7c673cae 758
494da23a
TL
759DEFINE_bool(use_existing_keys, false,
760 "If true, uses existing keys in the DB, "
761 "rather than generating new ones. This involves some startup "
762 "latency to load all keys into memory. It is supported for the "
763 "same read/overwrite benchmarks as `-use_existing_db=true`, which "
764 "must also be set for this flag to be enabled. When this flag is "
765 "set, the value for `-num` will be ignored.");
766
7c673cae
FG
767DEFINE_bool(show_table_properties, false,
768 "If true, then per-level table"
769 " properties will be printed on every stats-interval when"
770 " stats_interval is set and stats_per_interval is on.");
771
772DEFINE_string(db, "", "Use the db with the following name.");
773
1e59de90
TL
774DEFINE_bool(progress_reports, true,
775 "If true, db_bench will report number of finished operations.");
776
7c673cae
FG
777// Read cache flags
778
779DEFINE_string(read_cache_path, "",
780 "If not empty string, a read cache will be used in this path");
781
782DEFINE_int64(read_cache_size, 4LL * 1024 * 1024 * 1024,
783 "Maximum size of the read cache");
784
785DEFINE_bool(read_cache_direct_write, true,
786 "Whether to use Direct IO for writing to the read cache");
787
788DEFINE_bool(read_cache_direct_read, true,
789 "Whether to use Direct IO for reading from read cache");
790
11fdf7f2
TL
791DEFINE_bool(use_keep_filter, false, "Whether to use a noop compaction filter");
792
7c673cae
FG
// gflags validator for --cache_numshardbits and friends.
//
// Accepts any value below 20 (caches are sharded into 2^numshardbits
// pieces, so larger values are rejected as excessive). Note there is
// deliberately no lower bound here: negative values mean "use default"
// and must pass validation.
//
// Returns true if the flag value is acceptable; otherwise prints a
// diagnostic to stderr and returns false.
static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) {
  if (value < 20) {
    return true;
  }
  fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n", flagname,
          value);
  return false;
}
801
11fdf7f2 802DEFINE_bool(verify_checksum, true,
1e59de90
TL
803 "Verify checksum for every block read from storage");
804
805DEFINE_int32(checksum_type,
806 ROCKSDB_NAMESPACE::BlockBasedTableOptions().checksum,
807 "ChecksumType as an int");
7c673cae
FG
808
809DEFINE_bool(statistics, false, "Database statistics");
f67539c2 810DEFINE_int32(stats_level, ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers,
494da23a 811 "stats level for statistics");
7c673cae 812DEFINE_string(statistics_string, "", "Serialized statistics string");
f67539c2 813static class std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats;
7c673cae 814
1e59de90
TL
815DEFINE_int64(writes, -1,
816 "Number of write operations to do. If negative, do --num reads.");
7c673cae 817
1e59de90
TL
818DEFINE_bool(finish_after_writes, false,
819 "Write thread terminates after all writes are finished");
7c673cae
FG
820
821DEFINE_bool(sync, false, "Sync all writes to disk");
822
823DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
824
825DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
826
1e59de90
TL
827DEFINE_bool(manual_wal_flush, false,
828 "If true, buffer WAL until buffer is full or a manual FlushWAL().");
829
830DEFINE_string(wal_compression, "none",
831 "Algorithm to use for WAL compression. none to disable.");
832static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_wal_compression_e =
833 ROCKSDB_NAMESPACE::kNoCompression;
834
7c673cae
FG
835DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL");
836
837DEFINE_string(truth_db, "/dev/shm/truth_db/dbbench",
838 "Truth key/values used when using verify");
839
840DEFINE_int32(num_levels, 7, "The total number of levels");
841
f67539c2
TL
842DEFINE_int64(target_file_size_base,
843 ROCKSDB_NAMESPACE::Options().target_file_size_base,
7c673cae
FG
844 "Target file size at level-1");
845
846DEFINE_int32(target_file_size_multiplier,
f67539c2 847 ROCKSDB_NAMESPACE::Options().target_file_size_multiplier,
7c673cae
FG
848 "A multiplier to compute target level-N file size (N >= 2)");
849
850DEFINE_uint64(max_bytes_for_level_base,
f67539c2 851 ROCKSDB_NAMESPACE::Options().max_bytes_for_level_base,
7c673cae
FG
852 "Max bytes for level-1");
853
854DEFINE_bool(level_compaction_dynamic_level_bytes, false,
855 "Whether level size base is dynamic");
856
857DEFINE_double(max_bytes_for_level_multiplier, 10,
858 "A multiplier to compute max bytes for level-N (N >= 2)");
859
860static std::vector<int> FLAGS_max_bytes_for_level_multiplier_additional_v;
861DEFINE_string(max_bytes_for_level_multiplier_additional, "",
862 "A vector that specifies additional fanout per level");
863
864DEFINE_int32(level0_stop_writes_trigger,
f67539c2 865 ROCKSDB_NAMESPACE::Options().level0_stop_writes_trigger,
1e59de90 866 "Number of files in level-0 that will trigger put stop.");
7c673cae
FG
867
868DEFINE_int32(level0_slowdown_writes_trigger,
f67539c2 869 ROCKSDB_NAMESPACE::Options().level0_slowdown_writes_trigger,
1e59de90 870 "Number of files in level-0 that will slow down writes.");
7c673cae
FG
871
872DEFINE_int32(level0_file_num_compaction_trigger,
f67539c2 873 ROCKSDB_NAMESPACE::Options().level0_file_num_compaction_trigger,
1e59de90 874 "Number of files in level-0 when compactions start.");
7c673cae 875
20effc67
TL
876DEFINE_uint64(periodic_compaction_seconds,
877 ROCKSDB_NAMESPACE::Options().periodic_compaction_seconds,
878 "Files older than this will be picked up for compaction and"
879 " rewritten to the same level");
880
1e59de90
TL
881DEFINE_uint64(ttl_seconds, ROCKSDB_NAMESPACE::Options().ttl, "Set options.ttl");
882
7c673cae 883static bool ValidateInt32Percent(const char* flagname, int32_t value) {
1e59de90
TL
884 if (value <= 0 || value >= 100) {
885 fprintf(stderr, "Invalid value for --%s: %d, 0< pct <100 \n", flagname,
886 value);
7c673cae
FG
887 return false;
888 }
889 return true;
890}
1e59de90
TL
891DEFINE_int32(readwritepercent, 90,
892 "Ratio of reads to reads/writes (expressed as percentage) for "
893 "the ReadRandomWriteRandom workload. The default value 90 means "
894 "90% operations out of all reads and writes operations are "
895 "reads. In other words, 9 gets for every 1 put.");
896
897DEFINE_int32(mergereadpercent, 70,
898 "Ratio of merges to merges&reads (expressed as percentage) for "
899 "the ReadRandomMergeRandom workload. The default value 70 means "
900 "70% out of all read and merge operations are merges. In other "
901 "words, 7 merges for every 3 gets.");
902
903DEFINE_int32(deletepercent, 2,
904 "Percentage of deletes out of reads/writes/deletes (used in "
905 "RandomWithVerify only). RandomWithVerify "
7c673cae
FG
906 "calculates writepercent as (100 - FLAGS_readwritepercent - "
907 "deletepercent), so deletepercent must be smaller than (100 - "
908 "FLAGS_readwritepercent)");
909
1e59de90
TL
910DEFINE_bool(optimize_filters_for_hits,
911 ROCKSDB_NAMESPACE::Options().optimize_filters_for_hits,
7c673cae
FG
912 "Optimizes bloom filters for workloads for most lookups return "
913 "a value. For now this doesn't create bloom filters for the max "
914 "level of the LSM to reduce metadata that should fit in RAM. ");
915
1e59de90
TL
916DEFINE_bool(paranoid_checks, ROCKSDB_NAMESPACE::Options().paranoid_checks,
917 "RocksDB will aggressively check consistency of the data.");
918
919DEFINE_bool(force_consistency_checks,
920 ROCKSDB_NAMESPACE::Options().force_consistency_checks,
921 "Runs consistency checks on the LSM every time a change is "
922 "applied.");
923
924DEFINE_bool(check_flush_compaction_key_order,
925 ROCKSDB_NAMESPACE::Options().check_flush_compaction_key_order,
926 "During flush or compaction, check whether keys inserted to "
927 "output files are in order.");
928
7c673cae
FG
929DEFINE_uint64(delete_obsolete_files_period_micros, 0,
930 "Ignored. Left here for backward compatibility");
931
494da23a
TL
932DEFINE_int64(writes_before_delete_range, 0,
933 "Number of writes before DeleteRange is called regularly.");
934
7c673cae 935DEFINE_int64(writes_per_range_tombstone, 0,
494da23a 936 "Number of writes between range tombstones");
7c673cae
FG
937
938DEFINE_int64(range_tombstone_width, 100, "Number of keys in tombstone's range");
939
940DEFINE_int64(max_num_range_tombstones, 0,
1e59de90 941 "Maximum number of range tombstones to insert.");
7c673cae
FG
942
943DEFINE_bool(expand_range_tombstones, false,
944 "Expand range tombstone into sequential regular tombstones.");
945
946#ifndef ROCKSDB_LITE
11fdf7f2 947// Transactions Options
7c673cae
FG
948DEFINE_bool(optimistic_transaction_db, false,
949 "Open a OptimisticTransactionDB instance. "
950 "Required for randomtransaction benchmark.");
951
952DEFINE_bool(transaction_db, false,
953 "Open a TransactionDB instance. "
954 "Required for randomtransaction benchmark.");
955
956DEFINE_uint64(transaction_sets, 2,
957 "Number of keys each transaction will "
958 "modify (use in RandomTransaction only). Max: 9999");
959
960DEFINE_bool(transaction_set_snapshot, false,
961 "Setting to true will have each transaction call SetSnapshot()"
962 " upon creation.");
963
964DEFINE_int32(transaction_sleep, 0,
965 "Max microseconds to sleep in between "
966 "reading and writing a value (used in RandomTransaction only). ");
967
968DEFINE_uint64(transaction_lock_timeout, 100,
969 "If using a transaction_db, specifies the lock wait timeout in"
970 " milliseconds before failing a transaction waiting on a lock");
971DEFINE_string(
972 options_file, "",
973 "The path to a RocksDB options file. If specified, then db_bench will "
974 "run with the RocksDB options in the default column family of the "
975 "specified options file. "
976 "Note that with this setting, db_bench will ONLY accept the following "
977 "RocksDB options related command-line arguments, all other arguments "
978 "that are related to RocksDB options will be ignored:\n"
979 "\t--use_existing_db\n"
494da23a 980 "\t--use_existing_keys\n"
7c673cae
FG
981 "\t--statistics\n"
982 "\t--row_cache_size\n"
983 "\t--row_cache_numshardbits\n"
984 "\t--enable_io_prio\n"
985 "\t--dump_malloc_stats\n"
986 "\t--num_multi_db\n");
987
11fdf7f2 988// FIFO Compaction Options
7c673cae
FG
989DEFINE_uint64(fifo_compaction_max_table_files_size_mb, 0,
990 "The limit of total table file sizes to trigger FIFO compaction");
11fdf7f2
TL
991
992DEFINE_bool(fifo_compaction_allow_compaction, true,
993 "Allow compaction in FIFO compaction.");
994
995DEFINE_uint64(fifo_compaction_ttl, 0, "TTL for the SST Files in seconds.");
996
1e59de90
TL
997DEFINE_uint64(fifo_age_for_warm, 0, "age_for_warm for FIFO compaction.");
998
999// Stacked BlobDB Options
1000DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Open a BlobDB instance.");
11fdf7f2 1001
f67539c2
TL
1002DEFINE_bool(
1003 blob_db_enable_gc,
1004 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection,
1e59de90 1005 "[Stacked BlobDB] Enable BlobDB garbage collection.");
11fdf7f2 1006
f67539c2
TL
1007DEFINE_double(
1008 blob_db_gc_cutoff,
1009 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff,
1e59de90 1010 "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection.");
11fdf7f2 1011
f67539c2
TL
1012DEFINE_bool(blob_db_is_fifo,
1013 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().is_fifo,
1e59de90 1014 "[Stacked BlobDB] Enable FIFO eviction strategy in BlobDB.");
f67539c2
TL
1015
1016DEFINE_uint64(blob_db_max_db_size,
1017 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().max_db_size,
1e59de90
TL
1018 "[Stacked BlobDB] Max size limit of the directory where blob "
1019 "files are stored.");
11fdf7f2 1020
1e59de90
TL
1021DEFINE_uint64(blob_db_max_ttl_range, 0,
1022 "[Stacked BlobDB] TTL range to generate BlobDB data (in "
1023 "seconds). 0 means no TTL.");
11fdf7f2 1024
1e59de90
TL
1025DEFINE_uint64(
1026 blob_db_ttl_range_secs,
1027 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().ttl_range_secs,
1028 "[Stacked BlobDB] TTL bucket size to use when creating blob files.");
11fdf7f2 1029
1e59de90
TL
1030DEFINE_uint64(
1031 blob_db_min_blob_size,
1032 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size,
1033 "[Stacked BlobDB] Smallest blob to store in a file. Blobs "
1034 "smaller than this will be inlined with the key in the LSM tree.");
11fdf7f2 1035
f67539c2
TL
1036DEFINE_uint64(blob_db_bytes_per_sync,
1037 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync,
1e59de90 1038 "[Stacked BlobDB] Bytes to sync blob file at.");
11fdf7f2 1039
f67539c2
TL
1040DEFINE_uint64(blob_db_file_size,
1041 ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size,
1e59de90 1042 "[Stacked BlobDB] Target size of each blob file.");
11fdf7f2 1043
1e59de90
TL
1044DEFINE_string(
1045 blob_db_compression_type, "snappy",
1046 "[Stacked BlobDB] Algorithm to use to compress blobs in blob files.");
f67539c2
TL
1047static enum ROCKSDB_NAMESPACE::CompressionType
1048 FLAGS_blob_db_compression_type_e = ROCKSDB_NAMESPACE::kSnappyCompression;
1049
1e59de90
TL
1050#endif // ROCKSDB_LITE
1051
1052// Integrated BlobDB options
1053DEFINE_bool(
1054 enable_blob_files,
1055 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().enable_blob_files,
1056 "[Integrated BlobDB] Enable writing large values to separate blob files.");
1057
1058DEFINE_uint64(min_blob_size,
1059 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().min_blob_size,
1060 "[Integrated BlobDB] The size of the smallest value to be stored "
1061 "separately in a blob file.");
1062
1063DEFINE_uint64(blob_file_size,
1064 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_size,
1065 "[Integrated BlobDB] The size limit for blob files.");
1066
1067DEFINE_string(blob_compression_type, "none",
1068 "[Integrated BlobDB] The compression algorithm to use for large "
1069 "values stored in blob files.");
1070
1071DEFINE_bool(enable_blob_garbage_collection,
1072 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
1073 .enable_blob_garbage_collection,
1074 "[Integrated BlobDB] Enable blob garbage collection.");
1075
1076DEFINE_double(blob_garbage_collection_age_cutoff,
1077 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
1078 .blob_garbage_collection_age_cutoff,
1079 "[Integrated BlobDB] The cutoff in terms of blob file age for "
1080 "garbage collection.");
1081
1082DEFINE_double(blob_garbage_collection_force_threshold,
1083 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
1084 .blob_garbage_collection_force_threshold,
1085 "[Integrated BlobDB] The threshold for the ratio of garbage in "
1086 "the oldest blob files for forcing garbage collection.");
1087
1088DEFINE_uint64(blob_compaction_readahead_size,
1089 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
1090 .blob_compaction_readahead_size,
1091 "[Integrated BlobDB] Compaction readahead for blob files.");
1092
1093DEFINE_int32(
1094 blob_file_starting_level,
1095 ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_starting_level,
1096 "[Integrated BlobDB] The starting level for blob files.");
1097
1098DEFINE_bool(use_blob_cache, false, "[Integrated BlobDB] Enable blob cache.");
1099
1100DEFINE_bool(
1101 use_shared_block_and_blob_cache, true,
1102 "[Integrated BlobDB] Use a shared backing cache for both block "
1103 "cache and blob cache. It only takes effect if use_blob_cache is enabled.");
1104
1105DEFINE_uint64(
1106 blob_cache_size, 8 << 20,
1107 "[Integrated BlobDB] Number of bytes to use as a cache of blobs. It only "
1108 "takes effect if the block and blob caches are different "
1109 "(use_shared_block_and_blob_cache = false).");
1110
1111DEFINE_int32(blob_cache_numshardbits, 6,
1112 "[Integrated BlobDB] Number of shards for the blob cache is 2 ** "
1113 "blob_cache_numshardbits. Negative means use default settings. "
1114 "It only takes effect if blob_cache_size is greater than 0, and "
1115 "the block and blob caches are different "
1116 "(use_shared_block_and_blob_cache = false).");
1117
1118DEFINE_int32(prepopulate_blob_cache, 0,
1119 "[Integrated BlobDB] Pre-populate hot/warm blobs in blob cache. 0 "
1120 "to disable and 1 to insert during flush.");
1121
1122#ifndef ROCKSDB_LITE
1123
f67539c2
TL
1124// Secondary DB instance Options
1125DEFINE_bool(use_secondary_db, false,
1126 "Open a RocksDB secondary instance. A primary instance can be "
1127 "running in another db_bench process.");
1128
1129DEFINE_string(secondary_path, "",
1130 "Path to a directory used by the secondary instance to store "
1131 "private files, e.g. info log.");
1132
1133DEFINE_int32(secondary_update_interval, 5,
1134 "Secondary instance attempts to catch up with the primary every "
1135 "secondary_update_interval seconds.");
1136
7c673cae
FG
1137#endif // ROCKSDB_LITE
1138
1139DEFINE_bool(report_bg_io_stats, false,
1140 "Measure times spents on I/Os while in compactions. ");
1141
1142DEFINE_bool(use_stderr_info_logger, false,
1143 "Write info logs to stderr instead of to LOG file. ");
1144
1e59de90
TL
1145#ifndef ROCKSDB_LITE
1146
11fdf7f2 1147DEFINE_string(trace_file, "", "Trace workload to a file. ");
7c673cae 1148
1e59de90
TL
1149DEFINE_double(trace_replay_fast_forward, 1.0,
1150 "Fast forward trace replay, must > 0.0.");
f67539c2
TL
1151DEFINE_int32(block_cache_trace_sampling_frequency, 1,
1152 "Block cache trace sampling frequency, termed s. It uses spatial "
1153 "downsampling and samples accesses to one out of s blocks.");
1154DEFINE_int64(
1155 block_cache_trace_max_trace_file_size_in_bytes,
1156 uint64_t{64} * 1024 * 1024 * 1024,
1157 "The maximum block cache trace file size in bytes. Block cache accesses "
1158 "will not be logged if the trace file size exceeds this threshold. Default "
1159 "is 64 GB.");
1160DEFINE_string(block_cache_trace_file, "", "Block cache trace file path.");
1161DEFINE_int32(trace_replay_threads, 1,
1162 "The number of threads to replay, must >=1.");
1163
1e59de90
TL
1164DEFINE_bool(io_uring_enabled, true,
1165 "If true, enable the use of IO uring if the platform supports it");
1166extern "C" bool RocksDbIOUringEnable() { return FLAGS_io_uring_enabled; }
1167#endif // ROCKSDB_LITE
1168
1169DEFINE_bool(adaptive_readahead, false,
1170 "carry forward internal auto readahead size from one file to next "
1171 "file at each level during iteration");
1172
1173DEFINE_bool(rate_limit_user_ops, false,
1174 "When true use Env::IO_USER priority level to charge internal rate "
1175 "limiter for reads associated with user operations.");
1176
1177DEFINE_bool(file_checksum, false,
1178 "When true use FileChecksumGenCrc32cFactory for "
1179 "file_checksum_gen_factory.");
1180
1181DEFINE_bool(rate_limit_auto_wal_flush, false,
1182 "When true use Env::IO_USER priority level to charge internal rate "
1183 "limiter for automatic WAL flush (`Options::manual_wal_flush` == "
1184 "false) after the user write operation.");
1185
1186DEFINE_bool(async_io, false,
1187 "When set true, RocksDB does asynchronous reads for internal auto "
1188 "readahead prefetching.");
1189
1190DEFINE_bool(optimize_multiget_for_io, true,
1191 "When set true, RocksDB does asynchronous reads for SST files in "
1192 "multiple levels for MultiGet.");
1193
1194DEFINE_bool(charge_compression_dictionary_building_buffer, false,
1195 "Setting for "
1196 "CacheEntryRoleOptions::charged of "
1197 "CacheEntryRole::kCompressionDictionaryBuildingBuffer");
1198
1199DEFINE_bool(charge_filter_construction, false,
1200 "Setting for "
1201 "CacheEntryRoleOptions::charged of "
1202 "CacheEntryRole::kFilterConstruction");
1203
1204DEFINE_bool(charge_table_reader, false,
1205 "Setting for "
1206 "CacheEntryRoleOptions::charged of "
1207 "CacheEntryRole::kBlockBasedTableReader");
1208
1209DEFINE_bool(charge_file_metadata, false,
1210 "Setting for "
1211 "CacheEntryRoleOptions::charged of "
1212 "CacheEntryRole::kFileMetadata");
1213
1214DEFINE_bool(charge_blob_cache, false,
1215 "Setting for "
1216 "CacheEntryRoleOptions::charged of "
1217 "CacheEntryRole::kBlobCache");
1218
1219DEFINE_uint64(backup_rate_limit, 0ull,
1220 "If non-zero, db_bench will rate limit reads and writes for DB "
1221 "backup. This "
1222 "is the global rate in ops/second.");
1223
1224DEFINE_uint64(restore_rate_limit, 0ull,
1225 "If non-zero, db_bench will rate limit reads and writes for DB "
1226 "restore. This "
1227 "is the global rate in ops/second.");
1228
1229DEFINE_string(backup_dir, "",
1230 "If not empty string, use the given dir for backup.");
1231
1232DEFINE_string(restore_dir, "",
1233 "If not empty string, use the given dir for restore.");
1234
1235DEFINE_uint64(
1236 initial_auto_readahead_size,
1237 ROCKSDB_NAMESPACE::BlockBasedTableOptions().initial_auto_readahead_size,
1238 "RocksDB does auto-readahead for iterators on noticing more than two reads "
1239 "for a table file if user doesn't provide readahead_size. The readahead "
1240 "size starts at initial_auto_readahead_size");
1241
1242DEFINE_uint64(
1243 max_auto_readahead_size,
1244 ROCKSDB_NAMESPACE::BlockBasedTableOptions().max_auto_readahead_size,
1245 "Rocksdb implicit readahead starts at "
1246 "BlockBasedTableOptions.initial_auto_readahead_size and doubles on every "
1247 "additional read upto max_auto_readahead_size");
1248
1249DEFINE_uint64(
1250 num_file_reads_for_auto_readahead,
1251 ROCKSDB_NAMESPACE::BlockBasedTableOptions()
1252 .num_file_reads_for_auto_readahead,
1253 "Rocksdb implicit readahead is enabled if reads are sequential and "
1254 "num_file_reads_for_auto_readahead indicates after how many sequential "
1255 "reads into that file internal auto prefetching should be start.");
1256
f67539c2
TL
1257static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType(
1258 const char* ctype) {
7c673cae
FG
1259 assert(ctype);
1260
1261 if (!strcasecmp(ctype, "none"))
f67539c2 1262 return ROCKSDB_NAMESPACE::kNoCompression;
7c673cae 1263 else if (!strcasecmp(ctype, "snappy"))
f67539c2 1264 return ROCKSDB_NAMESPACE::kSnappyCompression;
7c673cae 1265 else if (!strcasecmp(ctype, "zlib"))
f67539c2 1266 return ROCKSDB_NAMESPACE::kZlibCompression;
7c673cae 1267 else if (!strcasecmp(ctype, "bzip2"))
f67539c2 1268 return ROCKSDB_NAMESPACE::kBZip2Compression;
7c673cae 1269 else if (!strcasecmp(ctype, "lz4"))
f67539c2 1270 return ROCKSDB_NAMESPACE::kLZ4Compression;
7c673cae 1271 else if (!strcasecmp(ctype, "lz4hc"))
f67539c2 1272 return ROCKSDB_NAMESPACE::kLZ4HCCompression;
7c673cae 1273 else if (!strcasecmp(ctype, "xpress"))
f67539c2 1274 return ROCKSDB_NAMESPACE::kXpressCompression;
7c673cae 1275 else if (!strcasecmp(ctype, "zstd"))
f67539c2 1276 return ROCKSDB_NAMESPACE::kZSTD;
1e59de90
TL
1277 else {
1278 fprintf(stderr, "Cannot parse compression type '%s'\n", ctype);
1279 exit(1);
1280 }
7c673cae
FG
1281}
1282
1283static std::string ColumnFamilyName(size_t i) {
1284 if (i == 0) {
f67539c2 1285 return ROCKSDB_NAMESPACE::kDefaultColumnFamilyName;
7c673cae
FG
1286 } else {
1287 char name[100];
1288 snprintf(name, sizeof(name), "column_family_name_%06zu", i);
1289 return std::string(name);
1290 }
1291}
1292
1293DEFINE_string(compression_type, "snappy",
1294 "Algorithm to use to compress the database");
f67539c2
TL
1295static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_compression_type_e =
1296 ROCKSDB_NAMESPACE::kSnappyCompression;
7c673cae 1297
494da23a
TL
1298DEFINE_int64(sample_for_compression, 0, "Sample every N block for compression");
1299
f67539c2 1300DEFINE_int32(compression_level, ROCKSDB_NAMESPACE::CompressionOptions().level,
11fdf7f2
TL
1301 "Compression level. The meaning of this value is library-"
1302 "dependent. If unset, we try to use the default for the library "
1303 "specified in `--compression_type`");
7c673cae 1304
11fdf7f2 1305DEFINE_int32(compression_max_dict_bytes,
f67539c2 1306 ROCKSDB_NAMESPACE::CompressionOptions().max_dict_bytes,
7c673cae
FG
1307 "Maximum size of dictionary used to prime the compression "
1308 "library.");
1309
11fdf7f2 1310DEFINE_int32(compression_zstd_max_train_bytes,
f67539c2 1311 ROCKSDB_NAMESPACE::CompressionOptions().zstd_max_train_bytes,
11fdf7f2
TL
1312 "Maximum size of training data passed to zstd's dictionary "
1313 "trainer.");
7c673cae 1314
1e59de90
TL
1315DEFINE_int32(min_level_to_compress, -1,
1316 "If non-negative, compression starts"
7c673cae
FG
1317 " from this level. Levels with number < min_level_to_compress are"
1318 " not compressed. Otherwise, apply compression_type to "
1319 "all levels.");
1320
20effc67
TL
1321DEFINE_int32(compression_parallel_threads, 1,
1322 "Number of threads for parallel compression.");
1323
1e59de90
TL
1324DEFINE_uint64(compression_max_dict_buffer_bytes,
1325 ROCKSDB_NAMESPACE::CompressionOptions().max_dict_buffer_bytes,
1326 "Maximum bytes to buffer to collect samples for dictionary.");
1327
1328DEFINE_bool(compression_use_zstd_dict_trainer,
1329 ROCKSDB_NAMESPACE::CompressionOptions().use_zstd_dict_trainer,
1330 "If true, use ZSTD_TrainDictionary() to create dictionary, else"
1331 "use ZSTD_FinalizeDictionary() to create dictionary");
1332
7c673cae
FG
// gflags validator for --table_cache_numshardbits.
//
// Unlike ValidateCacheNumshardbits, this one also rejects zero and
// negative values: the table cache always needs at least one shard bit.
//
// Returns true iff 0 < value < 20; otherwise prints a diagnostic to
// stderr and returns false.
static bool ValidateTableCacheNumshardbits(const char* flagname,
                                           int32_t value) {
  if (value > 0 && value < 20) {
    return true;
  }
  fprintf(stderr, "Invalid value for --%s: %d, must be 0 < val < 20\n",
          flagname, value);
  return false;
}
1342DEFINE_int32(table_cache_numshardbits, 4, "");
1343
1344#ifndef ROCKSDB_LITE
20effc67 1345DEFINE_string(env_uri, "",
1e59de90 1346 "URI for registry Env lookup. Mutually exclusive with --fs_uri");
20effc67
TL
1347DEFINE_string(fs_uri, "",
1348 "URI for registry Filesystem lookup. Mutually exclusive"
1e59de90 1349 " with --env_uri."
20effc67 1350 " Creates a default environment with the specified filesystem.");
7c673cae 1351#endif // ROCKSDB_LITE
1e59de90
TL
1352DEFINE_string(simulate_hybrid_fs_file, "",
1353 "File for Store Metadata for Simulate hybrid FS. Empty means "
1354 "disable the feature. Now, if it is set, last_level_temperature "
1355 "is set to kWarm.");
1356DEFINE_int32(simulate_hybrid_hdd_multipliers, 1,
1357 "In simulate_hybrid_fs_file or simulate_hdd mode, how many HDDs "
1358 "are simulated.");
1359DEFINE_bool(simulate_hdd, false, "Simulate read/write latency on HDD.");
1360
1361DEFINE_int64(
1362 preclude_last_level_data_seconds, 0,
1363 "Preclude the latest data from the last level. (Used for tiered storage)");
1364
1365DEFINE_int64(preserve_internal_time_seconds, 0,
1366 "Preserve the internal time information which stores with SST.");
f67539c2
TL
1367
1368static std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard;
1369
1370static ROCKSDB_NAMESPACE::Env* FLAGS_env = ROCKSDB_NAMESPACE::Env::Default();
7c673cae 1371
1e59de90
TL
1372DEFINE_int64(stats_interval, 0,
1373 "Stats are reported every N operations when this is greater than "
1374 "zero. When 0 the interval grows over time.");
1375
1376DEFINE_int64(stats_interval_seconds, 0,
1377 "Report stats every N seconds. This overrides stats_interval when"
1378 " both are > 0.");
7c673cae 1379
1e59de90
TL
1380DEFINE_int32(stats_per_interval, 0,
1381 "Reports additional stats per interval when this is greater than "
1382 "0.");
7c673cae 1383
1e59de90
TL
1384DEFINE_uint64(slow_usecs, 1000000,
1385 "A message is printed for operations that take at least this "
1386 "many microseconds.");
7c673cae
FG
1387
1388DEFINE_int64(report_interval_seconds, 0,
1e59de90 1389 "If greater than zero, it will write simple stats in CSV format "
7c673cae
FG
1390 "to --report_file every N seconds");
1391
1392DEFINE_string(report_file, "report.csv",
1393 "Filename where some simple stats are reported to (if "
1394 "--report_interval_seconds is bigger than 0)");
1395
1396DEFINE_int32(thread_status_per_interval, 0,
1397 "Takes and report a snapshot of the current status of each thread"
1398 " when this is greater than 0.");
1399
f67539c2
TL
1400DEFINE_int32(perf_level, ROCKSDB_NAMESPACE::PerfLevel::kDisable,
1401 "Level of perf collection");
7c673cae 1402
7c673cae
FG
1403DEFINE_uint64(soft_pending_compaction_bytes_limit, 64ull * 1024 * 1024 * 1024,
1404 "Slowdown writes if pending compaction bytes exceed this number");
1405
1406DEFINE_uint64(hard_pending_compaction_bytes_limit, 128ull * 1024 * 1024 * 1024,
1407 "Stop writes if pending compaction bytes exceed this number");
1408
1409DEFINE_uint64(delayed_write_rate, 8388608u,
1410 "Limited bytes allowed to DB when soft_rate_limit or "
1411 "level0_slowdown_writes_trigger triggers");
1412
11fdf7f2
TL
1413DEFINE_bool(enable_pipelined_write, true,
1414 "Allow WAL and memtable writes to be pipelined");
1415
20effc67
TL
1416DEFINE_bool(
1417 unordered_write, false,
1418 "Enable the unordered write feature, which provides higher throughput but "
1419 "relaxes the guarantees around atomic reads and immutable snapshots");
f67539c2 1420
11fdf7f2 1421DEFINE_bool(allow_concurrent_memtable_write, true,
7c673cae
FG
1422 "Allow multi-writers to update mem tables in parallel.");
1423
1e59de90
TL
1424DEFINE_double(experimental_mempurge_threshold, 0.0,
1425 "Maximum useful payload ratio estimate that triggers a mempurge "
1426 "(memtable garbage collection).");
1427
f67539c2
TL
1428DEFINE_bool(inplace_update_support,
1429 ROCKSDB_NAMESPACE::Options().inplace_update_support,
11fdf7f2
TL
1430 "Support in-place memtable update for smaller or same-size values");
1431
1432DEFINE_uint64(inplace_update_num_locks,
f67539c2 1433 ROCKSDB_NAMESPACE::Options().inplace_update_num_locks,
11fdf7f2
TL
1434 "Number of RW locks to protect in-place memtable updates");
1435
1436DEFINE_bool(enable_write_thread_adaptive_yield, true,
7c673cae
FG
1437 "Use a yielding spin loop for brief writer thread waits.");
1438
1439DEFINE_uint64(
1440 write_thread_max_yield_usec, 100,
1441 "Maximum microseconds for enable_write_thread_adaptive_yield operation.");
1442
1443DEFINE_uint64(write_thread_slow_yield_usec, 3,
1444 "The threshold at which a slow yield is considered a signal that "
1445 "other processes or threads want the core.");
1446
7c673cae
FG
1447DEFINE_uint64(rate_limiter_bytes_per_sec, 0, "Set options.rate_limiter value.");
1448
1e59de90
TL
1449DEFINE_int64(rate_limiter_refill_period_us, 100 * 1000,
1450 "Set refill period on rate limiter.");
1451
11fdf7f2
TL
1452DEFINE_bool(rate_limiter_auto_tuned, false,
1453 "Enable dynamic adjustment of rate limit according to demand for "
1454 "background I/O");
1455
1e59de90 1456DEFINE_bool(sine_write_rate, false, "Use a sine wave write_rate_limit");
11fdf7f2 1457
1e59de90
TL
1458DEFINE_uint64(
1459 sine_write_rate_interval_milliseconds, 10000,
1460 "Interval of which the sine wave write_rate_limit is recalculated");
11fdf7f2 1461
1e59de90 1462DEFINE_double(sine_a, 1, "A in f(x) = A sin(bx + c) + d");
11fdf7f2 1463
1e59de90 1464DEFINE_double(sine_b, 1, "B in f(x) = A sin(bx + c) + d");
11fdf7f2 1465
1e59de90 1466DEFINE_double(sine_c, 0, "C in f(x) = A sin(bx + c) + d");
11fdf7f2 1467
1e59de90 1468DEFINE_double(sine_d, 1, "D in f(x) = A sin(bx + c) + d");
11fdf7f2
TL
1469
1470DEFINE_bool(rate_limit_bg_reads, false,
1471 "Use options.rate_limiter on compaction reads");
1472
7c673cae
FG
1473DEFINE_uint64(
1474 benchmark_write_rate_limit, 0,
1475 "If non-zero, db_bench will rate-limit the writes going into RocksDB. This "
1476 "is the global rate in bytes/second.");
1477
494da23a 1478// the parameters of mix_graph
f67539c2
TL
1479DEFINE_double(keyrange_dist_a, 0.0,
1480 "The parameter 'a' of prefix average access distribution "
1481 "f(x)=a*exp(b*x)+c*exp(d*x)");
1482DEFINE_double(keyrange_dist_b, 0.0,
1483 "The parameter 'b' of prefix average access distribution "
1484 "f(x)=a*exp(b*x)+c*exp(d*x)");
1485DEFINE_double(keyrange_dist_c, 0.0,
1486 "The parameter 'c' of prefix average access distribution"
1487 "f(x)=a*exp(b*x)+c*exp(d*x)");
1488DEFINE_double(keyrange_dist_d, 0.0,
1489 "The parameter 'd' of prefix average access distribution"
1490 "f(x)=a*exp(b*x)+c*exp(d*x)");
1491DEFINE_int64(keyrange_num, 1,
1492 "The number of key ranges that are in the same prefix "
1e59de90 1493 "group, each prefix range will have its key access distribution");
494da23a 1494DEFINE_double(key_dist_a, 0.0,
1e59de90 1495 "The parameter 'a' of key access distribution model f(x)=a*x^b");
494da23a 1496DEFINE_double(key_dist_b, 0.0,
1e59de90 1497 "The parameter 'b' of key access distribution model f(x)=a*x^b");
494da23a
TL
1498DEFINE_double(value_theta, 0.0,
1499 "The parameter 'theta' of Generized Pareto Distribution "
1500 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1e59de90
TL
1501// Use reasonable defaults based on the mixgraph paper
1502DEFINE_double(value_k, 0.2615,
494da23a
TL
1503 "The parameter 'k' of Generized Pareto Distribution "
1504 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1e59de90
TL
1505// Use reasonable defaults based on the mixgraph paper
1506DEFINE_double(value_sigma, 25.45,
494da23a
TL
1507 "The parameter 'theta' of Generized Pareto Distribution "
1508 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1509DEFINE_double(iter_theta, 0.0,
1510 "The parameter 'theta' of Generized Pareto Distribution "
1511 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1e59de90
TL
1512// Use reasonable defaults based on the mixgraph paper
1513DEFINE_double(iter_k, 2.517,
494da23a
TL
1514 "The parameter 'k' of Generized Pareto Distribution "
1515 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1e59de90
TL
1516// Use reasonable defaults based on the mixgraph paper
1517DEFINE_double(iter_sigma, 14.236,
494da23a
TL
1518 "The parameter 'sigma' of Generized Pareto Distribution "
1519 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1520DEFINE_double(mix_get_ratio, 1.0,
1521 "The ratio of Get queries of mix_graph workload");
1522DEFINE_double(mix_put_ratio, 0.0,
1523 "The ratio of Put queries of mix_graph workload");
1524DEFINE_double(mix_seek_ratio, 0.0,
1525 "The ratio of Seek queries of mix_graph workload");
1526DEFINE_int64(mix_max_scan_len, 10000, "The max scan length of Iterator");
494da23a
TL
1527DEFINE_int64(mix_max_value_size, 1024, "The max value size of this workload");
1528DEFINE_double(
1529 sine_mix_rate_noise, 0.0,
1530 "Add the noise ratio to the sine rate, it is between 0.0 and 1.0");
1531DEFINE_bool(sine_mix_rate, false,
1532 "Enable the sine QPS control on the mix workload");
1533DEFINE_uint64(
1534 sine_mix_rate_interval_milliseconds, 10000,
1535 "Interval of which the sine wave read_rate_limit is recalculated");
1536DEFINE_int64(mix_accesses, -1,
1537 "The total query accesses of mix_graph workload");
1538
7c673cae
FG
DEFINE_uint64(
    benchmark_read_rate_limit, 0,
    "If non-zero, db_bench will rate-limit the reads from RocksDB. This "
    "is the global rate in ops/second.");

DEFINE_uint64(max_compaction_bytes,
              ROCKSDB_NAMESPACE::Options().max_compaction_bytes,
              "Max bytes allowed in one compaction");

#ifndef ROCKSDB_LITE
DEFINE_bool(readonly, false, "Run read only benchmarks.");

DEFINE_bool(print_malloc_stats, false,
            "Print malloc stats to stdout after benchmarks finish.");
#endif  // ROCKSDB_LITE

DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions");

// WAL retention and sizing knobs.
DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds.");
DEFINE_uint64(wal_size_limit_MB, 0,
              "Set the size limit for the WAL Files in MB.");
DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size");

// I/O mode flags; defaults mirror the corresponding Options fields.
DEFINE_bool(mmap_read, ROCKSDB_NAMESPACE::Options().allow_mmap_reads,
            "Allow reads to occur via mmap-ing files");

DEFINE_bool(mmap_write, ROCKSDB_NAMESPACE::Options().allow_mmap_writes,
            "Allow writes to occur via mmap-ing files");

DEFINE_bool(use_direct_reads, ROCKSDB_NAMESPACE::Options().use_direct_reads,
            "Use O_DIRECT for reading data");

DEFINE_bool(use_direct_io_for_flush_and_compaction,
            ROCKSDB_NAMESPACE::Options().use_direct_io_for_flush_and_compaction,
            "Use O_DIRECT for background flush and compaction writes");

DEFINE_bool(advise_random_on_open,
            ROCKSDB_NAMESPACE::Options().advise_random_on_open,
            "Advise random access on table file open");

DEFINE_string(compaction_fadvice, "NORMAL",
              "Access pattern advice when a file is compacted");
// Parsed form of --compaction_fadvice; populated during flag processing.
static auto FLAGS_compaction_fadvice_e =
    ROCKSDB_NAMESPACE::Options().access_hint_on_compaction_start;

DEFINE_bool(use_tailing_iterator, false,
            "Use tailing iterator to access a series of keys instead of get");

DEFINE_bool(use_adaptive_mutex, ROCKSDB_NAMESPACE::Options().use_adaptive_mutex,
            "Use adaptive mutex");

DEFINE_uint64(bytes_per_sync, ROCKSDB_NAMESPACE::Options().bytes_per_sync,
              "Allows OS to incrementally sync SST files to disk while they are"
              " being written, in the background. Issue one request for every"
              " bytes_per_sync written. 0 turns it off.");

DEFINE_uint64(wal_bytes_per_sync,
              ROCKSDB_NAMESPACE::Options().wal_bytes_per_sync,
              "Allows OS to incrementally sync WAL files to disk while they are"
              " being written, in the background. Issue one request for every"
              " wal_bytes_per_sync written. 0 turns it off.");

DEFINE_bool(use_single_deletes, true,
            "Use single deletes (used in RandomReplaceKeys only).");

DEFINE_double(stddev, 2000.0,
              "Standard deviation of normal distribution used for picking keys"
              " (used in RandomReplaceKeys only).");

// TimeSeries-benchmark-specific knobs.
DEFINE_int32(key_id_range, 100000,
             "Range of possible value of key id (used in TimeSeries only).");

DEFINE_string(expire_style, "none",
              "Style to remove expired time entries. Can be one of the options "
              "below: none (do not expired data), compaction_filter (use a "
              "compaction filter to remove expired data), delete (seek IDs and "
              "remove expired data) (used in TimeSeries only).");

DEFINE_uint64(
    time_range, 100000,
    "Range of timestamp that store in the database (used in TimeSeries"
    " only).");

DEFINE_int32(num_deletion_threads, 1,
             "Number of threads to do deletion (used in TimeSeries and delete "
             "expire_style only).");

DEFINE_int32(max_successive_merges, 0,
             "Maximum number of successive merge operations on a key in the "
             "memtable");
7c673cae
FG
1629
// Flag validator for --prefix_size. Accepts values in [0, 2000000000);
// prints a diagnostic to stderr and returns false otherwise.
static bool ValidatePrefixSize(const char* flagname, int32_t value) {
  if (value < 0 || value >= 2000000000) {
    // The check above rejects 2000000000 itself, so the message uses a
    // strict '<' upper bound (previously it incorrectly claimed '<=').
    fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <2000000000\n",
            flagname, value);
    return false;
  }
  return true;
}
f67539c2 1638
1e59de90
TL
DEFINE_int32(prefix_size, 0,
             "control the prefix size for HashSkipList and plain table");
DEFINE_int64(keys_per_prefix, 0,
             "control average number of keys generated per prefix, 0 means no "
             "special handling of the prefix, i.e. use the prefix comes with "
             "the generated random number.");
DEFINE_bool(total_order_seek, false,
            "Enable total order seek regardless of index format.");
DEFINE_bool(prefix_same_as_start, false,
            "Enforce iterator to return keys with prefix same as seek key.");
DEFINE_bool(
    seek_missing_prefix, false,
    "Iterator seek to keys with non-exist prefixes. Require prefix_size > 8");

DEFINE_int32(memtable_insert_with_hint_prefix_size, 0,
             "If non-zero, enable "
             "memtable insert with hint with the given prefix size.");
DEFINE_bool(enable_io_prio, false,
            "Lower the background flush/compaction threads' IO priority");
DEFINE_bool(enable_cpu_prio, false,
            "Lower the background flush/compaction threads' CPU priority");
DEFINE_bool(identity_as_first_hash, false,
            "the first hash function of cuckoo table becomes an identity "
            "function. This is only valid when key is 8 bytes");
DEFINE_bool(dump_malloc_stats, true, "Dump malloc stats in LOG ");
// Stats dump/persist cadence; defaults mirror the Options fields.
DEFINE_uint64(stats_dump_period_sec,
              ROCKSDB_NAMESPACE::Options().stats_dump_period_sec,
              "Gap between printing stats to log in seconds");
DEFINE_uint64(stats_persist_period_sec,
              ROCKSDB_NAMESPACE::Options().stats_persist_period_sec,
              "Gap between persisting stats in seconds");
DEFINE_bool(persist_stats_to_disk,
            ROCKSDB_NAMESPACE::Options().persist_stats_to_disk,
            "whether to persist stats to disk");
DEFINE_uint64(stats_history_buffer_size,
              ROCKSDB_NAMESPACE::Options().stats_history_buffer_size,
              "Max number of stats snapshots to keep in memory");
DEFINE_bool(avoid_flush_during_recovery,
            ROCKSDB_NAMESPACE::Options().avoid_flush_during_recovery,
            "If true, avoids flushing the recovered WAL data where possible.");
DEFINE_int64(multiread_stride, 0,
             "Stride length for the keys in a MultiGet batch");
DEFINE_bool(multiread_batched, false, "Use the new MultiGet API");

// Memtable representation and alternate table formats.
DEFINE_string(memtablerep, "skip_list", "");
DEFINE_int64(hash_bucket_count, 1024 * 1024, "hash bucket count");
DEFINE_bool(use_plain_table, false,
            "if use plain table instead of block-based table format");
DEFINE_bool(use_cuckoo_table, false, "if use cuckoo table format");
DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table.");
DEFINE_bool(use_hash_search, false,
            "if use kHashSearch instead of kBinarySearch. "
            "This is valid if only we use BlockTable");
DEFINE_string(merge_operator, "",
              "The merge operator to use with the database."
              "If a new merge operator is specified, be sure to use fresh"
              " database The possible merge operators are defined in"
              " utilities/merge_operators.h");
DEFINE_int32(skip_list_lookahead, 0,
             "Used with skip_list memtablerep; try linear search first for "
             "this many steps from the previous position");
DEFINE_bool(report_file_operations, false,
            "if report number of file operations");
DEFINE_bool(report_open_timing, false, "if report open timing");
DEFINE_int32(readahead_size, 0, "Iterator readahead size");

DEFINE_bool(read_with_latest_user_timestamp, true,
            "If true, always use the current latest timestamp for read. If "
            "false, choose a random timestamp from the past.");

#ifndef ROCKSDB_LITE
DEFINE_string(secondary_cache_uri, "",
              "Full URI for creating a custom secondary cache object");
// Holds the secondary cache instance built from --secondary_cache_uri.
static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
#endif  // ROCKSDB_LITE

// Flag validators; the unused bool results exist only to run registration
// at static-initialization time.
static const bool FLAGS_prefix_size_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);

static const bool FLAGS_key_size_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize);

static const bool FLAGS_cache_numshardbits_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_cache_numshardbits,
                          &ValidateCacheNumshardbits);

static const bool FLAGS_readwritepercent_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_readwritepercent, &ValidateInt32Percent);

// Legacy flag; note the int32 flag is initialized from a bool literal.
DEFINE_int32(disable_seek_compaction, false,
             "Not used, left here for backwards compatibility");

DEFINE_bool(allow_data_in_errors,
            ROCKSDB_NAMESPACE::Options().allow_data_in_errors,
            "If true, allow logging data, e.g. key, value in LOG files.");

static const bool FLAGS_deletepercent_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_deletepercent, &ValidateInt32Percent);
static const bool FLAGS_table_cache_numshardbits_dummy
    __attribute__((__unused__)) = RegisterFlagValidator(
        &FLAGS_table_cache_numshardbits, &ValidateTableCacheNumshardbits);

DEFINE_uint32(write_batch_protection_bytes_per_key, 0,
              "Size of per-key-value checksum in each write batch. Currently "
              "only value 0 and 8 are supported.");

DEFINE_uint32(
    memtable_protection_bytes_per_key, 0,
    "Enable memtable per key-value checksum protection. "
    "Each entry in memtable will be suffixed by a per key-value checksum. "
    "This options determines the size of such checksums. "
    "Supported values: 0, 1, 2, 4, 8.");

DEFINE_bool(build_info, false,
            "Print the build info via GetRocksBuildInfoAsString");

DEFINE_bool(track_and_verify_wals_in_manifest, false,
            "If true, enable WAL tracking in the MANIFEST");
7c673cae 1757
1e59de90
TL
namespace ROCKSDB_NAMESPACE {
namespace {
// Builds the MemTableRepFactory selected by --memtablerep.
// Recognizes the built-in nicknames ("skip_list", "prefix_hash", "vector",
// "hash_linkedlist") directly; any other string is handed to
// MemTableRepFactory::CreateFromString() so custom/registered factories can
// be configured by URI. On success *factory owns the new instance.
static Status CreateMemTableRepFactory(
    const ConfigOptions& config_options,
    std::shared_ptr<MemTableRepFactory>* factory) {
  Status s;
  if (!strcasecmp(FLAGS_memtablerep.c_str(), SkipListFactory::kNickName())) {
    factory->reset(new SkipListFactory(FLAGS_skip_list_lookahead));
#ifndef ROCKSDB_LITE
  } else if (!strcasecmp(FLAGS_memtablerep.c_str(), "prefix_hash")) {
    factory->reset(NewHashSkipListRepFactory(FLAGS_hash_bucket_count));
  } else if (!strcasecmp(FLAGS_memtablerep.c_str(),
                         VectorRepFactory::kNickName())) {
    factory->reset(new VectorRepFactory());
  } else if (!strcasecmp(FLAGS_memtablerep.c_str(), "hash_linkedlist")) {
    factory->reset(NewHashLinkListRepFactory(FLAGS_hash_bucket_count));
#endif  // ROCKSDB_LITE
  } else {
    std::unique_ptr<MemTableRepFactory> unique;
    s = MemTableRepFactory::CreateFromString(config_options, FLAGS_memtablerep,
                                             &unique);
    if (s.ok()) {
      // Transfer ownership from the unique_ptr into the shared_ptr.
      factory->reset(unique.release());
    }
  }
  return s;
}

}  // namespace
1787
1e59de90 1788enum DistributionType : unsigned char { kFixed = 0, kUniform, kNormal };
f67539c2
TL
1789
1790static enum DistributionType FLAGS_value_size_distribution_type_e = kFixed;
1791
1792static enum DistributionType StringToDistributionType(const char* ctype) {
1793 assert(ctype);
1794
1795 if (!strcasecmp(ctype, "fixed"))
1796 return kFixed;
1797 else if (!strcasecmp(ctype, "uniform"))
1798 return kUniform;
1799 else if (!strcasecmp(ctype, "normal"))
1800 return kNormal;
1801
1802 fprintf(stdout, "Cannot parse distribution type '%s'\n", ctype);
1e59de90 1803 exit(1);
f67539c2
TL
1804}
1805
// Abstract base for value-size distributions. Subclasses implement Get();
// Generate() clamps the sample into [min, max] unless the subclass reports
// that its samples are already in range (NeedTruncate() == false).
class BaseDistribution {
 public:
  BaseDistribution(unsigned int _min, unsigned int _max)
      : min_value_size_(_min), max_value_size_(_max) {}
  virtual ~BaseDistribution() {}

  // Draws one sample, clamped to the configured bounds when required.
  unsigned int Generate() {
    unsigned int sample = Get();
    if (NeedTruncate()) {
      if (sample < min_value_size_) {
        sample = min_value_size_;
      }
      if (sample > max_value_size_) {
        sample = max_value_size_;
      }
    }
    return sample;
  }

 private:
  virtual unsigned int Get() = 0;
  virtual bool NeedTruncate() { return true; }
  unsigned int min_value_size_;
  unsigned int max_value_size_;
};
1827
1e59de90 1828class FixedDistribution : public BaseDistribution {
f67539c2 1829 public:
1e59de90
TL
1830 FixedDistribution(unsigned int size)
1831 : BaseDistribution(size, size), size_(size) {}
1832
f67539c2 1833 private:
1e59de90
TL
1834 virtual unsigned int Get() override { return size_; }
1835 virtual bool NeedTruncate() override { return false; }
f67539c2
TL
1836 unsigned int size_;
1837};
1838
1e59de90
TL
1839class NormalDistribution : public BaseDistribution,
1840 public std::normal_distribution<double> {
f67539c2 1841 public:
20effc67
TL
1842 NormalDistribution(unsigned int _min, unsigned int _max)
1843 : BaseDistribution(_min, _max),
1844 // 99.7% values within the range [min, max].
1845 std::normal_distribution<double>(
1846 (double)(_min + _max) / 2.0 /*mean*/,
1847 (double)(_max - _min) / 6.0 /*stddev*/),
1848 gen_(rd_()) {}
1849
f67539c2
TL
1850 private:
1851 virtual unsigned int Get() override {
1852 return static_cast<unsigned int>((*this)(gen_));
1853 }
1854 std::random_device rd_;
1855 std::mt19937 gen_;
1856};
1857
1e59de90
TL
1858class UniformDistribution : public BaseDistribution,
1859 public std::uniform_int_distribution<unsigned int> {
f67539c2 1860 public:
20effc67
TL
1861 UniformDistribution(unsigned int _min, unsigned int _max)
1862 : BaseDistribution(_min, _max),
1863 std::uniform_int_distribution<unsigned int>(_min, _max),
1864 gen_(rd_()) {}
1865
f67539c2 1866 private:
1e59de90
TL
1867 virtual unsigned int Get() override { return (*this)(gen_); }
1868 virtual bool NeedTruncate() override { return false; }
f67539c2
TL
1869 std::random_device rd_;
1870 std::mt19937 gen_;
1871};
1872
7c673cae
FG
1873// Helper for quickly generating random data.
1874class RandomGenerator {
1875 private:
1876 std::string data_;
1877 unsigned int pos_;
f67539c2 1878 std::unique_ptr<BaseDistribution> dist_;
7c673cae
FG
1879
1880 public:
1881 RandomGenerator() {
f67539c2
TL
1882 auto max_value_size = FLAGS_value_size_max;
1883 switch (FLAGS_value_size_distribution_type_e) {
1884 case kUniform:
1885 dist_.reset(new UniformDistribution(FLAGS_value_size_min,
1886 FLAGS_value_size_max));
1887 break;
1888 case kNormal:
1e59de90
TL
1889 dist_.reset(
1890 new NormalDistribution(FLAGS_value_size_min, FLAGS_value_size_max));
f67539c2
TL
1891 break;
1892 case kFixed:
1893 default:
1894 dist_.reset(new FixedDistribution(value_size));
1895 max_value_size = value_size;
1896 }
7c673cae
FG
1897 // We use a limited amount of data over and over again and ensure
1898 // that it is larger than the compression window (32KB), and also
1899 // large enough to serve all typical value sizes we want to write.
1900 Random rnd(301);
1901 std::string piece;
f67539c2 1902 while (data_.size() < (unsigned)std::max(1048576, max_value_size)) {
7c673cae
FG
1903 // Add a short fragment that is as compressible as specified
1904 // by FLAGS_compression_ratio.
1905 test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
1906 data_.append(piece);
1907 }
1908 pos_ = 0;
1909 }
1910
1911 Slice Generate(unsigned int len) {
1912 assert(len <= data_.size());
1913 if (pos_ + len > data_.size()) {
1914 pos_ = 0;
1915 }
1916 pos_ += len;
1917 return Slice(data_.data() + pos_ - len, len);
1918 }
11fdf7f2 1919
f67539c2
TL
1920 Slice Generate() {
1921 auto len = dist_->Generate();
1922 return Generate(len);
11fdf7f2 1923 }
7c673cae
FG
1924};
1925
1926static void AppendWithSpace(std::string* str, Slice msg) {
1927 if (msg.empty()) return;
1928 if (!str->empty()) {
1929 str->push_back(' ');
1930 }
1931 str->append(msg.data(), msg.size());
1932}
1933
// Bundles a DB (or OptimisticTransactionDB) with its column family handles
// plus the bookkeeping needed to grow the set of "hot" column families over
// the lifetime of a benchmark run.
struct DBWithColumnFamilies {
  std::vector<ColumnFamilyHandle*> cfh;
  DB* db;
#ifndef ROCKSDB_LITE
  OptimisticTransactionDB* opt_txn_db;
#endif  // ROCKSDB_LITE
  std::atomic<size_t> num_created;  // Need to be updated after all the
                                    // new entries in cfh are set.
  size_t num_hot;  // Number of column families to be queried at each moment.
                   // After each CreateNewCf(), another num_hot number of new
                   // Column families will be created and used to be queried.
  port::Mutex create_cf_mutex;  // Only one thread can execute CreateNewCf()
  std::vector<int> cfh_idx_to_prob;  // ith index holds probability of operating
                                     // on cfh[i].

  DBWithColumnFamilies()
      : db(nullptr)
#ifndef ROCKSDB_LITE
        ,
        opt_txn_db(nullptr)
#endif  // ROCKSDB_LITE
  {
    cfh.clear();
    num_created = 0;
    num_hot = 0;
  }

  DBWithColumnFamilies(const DBWithColumnFamilies& other)
      : cfh(other.cfh),
        db(other.db),
#ifndef ROCKSDB_LITE
        opt_txn_db(other.opt_txn_db),
#endif  // ROCKSDB_LITE
        num_created(other.num_created.load()),
        num_hot(other.num_hot),
        cfh_idx_to_prob(other.cfh_idx_to_prob) {
  }

  // Deletes every column family handle, then the DB itself. When an
  // OptimisticTransactionDB wrapper is present, deleting it also releases
  // the underlying DB, so `db` must not be deleted separately in that case.
  void DeleteDBs() {
    std::for_each(cfh.begin(), cfh.end(),
                  [](ColumnFamilyHandle* cfhi) { delete cfhi; });
    cfh.clear();
#ifndef ROCKSDB_LITE
    if (opt_txn_db) {
      delete opt_txn_db;
      opt_txn_db = nullptr;
    } else {
      delete db;
      db = nullptr;
    }
#else
    delete db;
    db = nullptr;
#endif  // ROCKSDB_LITE
  }

  // Picks one of the `num_hot` most recently created column families.
  // When cfh_idx_to_prob is set, it acts as per-index percentage weights
  // over (rand_num % 100); otherwise the choice is uniform.
  ColumnFamilyHandle* GetCfh(int64_t rand_num) {
    assert(num_hot > 0);
    size_t rand_offset = 0;
    if (!cfh_idx_to_prob.empty()) {
      assert(cfh_idx_to_prob.size() == num_hot);
      int sum = 0;
      while (sum + cfh_idx_to_prob[rand_offset] < rand_num % 100) {
        sum += cfh_idx_to_prob[rand_offset];
        ++rand_offset;
      }
      assert(rand_offset < cfh_idx_to_prob.size());
    } else {
      rand_offset = rand_num % num_hot;
    }
    // Acquire pairs with the release store in CreateNewCf() so the handles
    // written there are visible before the new count is observed.
    return cfh[num_created.load(std::memory_order_acquire) - num_hot +
               rand_offset];
  }

  // stage: assume CF from 0 to stage * num_hot has be created. Need to create
  // stage * num_hot + 1 to stage * (num_hot + 1).
  void CreateNewCf(ColumnFamilyOptions options, int64_t stage) {
    MutexLock l(&create_cf_mutex);
    if ((stage + 1) * num_hot <= num_created) {
      // Already created.
      return;
    }
    auto new_num_created = num_created + num_hot;
    assert(new_num_created <= cfh.size());
    for (size_t i = num_created; i < new_num_created; i++) {
      Status s =
          db->CreateColumnFamily(options, ColumnFamilyName(i), &(cfh[i]));
      if (!s.ok()) {
        fprintf(stderr, "create column family error: %s\n",
                s.ToString().c_str());
        abort();
      }
    }
    // Publish the handles created above (see acquire load in GetCfh()).
    num_created.store(new_num_created, std::memory_order_release);
  }
};
2030
// A class that reports stats to CSV file.
// Spawns a background thread that appends one "secs_elapsed,interval_qps"
// row every report_interval_secs; worker threads feed it via
// ReportFinishedOps(). The thread is stopped and joined in the destructor.
class ReporterAgent {
 public:
  ReporterAgent(Env* env, const std::string& fname,
                uint64_t report_interval_secs)
      : env_(env),
        total_ops_done_(0),
        last_report_(0),
        report_interval_secs_(report_interval_secs),
        stop_(false) {
    auto s = env_->NewWritableFile(fname, &report_file_, EnvOptions());
    if (s.ok()) {
      s = report_file_->Append(Header() + "\n");
    }
    if (s.ok()) {
      s = report_file_->Flush();
    }
    if (!s.ok()) {
      fprintf(stderr, "Can't open %s: %s\n", fname.c_str(),
              s.ToString().c_str());
      abort();
    }

    // Capturing by reference is safe here: the thread is joined in the
    // destructor, so it never outlives this object.
    reporting_thread_ = port::Thread([&]() { SleepAndReport(); });
  }

  ~ReporterAgent() {
    {
      std::unique_lock<std::mutex> lk(mutex_);
      stop_ = true;
      stop_cv_.notify_all();
    }
    reporting_thread_.join();
  }

  // thread safe
  void ReportFinishedOps(int64_t num_ops) {
    total_ops_done_.fetch_add(num_ops);
  }

 private:
  std::string Header() const { return "secs_elapsed,interval_qps"; }

  // Background loop: wake up every report interval (or on stop), compute
  // the ops completed since the last report, and append a CSV row.
  void SleepAndReport() {
    auto* clock = env_->GetSystemClock().get();
    auto time_started = clock->NowMicros();
    while (true) {
      {
        std::unique_lock<std::mutex> lk(mutex_);
        if (stop_ ||
            stop_cv_.wait_for(lk, std::chrono::seconds(report_interval_secs_),
                              [&]() { return stop_; })) {
          // stopping
          break;
        }
        // else -> timeout, which means time for a report!
      }
      auto total_ops_done_snapshot = total_ops_done_.load();
      // round the seconds elapsed
      auto secs_elapsed =
          (clock->NowMicros() - time_started + kMicrosInSecond / 2) /
          kMicrosInSecond;
      std::string report =
          std::to_string(secs_elapsed) + "," +
          std::to_string(total_ops_done_snapshot - last_report_) + "\n";
      auto s = report_file_->Append(report);
      if (s.ok()) {
        s = report_file_->Flush();
      }
      if (!s.ok()) {
        fprintf(stderr,
                "Can't write to report file (%s), stopping the reporting\n",
                s.ToString().c_str());
        break;
      }
      last_report_ = total_ops_done_snapshot;
    }
  }

  Env* env_;
  std::unique_ptr<WritableFile> report_file_;
  std::atomic<int64_t> total_ops_done_;
  int64_t last_report_;  // only touched by the reporting thread
  const uint64_t report_interval_secs_;
  ROCKSDB_NAMESPACE::port::Thread reporting_thread_;
  std::mutex mutex_;
  // will notify on stop
  std::condition_variable stop_cv_;
  bool stop_;  // guarded by mutex_
};
2120
// Operation categories used to bucket per-operation latency histograms.
enum OperationType : unsigned char {
  kRead = 0,
  kWrite,
  kDelete,
  kSeek,
  kMerge,
  kUpdate,
  kCompress,
  kUncompress,
  kCrc,
  kHash,
  kOthers
};

// Human-readable name for each OperationType, used when printing histogram
// headers. Fix: the "uncompress" entry was previously keyed by kCompress
// (a duplicate key that was silently dropped), leaving kUncompress unnamed.
static std::unordered_map<OperationType, std::string, std::hash<unsigned char>>
    OperationTypeString = {{kRead, "read"},         {kWrite, "write"},
                           {kDelete, "delete"},     {kSeek, "seek"},
                           {kMerge, "merge"},       {kUpdate, "update"},
                           {kCompress, "compress"}, {kUncompress, "uncompress"},
                           {kCrc, "crc"},           {kHash, "hash"},
                           {kOthers, "op"}};
7c673cae
FG
2142
2143class CombinedStats;
2144class Stats {
2145 private:
1e59de90 2146 SystemClock* clock_;
7c673cae 2147 int id_;
20effc67 2148 uint64_t start_ = 0;
11fdf7f2 2149 uint64_t sine_interval_;
7c673cae
FG
2150 uint64_t finish_;
2151 double seconds_;
2152 uint64_t done_;
2153 uint64_t last_report_done_;
2154 uint64_t next_report_;
2155 uint64_t bytes_;
2156 uint64_t last_op_finish_;
2157 uint64_t last_report_finish_;
2158 std::unordered_map<OperationType, std::shared_ptr<HistogramImpl>,
1e59de90
TL
2159 std::hash<unsigned char>>
2160 hist_;
7c673cae
FG
2161 std::string message_;
2162 bool exclude_from_merge_;
2163 ReporterAgent* reporter_agent_; // does not own
2164 friend class CombinedStats;
2165
2166 public:
1e59de90 2167 Stats() : clock_(FLAGS_env->GetSystemClock().get()) { Start(-1); }
7c673cae
FG
2168
2169 void SetReporterAgent(ReporterAgent* reporter_agent) {
2170 reporter_agent_ = reporter_agent;
2171 }
2172
2173 void Start(int id) {
2174 id_ = id;
2175 next_report_ = FLAGS_stats_interval ? FLAGS_stats_interval : 100;
2176 last_op_finish_ = start_;
2177 hist_.clear();
2178 done_ = 0;
2179 last_report_done_ = 0;
2180 bytes_ = 0;
2181 seconds_ = 0;
1e59de90
TL
2182 start_ = clock_->NowMicros();
2183 sine_interval_ = clock_->NowMicros();
7c673cae
FG
2184 finish_ = start_;
2185 last_report_finish_ = start_;
2186 message_.clear();
2187 // When set, stats from this thread won't be merged with others.
2188 exclude_from_merge_ = false;
2189 }
2190
2191 void Merge(const Stats& other) {
1e59de90 2192 if (other.exclude_from_merge_) return;
7c673cae
FG
2193
2194 for (auto it = other.hist_.begin(); it != other.hist_.end(); ++it) {
2195 auto this_it = hist_.find(it->first);
2196 if (this_it != hist_.end()) {
2197 this_it->second->Merge(*(other.hist_.at(it->first)));
2198 } else {
1e59de90 2199 hist_.insert({it->first, it->second});
7c673cae
FG
2200 }
2201 }
2202
2203 done_ += other.done_;
2204 bytes_ += other.bytes_;
2205 seconds_ += other.seconds_;
2206 if (other.start_ < start_) start_ = other.start_;
2207 if (other.finish_ > finish_) finish_ = other.finish_;
2208
1e59de90 2209 // Just keep the messages from one thread.
7c673cae
FG
2210 if (message_.empty()) message_ = other.message_;
2211 }
2212
2213 void Stop() {
1e59de90 2214 finish_ = clock_->NowMicros();
7c673cae
FG
2215 seconds_ = (finish_ - start_) * 1e-6;
2216 }
2217
1e59de90 2218 void AddMessage(Slice msg) { AppendWithSpace(&message_, msg); }
7c673cae
FG
2219
2220 void SetId(int id) { id_ = id; }
2221 void SetExcludeFromMerge() { exclude_from_merge_ = true; }
2222
2223 void PrintThreadStatus() {
2224 std::vector<ThreadStatus> thread_list;
2225 FLAGS_env->GetThreadList(&thread_list);
2226
1e59de90
TL
2227 fprintf(stderr, "\n%18s %10s %12s %20s %13s %45s %12s %s\n", "ThreadID",
2228 "ThreadType", "cfName", "Operation", "ElapsedTime", "Stage",
2229 "State", "OperationProperties");
7c673cae
FG
2230
2231 int64_t current_time = 0;
1e59de90 2232 clock_->GetCurrentTime(&current_time).PermitUncheckedError();
7c673cae
FG
2233 for (auto ts : thread_list) {
2234 fprintf(stderr, "%18" PRIu64 " %10s %12s %20s %13s %45s %12s",
1e59de90
TL
2235 ts.thread_id,
2236 ThreadStatus::GetThreadTypeName(ts.thread_type).c_str(),
2237 ts.cf_name.c_str(),
2238 ThreadStatus::GetOperationName(ts.operation_type).c_str(),
2239 ThreadStatus::MicrosToString(ts.op_elapsed_micros).c_str(),
2240 ThreadStatus::GetOperationStageName(ts.operation_stage).c_str(),
2241 ThreadStatus::GetStateName(ts.state_type).c_str());
7c673cae
FG
2242
2243 auto op_properties = ThreadStatus::InterpretOperationProperties(
2244 ts.operation_type, ts.op_properties);
2245 for (const auto& op_prop : op_properties) {
1e59de90
TL
2246 fprintf(stderr, " %s %" PRIu64 " |", op_prop.first.c_str(),
2247 op_prop.second);
7c673cae
FG
2248 }
2249 fprintf(stderr, "\n");
2250 }
2251 }
2252
1e59de90 2253 void ResetSineInterval() { sine_interval_ = clock_->NowMicros(); }
11fdf7f2 2254
1e59de90 2255 uint64_t GetSineInterval() { return sine_interval_; }
11fdf7f2 2256
1e59de90 2257 uint64_t GetStart() { return start_; }
11fdf7f2 2258
7c673cae 2259 void ResetLastOpTime() {
1e59de90
TL
2260 // Set to now to avoid latency from calls to SleepForMicroseconds.
2261 last_op_finish_ = clock_->NowMicros();
7c673cae
FG
2262 }
2263
2264 void FinishedOps(DBWithColumnFamilies* db_with_cfh, DB* db, int64_t num_ops,
2265 enum OperationType op_type = kOthers) {
2266 if (reporter_agent_) {
2267 reporter_agent_->ReportFinishedOps(num_ops);
2268 }
2269 if (FLAGS_histogram) {
1e59de90 2270 uint64_t now = clock_->NowMicros();
7c673cae
FG
2271 uint64_t micros = now - last_op_finish_;
2272
1e59de90 2273 if (hist_.find(op_type) == hist_.end()) {
7c673cae
FG
2274 auto hist_temp = std::make_shared<HistogramImpl>();
2275 hist_.insert({op_type, std::move(hist_temp)});
2276 }
2277 hist_[op_type]->Add(micros);
2278
1e59de90 2279 if (micros >= FLAGS_slow_usecs && !FLAGS_stats_interval) {
7c673cae
FG
2280 fprintf(stderr, "long op: %" PRIu64 " micros%30s\r", micros, "");
2281 fflush(stderr);
2282 }
2283 last_op_finish_ = now;
2284 }
2285
2286 done_ += num_ops;
1e59de90 2287 if (done_ >= next_report_ && FLAGS_progress_reports) {
7c673cae 2288 if (!FLAGS_stats_interval) {
1e59de90
TL
2289 if (next_report_ < 1000)
2290 next_report_ += 100;
2291 else if (next_report_ < 5000)
2292 next_report_ += 500;
2293 else if (next_report_ < 10000)
2294 next_report_ += 1000;
2295 else if (next_report_ < 50000)
2296 next_report_ += 5000;
2297 else if (next_report_ < 100000)
2298 next_report_ += 10000;
2299 else if (next_report_ < 500000)
2300 next_report_ += 50000;
2301 else
2302 next_report_ += 100000;
7c673cae
FG
2303 fprintf(stderr, "... finished %" PRIu64 " ops%30s\r", done_, "");
2304 } else {
1e59de90 2305 uint64_t now = clock_->NowMicros();
7c673cae
FG
2306 int64_t usecs_since_last = now - last_report_finish_;
2307
2308 // Determine whether to print status where interval is either
2309 // each N operations or each N seconds.
2310
2311 if (FLAGS_stats_interval_seconds &&
2312 usecs_since_last < (FLAGS_stats_interval_seconds * 1000000)) {
1e59de90 2313 // Don't check again for this many operations.
7c673cae
FG
2314 next_report_ += FLAGS_stats_interval;
2315
2316 } else {
7c673cae 2317 fprintf(stderr,
1e59de90
TL
2318 "%s ... thread %d: (%" PRIu64 ",%" PRIu64
2319 ") ops and "
7c673cae 2320 "(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n",
1e59de90 2321 clock_->TimeToString(now / 1000000).c_str(), id_,
7c673cae 2322 done_ - last_report_done_, done_,
1e59de90 2323 (done_ - last_report_done_) / (usecs_since_last / 1000000.0),
7c673cae
FG
2324 done_ / ((now - start_) / 1000000.0),
2325 (now - last_report_finish_) / 1000000.0,
2326 (now - start_) / 1000000.0);
2327
2328 if (id_ == 0 && FLAGS_stats_per_interval) {
2329 std::string stats;
2330
2331 if (db_with_cfh && db_with_cfh->num_created.load()) {
2332 for (size_t i = 0; i < db_with_cfh->num_created.load(); ++i) {
2333 if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats",
2334 &stats))
2335 fprintf(stderr, "%s\n", stats.c_str());
2336 if (FLAGS_show_table_properties) {
2337 for (int level = 0; level < FLAGS_num_levels; ++level) {
2338 if (db->GetProperty(
2339 db_with_cfh->cfh[i],
2340 "rocksdb.aggregated-table-properties-at-level" +
1e59de90 2341 std::to_string(level),
7c673cae
FG
2342 &stats)) {
2343 if (stats.find("# entries=0") == std::string::npos) {
2344 fprintf(stderr, "Level[%d]: %s\n", level,
2345 stats.c_str());
2346 }
2347 }
2348 }
2349 }
2350 }
2351 } else if (db) {
2352 if (db->GetProperty("rocksdb.stats", &stats)) {
1e59de90
TL
2353 fprintf(stderr, "%s", stats.c_str());
2354 }
2355 if (db->GetProperty("rocksdb.num-running-compactions", &stats)) {
2356 fprintf(stderr, "num-running-compactions: %s\n", stats.c_str());
2357 }
2358 if (db->GetProperty("rocksdb.num-running-flushes", &stats)) {
2359 fprintf(stderr, "num-running-flushes: %s\n\n", stats.c_str());
7c673cae
FG
2360 }
2361 if (FLAGS_show_table_properties) {
2362 for (int level = 0; level < FLAGS_num_levels; ++level) {
2363 if (db->GetProperty(
2364 "rocksdb.aggregated-table-properties-at-level" +
1e59de90 2365 std::to_string(level),
7c673cae
FG
2366 &stats)) {
2367 if (stats.find("# entries=0") == std::string::npos) {
2368 fprintf(stderr, "Level[%d]: %s\n", level, stats.c_str());
2369 }
2370 }
2371 }
2372 }
2373 }
2374 }
2375
2376 next_report_ += FLAGS_stats_interval;
2377 last_report_finish_ = now;
2378 last_report_done_ = done_;
2379 }
2380 }
2381 if (id_ == 0 && FLAGS_thread_status_per_interval) {
2382 PrintThreadStatus();
2383 }
2384 fflush(stderr);
2385 }
2386 }
2387
1e59de90 2388 void AddBytes(int64_t n) { bytes_ += n; }
7c673cae
FG
2389
2390 void Report(const Slice& name) {
2391 // Pretend at least one op was done in case we are running a benchmark
2392 // that does not call FinishedOps().
2393 if (done_ < 1) done_ = 1;
2394
2395 std::string extra;
1e59de90 2396 double elapsed = (finish_ - start_) * 1e-6;
7c673cae
FG
2397 if (bytes_ > 0) {
2398 // Rate is computed on actual elapsed time, not the sum of per-thread
2399 // elapsed times.
7c673cae
FG
2400 char rate[100];
2401 snprintf(rate, sizeof(rate), "%6.1f MB/s",
2402 (bytes_ / 1048576.0) / elapsed);
2403 extra = rate;
2404 }
2405 AppendWithSpace(&extra, message_);
1e59de90
TL
2406 double throughput = (double)done_ / elapsed;
2407
2408 fprintf(stdout,
2409 "%-12s : %11.3f micros/op %ld ops/sec %.3f seconds %" PRIu64
2410 " operations;%s%s\n",
2411 name.ToString().c_str(), seconds_ * 1e6 / done_, (long)throughput,
2412 elapsed, done_, (extra.empty() ? "" : " "), extra.c_str());
7c673cae
FG
2413 if (FLAGS_histogram) {
2414 for (auto it = hist_.begin(); it != hist_.end(); ++it) {
2415 fprintf(stdout, "Microseconds per %s:\n%s\n",
2416 OperationTypeString[it->first].c_str(),
2417 it->second->ToString().c_str());
2418 }
2419 }
2420 if (FLAGS_report_file_operations) {
1e59de90
TL
2421 auto* counted_fs =
2422 FLAGS_env->GetFileSystem()->CheckedCast<CountedFileSystem>();
2423 assert(counted_fs);
2424 fprintf(stdout, "%s", counted_fs->PrintCounters().c_str());
2425 counted_fs->ResetCounters();
7c673cae
FG
2426 }
2427 fflush(stdout);
2428 }
2429};
2430
2431class CombinedStats {
2432 public:
2433 void AddStats(const Stats& stat) {
2434 uint64_t total_ops = stat.done_;
2435 uint64_t total_bytes_ = stat.bytes_;
2436 double elapsed;
2437
2438 if (total_ops < 1) {
2439 total_ops = 1;
2440 }
2441
2442 elapsed = (stat.finish_ - stat.start_) * 1e-6;
2443 throughput_ops_.emplace_back(total_ops / elapsed);
2444
2445 if (total_bytes_ > 0) {
2446 double mbs = (total_bytes_ / 1048576.0);
2447 throughput_mbs_.emplace_back(mbs / elapsed);
2448 }
2449 }
2450
2451 void Report(const std::string& bench_name) {
1e59de90
TL
2452 if (throughput_ops_.size() < 2) {
2453 // skip if there are not enough samples
2454 return;
2455 }
2456
2457 const char* name = bench_name.c_str();
2458 int num_runs = static_cast<int>(throughput_ops_.size());
2459
2460 if (throughput_mbs_.size() == throughput_ops_.size()) {
2461 fprintf(stdout,
2462 "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec; %6.1f (\xC2\xB1 "
2463 "%.1f) MB/sec\n",
2464 name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
2465 static_cast<int>(CalcConfidence95(throughput_ops_)),
2466 CalcAvg(throughput_mbs_), CalcConfidence95(throughput_mbs_));
2467 } else {
2468 fprintf(stdout, "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec\n", name,
2469 num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
2470 static_cast<int>(CalcConfidence95(throughput_ops_)));
2471 }
2472 }
2473
2474 void ReportWithConfidenceIntervals(const std::string& bench_name) {
2475 if (throughput_ops_.size() < 2) {
2476 // skip if there are not enough samples
2477 return;
2478 }
2479
2480 const char* name = bench_name.c_str();
2481 int num_runs = static_cast<int>(throughput_ops_.size());
2482
2483 int ops_avg = static_cast<int>(CalcAvg(throughput_ops_));
2484 int ops_confidence_95 = static_cast<int>(CalcConfidence95(throughput_ops_));
2485
2486 if (throughput_mbs_.size() == throughput_ops_.size()) {
2487 double mbs_avg = CalcAvg(throughput_mbs_);
2488 double mbs_confidence_95 = CalcConfidence95(throughput_mbs_);
2489 fprintf(stdout,
2490 "%s [CI95 %d runs] : (%d, %d) ops/sec; (%.1f, %.1f) MB/sec\n",
2491 name, num_runs, ops_avg - ops_confidence_95,
2492 ops_avg + ops_confidence_95, mbs_avg - mbs_confidence_95,
2493 mbs_avg + mbs_confidence_95);
2494 } else {
2495 fprintf(stdout, "%s [CI95 %d runs] : (%d, %d) ops/sec\n", name, num_runs,
2496 ops_avg - ops_confidence_95, ops_avg + ops_confidence_95);
2497 }
2498 }
2499
2500 void ReportFinal(const std::string& bench_name) {
2501 if (throughput_ops_.size() < 2) {
2502 // skip if there are not enough samples
2503 return;
2504 }
2505
7c673cae
FG
2506 const char* name = bench_name.c_str();
2507 int num_runs = static_cast<int>(throughput_ops_.size());
2508
2509 if (throughput_mbs_.size() == throughput_ops_.size()) {
1e59de90 2510 // \xC2\xB1 is +/- character in UTF-8
7c673cae 2511 fprintf(stdout,
1e59de90
TL
2512 "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec; %6.1f (\xC2\xB1 "
2513 "%.1f) MB/sec\n"
7c673cae
FG
2514 "%s [MEDIAN %d runs] : %d ops/sec; %6.1f MB/sec\n",
2515 name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
1e59de90
TL
2516 static_cast<int>(CalcConfidence95(throughput_ops_)),
2517 CalcAvg(throughput_mbs_), CalcConfidence95(throughput_mbs_), name,
2518 num_runs, static_cast<int>(CalcMedian(throughput_ops_)),
7c673cae
FG
2519 CalcMedian(throughput_mbs_));
2520 } else {
2521 fprintf(stdout,
1e59de90 2522 "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec\n"
7c673cae 2523 "%s [MEDIAN %d runs] : %d ops/sec\n",
1e59de90
TL
2524 name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
2525 static_cast<int>(CalcConfidence95(throughput_ops_)), name,
7c673cae
FG
2526 num_runs, static_cast<int>(CalcMedian(throughput_ops_)));
2527 }
2528 }
2529
2530 private:
1e59de90 2531 double CalcAvg(std::vector<double>& data) {
7c673cae
FG
2532 double avg = 0;
2533 for (double x : data) {
2534 avg += x;
2535 }
2536 avg = avg / data.size();
2537 return avg;
2538 }
2539
1e59de90
TL
2540 // Calculates 95% CI assuming a normal distribution of samples.
2541 // Samples are not from a normal distribution, but it still
2542 // provides useful approximation.
2543 double CalcConfidence95(std::vector<double>& data) {
2544 assert(data.size() > 1);
2545 double avg = CalcAvg(data);
2546 double std_error = CalcStdDev(data, avg) / std::sqrt(data.size());
2547
2548 // Z score for the 97.5 percentile
2549 // see https://en.wikipedia.org/wiki/1.96
2550 return 1.959964 * std_error;
2551 }
2552
2553 double CalcMedian(std::vector<double>& data) {
7c673cae
FG
2554 assert(data.size() > 0);
2555 std::sort(data.begin(), data.end());
2556
2557 size_t mid = data.size() / 2;
2558 if (data.size() % 2 == 1) {
2559 // Odd number of entries
2560 return data[mid];
2561 } else {
2562 // Even number of entries
2563 return (data[mid] + data[mid - 1]) / 2;
2564 }
2565 }
2566
1e59de90
TL
2567 double CalcStdDev(std::vector<double>& data, double average) {
2568 assert(data.size() > 1);
2569 double squared_sum = 0.0;
2570 for (double x : data) {
2571 squared_sum += std::pow(x - average, 2);
2572 }
2573
2574 // using samples count - 1 following Bessel's correction
2575 // see https://en.wikipedia.org/wiki/Bessel%27s_correction
2576 return std::sqrt(squared_sum / (data.size() - 1));
2577 }
2578
7c673cae
FG
2579 std::vector<double> throughput_ops_;
2580 std::vector<double> throughput_mbs_;
2581};
2582
2583class TimestampEmulator {
2584 private:
2585 std::atomic<uint64_t> timestamp_;
2586
2587 public:
2588 TimestampEmulator() : timestamp_(0) {}
2589 uint64_t Get() const { return timestamp_.load(); }
2590 void Inc() { timestamp_++; }
20effc67
TL
2591 Slice Allocate(char* scratch) {
2592 // TODO: support larger timestamp sizes
2593 assert(FLAGS_user_timestamp_size == 8);
2594 assert(scratch);
2595 uint64_t ts = timestamp_.fetch_add(1);
2596 EncodeFixed64(scratch, ts);
2597 return Slice(scratch, FLAGS_user_timestamp_size);
2598 }
2599 Slice GetTimestampForRead(Random64& rand, char* scratch) {
2600 assert(FLAGS_user_timestamp_size == 8);
2601 assert(scratch);
2602 if (FLAGS_read_with_latest_user_timestamp) {
2603 return Allocate(scratch);
2604 }
2605 // Choose a random timestamp from the past.
2606 uint64_t ts = rand.Next() % Get();
2607 EncodeFixed64(scratch, ts);
2608 return Slice(scratch, FLAGS_user_timestamp_size);
2609 }
7c673cae
FG
2610};
2611
2612// State shared by all concurrent executions of the same benchmark.
2613struct SharedState {
2614 port::Mutex mu;
2615 port::CondVar cv;
2616 int total;
2617 int perf_level;
2618 std::shared_ptr<RateLimiter> write_rate_limiter;
2619 std::shared_ptr<RateLimiter> read_rate_limiter;
2620
2621 // Each thread goes through the following states:
2622 // (1) initializing
2623 // (2) waiting for others to be initialized
2624 // (3) running
2625 // (4) done
2626
2627 long num_initialized;
2628 long num_done;
2629 bool start;
2630
1e59de90 2631 SharedState() : cv(&mu), perf_level(FLAGS_perf_level) {}
7c673cae
FG
2632};
2633
2634// Per-thread state for concurrent executions of the same benchmark.
2635struct ThreadState {
1e59de90
TL
2636 int tid; // 0..n-1 when running in n threads
2637 Random64 rand; // Has different seeds for different threads
7c673cae
FG
2638 Stats stats;
2639 SharedState* shared;
2640
1e59de90
TL
2641 explicit ThreadState(int index, int my_seed)
2642 : tid(index), rand(seed_base + my_seed) {}
7c673cae
FG
2643};
2644
2645class Duration {
2646 public:
2647 Duration(uint64_t max_seconds, int64_t max_ops, int64_t ops_per_stage = 0) {
2648 max_seconds_ = max_seconds;
1e59de90 2649 max_ops_ = max_ops;
7c673cae
FG
2650 ops_per_stage_ = (ops_per_stage > 0) ? ops_per_stage : max_ops;
2651 ops_ = 0;
2652 start_at_ = FLAGS_env->NowMicros();
2653 }
2654
2655 int64_t GetStage() { return std::min(ops_, max_ops_ - 1) / ops_per_stage_; }
2656
2657 bool Done(int64_t increment) {
1e59de90 2658 if (increment <= 0) increment = 1; // avoid Done(0) and infinite loops
7c673cae
FG
2659 ops_ += increment;
2660
2661 if (max_seconds_) {
2662 // Recheck every appx 1000 ops (exact iff increment is factor of 1000)
11fdf7f2
TL
2663 auto granularity = FLAGS_ops_between_duration_checks;
2664 if ((ops_ / granularity) != ((ops_ - increment) / granularity)) {
7c673cae
FG
2665 uint64_t now = FLAGS_env->NowMicros();
2666 return ((now - start_at_) / 1000000) >= max_seconds_;
2667 } else {
2668 return false;
2669 }
2670 } else {
2671 return ops_ > max_ops_;
2672 }
2673 }
2674
2675 private:
2676 uint64_t max_seconds_;
2677 int64_t max_ops_;
2678 int64_t ops_per_stage_;
2679 int64_t ops_;
2680 uint64_t start_at_;
2681};
2682
2683class Benchmark {
2684 private:
2685 std::shared_ptr<Cache> cache_;
2686 std::shared_ptr<Cache> compressed_cache_;
1e59de90 2687 std::shared_ptr<const SliceTransform> prefix_extractor_;
7c673cae
FG
2688 DBWithColumnFamilies db_;
2689 std::vector<DBWithColumnFamilies> multi_dbs_;
2690 int64_t num_;
7c673cae 2691 int key_size_;
20effc67 2692 int user_timestamp_size_;
7c673cae 2693 int prefix_size_;
1e59de90 2694 int total_thread_count_;
7c673cae
FG
2695 int64_t keys_per_prefix_;
2696 int64_t entries_per_batch_;
494da23a 2697 int64_t writes_before_delete_range_;
7c673cae
FG
2698 int64_t writes_per_range_tombstone_;
2699 int64_t range_tombstone_width_;
2700 int64_t max_num_range_tombstones_;
1e59de90 2701 ReadOptions read_options_;
7c673cae
FG
2702 WriteOptions write_options_;
2703 Options open_options_; // keep options around to properly destroy db later
494da23a 2704#ifndef ROCKSDB_LITE
11fdf7f2 2705 TraceOptions trace_options_;
f67539c2 2706 TraceOptions block_cache_trace_options_;
494da23a 2707#endif
7c673cae
FG
2708 int64_t reads_;
2709 int64_t deletes_;
2710 double read_random_exp_range_;
2711 int64_t writes_;
2712 int64_t readwrites_;
2713 int64_t merge_keys_;
2714 bool report_file_operations_;
1e59de90
TL
2715 bool use_blob_db_; // Stacked BlobDB
2716 bool read_operands_; // read via GetMergeOperands()
494da23a 2717 std::vector<std::string> keys_;
11fdf7f2
TL
2718
2719 class ErrorHandlerListener : public EventListener {
2720 public:
494da23a 2721#ifndef ROCKSDB_LITE
11fdf7f2
TL
2722 ErrorHandlerListener()
2723 : mutex_(),
2724 cv_(&mutex_),
2725 no_auto_recovery_(false),
2726 recovery_complete_(false) {}
2727
494da23a 2728 ~ErrorHandlerListener() override {}
11fdf7f2 2729
1e59de90
TL
2730 const char* Name() const override { return kClassName(); }
2731 static const char* kClassName() { return "ErrorHandlerListener"; }
2732
11fdf7f2 2733 void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/,
494da23a
TL
2734 Status /*bg_error*/,
2735 bool* auto_recovery) override {
11fdf7f2
TL
2736 if (*auto_recovery && no_auto_recovery_) {
2737 *auto_recovery = false;
2738 }
2739 }
2740
494da23a 2741 void OnErrorRecoveryCompleted(Status /*old_bg_error*/) override {
11fdf7f2
TL
2742 InstrumentedMutexLock l(&mutex_);
2743 recovery_complete_ = true;
2744 cv_.SignalAll();
2745 }
2746
f67539c2 2747 bool WaitForRecovery(uint64_t abs_time_us) {
11fdf7f2
TL
2748 InstrumentedMutexLock l(&mutex_);
2749 if (!recovery_complete_) {
f67539c2 2750 cv_.TimedWait(abs_time_us);
11fdf7f2
TL
2751 }
2752 if (recovery_complete_) {
2753 recovery_complete_ = false;
2754 return true;
2755 }
2756 return false;
2757 }
2758
2759 void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; }
2760
2761 private:
2762 InstrumentedMutex mutex_;
2763 InstrumentedCondVar cv_;
2764 bool no_auto_recovery_;
2765 bool recovery_complete_;
494da23a
TL
2766#else // ROCKSDB_LITE
2767 bool WaitForRecovery(uint64_t /*abs_time_us*/) { return true; }
2768 void EnableAutoRecovery(bool /*enable*/) {}
2769#endif // ROCKSDB_LITE
11fdf7f2
TL
2770 };
2771
2772 std::shared_ptr<ErrorHandlerListener> listener_;
7c673cae 2773
20effc67
TL
2774 std::unique_ptr<TimestampEmulator> mock_app_clock_;
2775
7c673cae
FG
2776 bool SanityCheck() {
2777 if (FLAGS_compression_ratio > 1) {
2778 fprintf(stderr, "compression_ratio should be between 0 and 1\n");
2779 return false;
2780 }
2781 return true;
2782 }
2783
494da23a 2784 inline bool CompressSlice(const CompressionInfo& compression_info,
11fdf7f2 2785 const Slice& input, std::string* compressed) {
20effc67
TL
2786 constexpr uint32_t compress_format_version = 2;
2787
2788 return CompressData(input, compression_info, compress_format_version,
2789 compressed);
7c673cae
FG
2790 }
2791
1e59de90 2792 void PrintHeader(const Options& options) {
7c673cae 2793 PrintEnvironment();
20effc67
TL
2794 fprintf(stdout,
2795 "Keys: %d bytes each (+ %d bytes user-defined timestamp)\n",
2796 FLAGS_key_size, FLAGS_user_timestamp_size);
f67539c2
TL
2797 auto avg_value_size = FLAGS_value_size;
2798 if (FLAGS_value_size_distribution_type_e == kFixed) {
1e59de90
TL
2799 fprintf(stdout,
2800 "Values: %d bytes each (%d bytes after compression)\n",
f67539c2
TL
2801 avg_value_size,
2802 static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
2803 } else {
2804 avg_value_size = (FLAGS_value_size_min + FLAGS_value_size_max) / 2;
1e59de90
TL
2805 fprintf(stdout,
2806 "Values: %d avg bytes each (%d bytes after compression)\n",
f67539c2
TL
2807 avg_value_size,
2808 static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
2809 fprintf(stdout, "Values Distribution: %s (min: %d, max: %d)\n",
1e59de90
TL
2810 FLAGS_value_size_distribution_type.c_str(), FLAGS_value_size_min,
2811 FLAGS_value_size_max);
f67539c2 2812 }
7c673cae
FG
2813 fprintf(stdout, "Entries: %" PRIu64 "\n", num_);
2814 fprintf(stdout, "Prefix: %d bytes\n", FLAGS_prefix_size);
2815 fprintf(stdout, "Keys per prefix: %" PRIu64 "\n", keys_per_prefix_);
2816 fprintf(stdout, "RawSize: %.1f MB (estimated)\n",
1e59de90
TL
2817 ((static_cast<int64_t>(FLAGS_key_size + avg_value_size) * num_) /
2818 1048576.0));
2819 fprintf(
2820 stdout, "FileSize: %.1f MB (estimated)\n",
2821 (((FLAGS_key_size + avg_value_size * FLAGS_compression_ratio) * num_) /
2822 1048576.0));
7c673cae
FG
2823 fprintf(stdout, "Write rate: %" PRIu64 " bytes/second\n",
2824 FLAGS_benchmark_write_rate_limit);
2825 fprintf(stdout, "Read rate: %" PRIu64 " ops/second\n",
2826 FLAGS_benchmark_read_rate_limit);
2827 if (FLAGS_enable_numa) {
2828 fprintf(stderr, "Running in NUMA enabled mode.\n");
2829#ifndef NUMA
2830 fprintf(stderr, "NUMA is not defined in the system.\n");
2831 exit(1);
2832#else
2833 if (numa_available() == -1) {
2834 fprintf(stderr, "NUMA is not supported by the system.\n");
2835 exit(1);
2836 }
2837#endif
2838 }
2839
2840 auto compression = CompressionTypeToString(FLAGS_compression_type_e);
2841 fprintf(stdout, "Compression: %s\n", compression.c_str());
494da23a
TL
2842 fprintf(stdout, "Compression sampling rate: %" PRId64 "\n",
2843 FLAGS_sample_for_compression);
1e59de90
TL
2844 if (options.memtable_factory != nullptr) {
2845 fprintf(stdout, "Memtablerep: %s\n",
2846 options.memtable_factory->GetId().c_str());
7c673cae
FG
2847 }
2848 fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);
2849
2850 PrintWarnings(compression.c_str());
2851 fprintf(stdout, "------------------------------------------------\n");
2852 }
2853
2854 void PrintWarnings(const char* compression) {
2855#if defined(__GNUC__) && !defined(__OPTIMIZE__)
1e59de90
TL
2856 fprintf(
2857 stdout,
2858 "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n");
7c673cae
FG
2859#endif
2860#ifndef NDEBUG
2861 fprintf(stdout,
2862 "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
2863#endif
f67539c2 2864 if (FLAGS_compression_type_e != ROCKSDB_NAMESPACE::kNoCompression) {
7c673cae
FG
2865 // The test string should not be too small.
2866 const int len = FLAGS_block_size;
2867 std::string input_str(len, 'y');
2868 std::string compressed;
494da23a
TL
2869 CompressionOptions opts;
2870 CompressionContext context(FLAGS_compression_type_e);
2871 CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
2872 FLAGS_compression_type_e,
2873 FLAGS_sample_for_compression);
2874 bool result = CompressSlice(info, Slice(input_str), &compressed);
7c673cae
FG
2875
2876 if (!result) {
2877 fprintf(stdout, "WARNING: %s compression is not enabled\n",
2878 compression);
2879 } else if (compressed.size() >= input_str.size()) {
2880 fprintf(stdout, "WARNING: %s compression is not effective\n",
2881 compression);
2882 }
2883 }
2884 }
2885
2886// Current the following isn't equivalent to OS_LINUX.
2887#if defined(__linux)
2888 static Slice TrimSpace(Slice s) {
2889 unsigned int start = 0;
2890 while (start < s.size() && isspace(s[start])) {
2891 start++;
2892 }
2893 unsigned int limit = static_cast<unsigned int>(s.size());
1e59de90 2894 while (limit > start && isspace(s[limit - 1])) {
7c673cae
FG
2895 limit--;
2896 }
2897 return Slice(s.data() + start, limit - start);
2898 }
2899#endif
2900
2901 void PrintEnvironment() {
1e59de90
TL
2902 fprintf(stderr, "RocksDB: version %s\n",
2903 GetRocksVersionAsString(true).c_str());
7c673cae 2904
1e59de90 2905#if defined(__linux) || defined(__APPLE__) || defined(__FreeBSD__)
7c673cae
FG
2906 time_t now = time(nullptr);
2907 char buf[52];
2908 // Lint complains about ctime() usage, so replace it with ctime_r(). The
2909 // requirement is to provide a buffer which is at least 26 bytes.
2910 fprintf(stderr, "Date: %s",
2911 ctime_r(&now, buf)); // ctime_r() adds newline
2912
1e59de90 2913#if defined(__linux)
7c673cae
FG
2914 FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
2915 if (cpuinfo != nullptr) {
2916 char line[1000];
2917 int num_cpus = 0;
2918 std::string cpu_type;
2919 std::string cache_size;
2920 while (fgets(line, sizeof(line), cpuinfo) != nullptr) {
2921 const char* sep = strchr(line, ':');
2922 if (sep == nullptr) {
2923 continue;
2924 }
2925 Slice key = TrimSpace(Slice(line, sep - 1 - line));
2926 Slice val = TrimSpace(Slice(sep + 1));
2927 if (key == "model name") {
2928 ++num_cpus;
2929 cpu_type = val.ToString();
2930 } else if (key == "cache size") {
2931 cache_size = val.ToString();
2932 }
2933 }
2934 fclose(cpuinfo);
2935 fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str());
2936 fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
2937 }
1e59de90
TL
2938#elif defined(__APPLE__)
2939 struct host_basic_info h;
2940 size_t hlen = HOST_BASIC_INFO_COUNT;
2941 if (host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&h,
2942 (uint32_t*)&hlen) == KERN_SUCCESS) {
2943 std::string cpu_type;
2944 std::string cache_size;
2945 size_t hcache_size;
2946 hlen = sizeof(hcache_size);
2947 if (sysctlbyname("hw.cachelinesize", &hcache_size, &hlen, NULL, 0) == 0) {
2948 cache_size = std::to_string(hcache_size);
2949 }
2950 switch (h.cpu_type) {
2951 case CPU_TYPE_X86_64:
2952 cpu_type = "x86_64";
2953 break;
2954 case CPU_TYPE_ARM64:
2955 cpu_type = "arm64";
2956 break;
2957 default:
2958 break;
2959 }
2960 fprintf(stderr, "CPU: %d * %s\n", h.max_cpus, cpu_type.c_str());
2961 fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
2962 }
2963#elif defined(__FreeBSD__)
2964 int ncpus;
2965 size_t len = sizeof(ncpus);
2966 int mib[2] = {CTL_HW, HW_NCPU};
2967 if (sysctl(mib, 2, &ncpus, &len, nullptr, 0) == 0) {
2968 char cpu_type[16];
2969 len = sizeof(cpu_type) - 1;
2970 mib[1] = HW_MACHINE;
2971 if (sysctl(mib, 2, cpu_type, &len, nullptr, 0) == 0) cpu_type[len] = 0;
2972
2973 fprintf(stderr, "CPU: %d * %s\n", ncpus, cpu_type);
2974 // no programmatic way to get the cache line size except on PPC
2975 }
2976#endif
2977#endif
2978 }
2979
2980 static bool KeyExpired(const TimestampEmulator* timestamp_emulator,
2981 const Slice& key) {
2982 const char* pos = key.data();
2983 pos += 8;
2984 uint64_t timestamp = 0;
2985 if (port::kLittleEndian) {
2986 int bytes_to_fill = 8;
7c673cae
FG
2987 for (int i = 0; i < bytes_to_fill; ++i) {
2988 timestamp |= (static_cast<uint64_t>(static_cast<unsigned char>(pos[i]))
2989 << ((bytes_to_fill - i - 1) << 3));
2990 }
2991 } else {
2992 memcpy(&timestamp, pos, sizeof(timestamp));
2993 }
2994 return timestamp_emulator->Get() - timestamp > FLAGS_time_range;
2995 }
2996
2997 class ExpiredTimeFilter : public CompactionFilter {
2998 public:
2999 explicit ExpiredTimeFilter(
3000 const std::shared_ptr<TimestampEmulator>& timestamp_emulator)
3001 : timestamp_emulator_(timestamp_emulator) {}
11fdf7f2
TL
3002 bool Filter(int /*level*/, const Slice& key,
3003 const Slice& /*existing_value*/, std::string* /*new_value*/,
3004 bool* /*value_changed*/) const override {
7c673cae
FG
3005 return KeyExpired(timestamp_emulator_.get(), key);
3006 }
3007 const char* Name() const override { return "ExpiredTimeFilter"; }
3008
3009 private:
3010 std::shared_ptr<TimestampEmulator> timestamp_emulator_;
3011 };
3012
11fdf7f2
TL
3013 class KeepFilter : public CompactionFilter {
3014 public:
494da23a
TL
3015 bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
3016 std::string* /*new_value*/,
3017 bool* /*value_changed*/) const override {
11fdf7f2
TL
3018 return false;
3019 }
3020
494da23a 3021 const char* Name() const override { return "KeepFilter"; }
11fdf7f2
TL
3022 };
3023
1e59de90
TL
3024 static std::shared_ptr<MemoryAllocator> GetCacheAllocator() {
3025 std::shared_ptr<MemoryAllocator> allocator;
3026
3027 if (FLAGS_use_cache_jemalloc_no_dump_allocator) {
3028 JemallocAllocatorOptions jemalloc_options;
3029 if (!NewJemallocNodumpAllocator(jemalloc_options, &allocator).ok()) {
3030 fprintf(stderr, "JemallocNodumpAllocator not supported.\n");
7c673cae
FG
3031 exit(1);
3032 }
1e59de90 3033 } else if (FLAGS_use_cache_memkind_kmem_allocator) {
20effc67 3034#ifdef MEMKIND
1e59de90 3035 allocator = std::make_shared<MemkindKmemAllocator>();
20effc67 3036#else
1e59de90
TL
3037 fprintf(stderr, "Memkind library is not linked with the binary.\n");
3038 exit(1);
20effc67 3039#endif
1e59de90
TL
3040 }
3041
3042 return allocator;
3043 }
3044
3045 static std::shared_ptr<Cache> NewCache(int64_t capacity) {
3046 if (capacity <= 0) {
3047 return nullptr;
3048 }
3049 if (FLAGS_cache_type == "clock_cache") {
3050 fprintf(stderr, "Old clock cache implementation has been removed.\n");
3051 exit(1);
3052 } else if (FLAGS_cache_type == "hyper_clock_cache") {
3053 return HyperClockCacheOptions(static_cast<size_t>(capacity),
3054 FLAGS_block_size /*estimated_entry_charge*/,
3055 FLAGS_cache_numshardbits)
3056 .MakeSharedCache();
3057 } else if (FLAGS_cache_type == "lru_cache") {
3058 LRUCacheOptions opts(
3059 static_cast<size_t>(capacity), FLAGS_cache_numshardbits,
3060 false /*strict_capacity_limit*/, FLAGS_cache_high_pri_pool_ratio,
3061 GetCacheAllocator(), kDefaultToAdaptiveMutex,
3062 kDefaultCacheMetadataChargePolicy, FLAGS_cache_low_pri_pool_ratio);
3063
3064#ifndef ROCKSDB_LITE
3065 if (!FLAGS_secondary_cache_uri.empty()) {
3066 Status s = SecondaryCache::CreateFromString(
3067 ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache);
3068 if (secondary_cache == nullptr) {
3069 fprintf(
3070 stderr,
3071 "No secondary cache registered matching string: %s status=%s\n",
3072 FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str());
3073 exit(1);
3074 }
3075 opts.secondary_cache = secondary_cache;
20effc67 3076 }
1e59de90
TL
3077#endif // ROCKSDB_LITE
3078
3079 if (FLAGS_use_compressed_secondary_cache) {
3080 CompressedSecondaryCacheOptions secondary_cache_opts;
3081 secondary_cache_opts.capacity = FLAGS_compressed_secondary_cache_size;
3082 secondary_cache_opts.num_shard_bits =
3083 FLAGS_compressed_secondary_cache_numshardbits;
3084 secondary_cache_opts.high_pri_pool_ratio =
3085 FLAGS_compressed_secondary_cache_high_pri_pool_ratio;
3086 secondary_cache_opts.low_pri_pool_ratio =
3087 FLAGS_compressed_secondary_cache_low_pri_pool_ratio;
3088 secondary_cache_opts.compression_type =
3089 FLAGS_compressed_secondary_cache_compression_type_e;
3090 secondary_cache_opts.compress_format_version =
3091 FLAGS_compressed_secondary_cache_compress_format_version;
3092 opts.secondary_cache =
3093 NewCompressedSecondaryCache(secondary_cache_opts);
3094 }
3095
3096 return NewLRUCache(opts);
3097 } else {
3098 fprintf(stderr, "Cache type not supported.");
3099 exit(1);
7c673cae
FG
3100 }
3101 }
3102
3103 public:
3104 Benchmark()
3105 : cache_(NewCache(FLAGS_cache_size)),
3106 compressed_cache_(NewCache(FLAGS_compressed_cache_size)),
1e59de90
TL
3107 prefix_extractor_(FLAGS_prefix_size != 0
3108 ? NewFixedPrefixTransform(FLAGS_prefix_size)
3109 : nullptr),
7c673cae 3110 num_(FLAGS_num),
7c673cae 3111 key_size_(FLAGS_key_size),
20effc67 3112 user_timestamp_size_(FLAGS_user_timestamp_size),
7c673cae 3113 prefix_size_(FLAGS_prefix_size),
1e59de90 3114 total_thread_count_(0),
7c673cae
FG
3115 keys_per_prefix_(FLAGS_keys_per_prefix),
3116 entries_per_batch_(1),
3117 reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
3118 read_random_exp_range_(0.0),
3119 writes_(FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes),
3120 readwrites_(
3121 (FLAGS_writes < 0 && FLAGS_reads < 0)
3122 ? FLAGS_num
3123 : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)),
3124 merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys),
11fdf7f2
TL
3125 report_file_operations_(FLAGS_report_file_operations),
3126#ifndef ROCKSDB_LITE
1e59de90 3127 use_blob_db_(FLAGS_use_blob_db), // Stacked BlobDB
11fdf7f2 3128#else
1e59de90 3129 use_blob_db_(false), // Stacked BlobDB
11fdf7f2 3130#endif // !ROCKSDB_LITE
1e59de90 3131 read_operands_(false) {
7c673cae
FG
3132 // use simcache instead of cache
3133 if (FLAGS_simcache_size >= 0) {
3134 if (FLAGS_cache_numshardbits >= 1) {
3135 cache_ =
3136 NewSimCache(cache_, FLAGS_simcache_size, FLAGS_cache_numshardbits);
3137 } else {
3138 cache_ = NewSimCache(cache_, FLAGS_simcache_size, 0);
3139 }
3140 }
3141
3142 if (report_file_operations_) {
1e59de90
TL
3143 FLAGS_env = new CompositeEnvWrapper(
3144 FLAGS_env,
3145 std::make_shared<CountedFileSystem>(FLAGS_env->GetFileSystem()));
7c673cae
FG
3146 }
3147
3148 if (FLAGS_prefix_size > FLAGS_key_size) {
3149 fprintf(stderr, "prefix size is larger than key size");
3150 exit(1);
3151 }
3152
3153 std::vector<std::string> files;
3154 FLAGS_env->GetChildren(FLAGS_db, &files);
3155 for (size_t i = 0; i < files.size(); i++) {
3156 if (Slice(files[i]).starts_with("heap-")) {
3157 FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
3158 }
3159 }
3160 if (!FLAGS_use_existing_db) {
3161 Options options;
f67539c2 3162 options.env = FLAGS_env;
7c673cae
FG
3163 if (!FLAGS_wal_dir.empty()) {
3164 options.wal_dir = FLAGS_wal_dir;
3165 }
11fdf7f2
TL
3166#ifndef ROCKSDB_LITE
3167 if (use_blob_db_) {
1e59de90 3168 // Stacked BlobDB
11fdf7f2
TL
3169 blob_db::DestroyBlobDB(FLAGS_db, options, blob_db::BlobDBOptions());
3170 }
3171#endif // !ROCKSDB_LITE
7c673cae
FG
3172 DestroyDB(FLAGS_db, options);
3173 if (!FLAGS_wal_dir.empty()) {
3174 FLAGS_env->DeleteDir(FLAGS_wal_dir);
3175 }
3176
3177 if (FLAGS_num_multi_db > 1) {
3178 FLAGS_env->CreateDir(FLAGS_db);
3179 if (!FLAGS_wal_dir.empty()) {
3180 FLAGS_env->CreateDir(FLAGS_wal_dir);
3181 }
3182 }
3183 }
11fdf7f2
TL
3184
3185 listener_.reset(new ErrorHandlerListener());
20effc67
TL
3186 if (user_timestamp_size_ > 0) {
3187 mock_app_clock_.reset(new TimestampEmulator());
3188 }
7c673cae
FG
3189 }
3190
1e59de90 3191 void DeleteDBs() {
7c673cae 3192 db_.DeleteDBs();
1e59de90
TL
3193 for (const DBWithColumnFamilies& dbwcf : multi_dbs_) {
3194 delete dbwcf.db;
3195 }
3196 }
3197
3198 ~Benchmark() {
3199 DeleteDBs();
7c673cae 3200 if (cache_.get() != nullptr) {
1e59de90
TL
3201 // Clear cache reference first
3202 open_options_.write_buffer_manager.reset();
7c673cae
FG
3203 // this will leak, but we're shutting down so nobody cares
3204 cache_->DisownData();
3205 }
3206 }
3207
3208 Slice AllocateKey(std::unique_ptr<const char[]>* key_guard) {
3209 char* data = new char[key_size_];
3210 const char* const_data = data;
3211 key_guard->reset(const_data);
3212 return Slice(key_guard->get(), key_size_);
3213 }
3214
3215 // Generate key according to the given specification and random number.
20effc67
TL
3216 // The resulting key will have the following format:
3217 // - If keys_per_prefix_ is positive, extra trailing bytes are either cut
3218 // off or padded with '0'.
3219 // The prefix value is derived from key value.
3220 // ----------------------------
3221 // | prefix 00000 | key 00000 |
3222 // ----------------------------
3223 //
3224 // - If keys_per_prefix_ is 0, the key is simply a binary representation of
3225 // random number followed by trailing '0's
3226 // ----------------------------
3227 // | key 00000 |
3228 // ----------------------------
7c673cae 3229 void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) {
494da23a
TL
3230 if (!keys_.empty()) {
3231 assert(FLAGS_use_existing_keys);
3232 assert(keys_.size() == static_cast<size_t>(num_keys));
3233 assert(v < static_cast<uint64_t>(num_keys));
3234 *key = keys_[v];
3235 return;
3236 }
7c673cae
FG
3237 char* start = const_cast<char*>(key->data());
3238 char* pos = start;
3239 if (keys_per_prefix_ > 0) {
3240 int64_t num_prefix = num_keys / keys_per_prefix_;
3241 int64_t prefix = v % num_prefix;
3242 int bytes_to_fill = std::min(prefix_size_, 8);
3243 if (port::kLittleEndian) {
3244 for (int i = 0; i < bytes_to_fill; ++i) {
3245 pos[i] = (prefix >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
3246 }
3247 } else {
3248 memcpy(pos, static_cast<void*>(&prefix), bytes_to_fill);
3249 }
3250 if (prefix_size_ > 8) {
3251 // fill the rest with 0s
3252 memset(pos + 8, '0', prefix_size_ - 8);
3253 }
3254 pos += prefix_size_;
3255 }
3256
3257 int bytes_to_fill = std::min(key_size_ - static_cast<int>(pos - start), 8);
3258 if (port::kLittleEndian) {
3259 for (int i = 0; i < bytes_to_fill; ++i) {
3260 pos[i] = (v >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
3261 }
3262 } else {
3263 memcpy(pos, static_cast<void*>(&v), bytes_to_fill);
3264 }
3265 pos += bytes_to_fill;
3266 if (key_size_ > pos - start) {
3267 memset(pos, '0', key_size_ - (pos - start));
3268 }
3269 }
3270
f67539c2
TL
3271 void GenerateKeyFromIntForSeek(uint64_t v, int64_t num_keys, Slice* key) {
3272 GenerateKeyFromInt(v, num_keys, key);
3273 if (FLAGS_seek_missing_prefix) {
3274 assert(prefix_size_ > 8);
3275 char* key_ptr = const_cast<char*>(key->data());
3276 // This rely on GenerateKeyFromInt filling paddings with '0's.
3277 // Putting a '1' will create a non-existing prefix.
3278 key_ptr[8] = '1';
3279 }
3280 }
3281
7c673cae
FG
3282 std::string GetPathForMultiple(std::string base_name, size_t id) {
3283 if (!base_name.empty()) {
3284#ifndef OS_WIN
3285 if (base_name.back() != '/') {
3286 base_name += '/';
3287 }
3288#else
3289 if (base_name.back() != '\\') {
3290 base_name += '\\';
3291 }
3292#endif
3293 }
1e59de90 3294 return base_name + std::to_string(id);
7c673cae
FG
3295 }
3296
f67539c2
TL
3297 void VerifyDBFromDB(std::string& truth_db_name) {
3298 DBWithColumnFamilies truth_db;
3299 auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db);
3300 if (!s.ok()) {
3301 fprintf(stderr, "open error: %s\n", s.ToString().c_str());
3302 exit(1);
3303 }
3304 ReadOptions ro;
3305 ro.total_order_seek = true;
3306 std::unique_ptr<Iterator> truth_iter(truth_db.db->NewIterator(ro));
3307 std::unique_ptr<Iterator> db_iter(db_.db->NewIterator(ro));
3308 // Verify that all the key/values in truth_db are retrivable in db with
3309 // ::Get
3310 fprintf(stderr, "Verifying db >= truth_db with ::Get...\n");
3311 for (truth_iter->SeekToFirst(); truth_iter->Valid(); truth_iter->Next()) {
7c673cae
FG
3312 std::string value;
3313 s = db_.db->Get(ro, truth_iter->key(), &value);
3314 assert(s.ok());
3315 // TODO(myabandeh): provide debugging hints
3316 assert(Slice(value) == truth_iter->value());
f67539c2
TL
3317 }
3318 // Verify that the db iterator does not give any extra key/value
3319 fprintf(stderr, "Verifying db == truth_db...\n");
3320 for (db_iter->SeekToFirst(), truth_iter->SeekToFirst(); db_iter->Valid();
3321 db_iter->Next(), truth_iter->Next()) {
3322 assert(truth_iter->Valid());
3323 assert(truth_iter->value() == db_iter->value());
3324 }
3325 // No more key should be left unchecked in truth_db
3326 assert(!truth_iter->Valid());
3327 fprintf(stderr, "...Verified\n");
7c673cae 3328 }
7c673cae 3329
20effc67 3330 void ErrorExit() {
1e59de90 3331 DeleteDBs();
20effc67
TL
3332 exit(1);
3333 }
3334
7c673cae
FG
3335 void Run() {
3336 if (!SanityCheck()) {
20effc67 3337 ErrorExit();
7c673cae
FG
3338 }
3339 Open(&open_options_);
1e59de90 3340 PrintHeader(open_options_);
7c673cae
FG
3341 std::stringstream benchmark_stream(FLAGS_benchmarks);
3342 std::string name;
3343 std::unique_ptr<ExpiredTimeFilter> filter;
3344 while (std::getline(benchmark_stream, name, ',')) {
3345 // Sanitize parameters
3346 num_ = FLAGS_num;
3347 reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
3348 writes_ = (FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes);
3349 deletes_ = (FLAGS_deletes < 0 ? FLAGS_num : FLAGS_deletes);
f67539c2 3350 value_size = FLAGS_value_size;
7c673cae
FG
3351 key_size_ = FLAGS_key_size;
3352 entries_per_batch_ = FLAGS_batch_size;
494da23a 3353 writes_before_delete_range_ = FLAGS_writes_before_delete_range;
7c673cae
FG
3354 writes_per_range_tombstone_ = FLAGS_writes_per_range_tombstone;
3355 range_tombstone_width_ = FLAGS_range_tombstone_width;
3356 max_num_range_tombstones_ = FLAGS_max_num_range_tombstones;
3357 write_options_ = WriteOptions();
3358 read_random_exp_range_ = FLAGS_read_random_exp_range;
3359 if (FLAGS_sync) {
3360 write_options_.sync = true;
3361 }
3362 write_options_.disableWAL = FLAGS_disable_wal;
1e59de90
TL
3363 write_options_.rate_limiter_priority =
3364 FLAGS_rate_limit_auto_wal_flush ? Env::IO_USER : Env::IO_TOTAL;
3365 read_options_ = ReadOptions(FLAGS_verify_checksum, true);
3366 read_options_.total_order_seek = FLAGS_total_order_seek;
3367 read_options_.prefix_same_as_start = FLAGS_prefix_same_as_start;
3368 read_options_.rate_limiter_priority =
3369 FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
3370 read_options_.tailing = FLAGS_use_tailing_iterator;
3371 read_options_.readahead_size = FLAGS_readahead_size;
3372 read_options_.adaptive_readahead = FLAGS_adaptive_readahead;
3373 read_options_.async_io = FLAGS_async_io;
3374 read_options_.optimize_multiget_for_io = FLAGS_optimize_multiget_for_io;
7c673cae
FG
3375
3376 void (Benchmark::*method)(ThreadState*) = nullptr;
3377 void (Benchmark::*post_process_method)() = nullptr;
3378
3379 bool fresh_db = false;
3380 int num_threads = FLAGS_threads;
3381
3382 int num_repeat = 1;
3383 int num_warmup = 0;
3384 if (!name.empty() && *name.rbegin() == ']') {
3385 auto it = name.find('[');
3386 if (it == std::string::npos) {
3387 fprintf(stderr, "unknown benchmark arguments '%s'\n", name.c_str());
20effc67 3388 ErrorExit();
7c673cae
FG
3389 }
3390 std::string args = name.substr(it + 1);
3391 args.resize(args.size() - 1);
3392 name.resize(it);
3393
3394 std::string bench_arg;
3395 std::stringstream args_stream(args);
3396 while (std::getline(args_stream, bench_arg, '-')) {
3397 if (bench_arg.empty()) {
3398 continue;
3399 }
3400 if (bench_arg[0] == 'X') {
3401 // Repeat the benchmark n times
3402 std::string num_str = bench_arg.substr(1);
3403 num_repeat = std::stoi(num_str);
3404 } else if (bench_arg[0] == 'W') {
3405 // Warm up the benchmark for n times
3406 std::string num_str = bench_arg.substr(1);
3407 num_warmup = std::stoi(num_str);
3408 }
3409 }
3410 }
3411
3412 // Both fillseqdeterministic and filluniquerandomdeterministic
3413 // fill the levels except the max level with UNIQUE_RANDOM
3414 // and fill the max level with fillseq and filluniquerandom, respectively
3415 if (name == "fillseqdeterministic" ||
3416 name == "filluniquerandomdeterministic") {
3417 if (!FLAGS_disable_auto_compactions) {
3418 fprintf(stderr,
3419 "Please disable_auto_compactions in FillDeterministic "
3420 "benchmark\n");
20effc67 3421 ErrorExit();
7c673cae
FG
3422 }
3423 if (num_threads > 1) {
3424 fprintf(stderr,
3425 "filldeterministic multithreaded not supported"
3426 ", use 1 thread\n");
3427 num_threads = 1;
3428 }
3429 fresh_db = true;
3430 if (name == "fillseqdeterministic") {
3431 method = &Benchmark::WriteSeqDeterministic;
3432 } else {
3433 method = &Benchmark::WriteUniqueRandomDeterministic;
3434 }
3435 } else if (name == "fillseq") {
3436 fresh_db = true;
3437 method = &Benchmark::WriteSeq;
3438 } else if (name == "fillbatch") {
3439 fresh_db = true;
3440 entries_per_batch_ = 1000;
3441 method = &Benchmark::WriteSeq;
3442 } else if (name == "fillrandom") {
3443 fresh_db = true;
3444 method = &Benchmark::WriteRandom;
1e59de90
TL
3445 } else if (name == "filluniquerandom" ||
3446 name == "fillanddeleteuniquerandom") {
7c673cae
FG
3447 fresh_db = true;
3448 if (num_threads > 1) {
3449 fprintf(stderr,
1e59de90
TL
3450 "filluniquerandom and fillanddeleteuniquerandom "
3451 "multithreaded not supported, use 1 thread");
7c673cae
FG
3452 num_threads = 1;
3453 }
3454 method = &Benchmark::WriteUniqueRandom;
3455 } else if (name == "overwrite") {
3456 method = &Benchmark::WriteRandom;
3457 } else if (name == "fillsync") {
3458 fresh_db = true;
3459 num_ /= 1000;
3460 write_options_.sync = true;
3461 method = &Benchmark::WriteRandom;
3462 } else if (name == "fill100K") {
3463 fresh_db = true;
3464 num_ /= 1000;
f67539c2 3465 value_size = 100 * 1000;
7c673cae
FG
3466 method = &Benchmark::WriteRandom;
3467 } else if (name == "readseq") {
3468 method = &Benchmark::ReadSequential;
f67539c2
TL
3469 } else if (name == "readtorowcache") {
3470 if (!FLAGS_use_existing_keys || !FLAGS_row_cache_size) {
3471 fprintf(stderr,
3472 "Please set use_existing_keys to true and specify a "
3473 "row cache size in readtorowcache benchmark\n");
20effc67 3474 ErrorExit();
f67539c2
TL
3475 }
3476 method = &Benchmark::ReadToRowCache;
7c673cae
FG
3477 } else if (name == "readtocache") {
3478 method = &Benchmark::ReadSequential;
3479 num_threads = 1;
3480 reads_ = num_;
3481 } else if (name == "readreverse") {
3482 method = &Benchmark::ReadReverse;
3483 } else if (name == "readrandom") {
f67539c2
TL
3484 if (FLAGS_multiread_stride) {
3485 fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
3486 entries_per_batch_);
3487 }
7c673cae
FG
3488 method = &Benchmark::ReadRandom;
3489 } else if (name == "readrandomfast") {
3490 method = &Benchmark::ReadRandomFast;
3491 } else if (name == "multireadrandom") {
3492 fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
3493 entries_per_batch_);
3494 method = &Benchmark::MultiReadRandom;
1e59de90
TL
3495 } else if (name == "multireadwhilewriting") {
3496 fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
3497 entries_per_batch_);
3498 num_threads++;
3499 method = &Benchmark::MultiReadWhileWriting;
20effc67
TL
3500 } else if (name == "approximatesizerandom") {
3501 fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
3502 entries_per_batch_);
3503 method = &Benchmark::ApproximateSizeRandom;
494da23a
TL
3504 } else if (name == "mixgraph") {
3505 method = &Benchmark::MixGraph;
7c673cae
FG
3506 } else if (name == "readmissing") {
3507 ++key_size_;
3508 method = &Benchmark::ReadRandom;
3509 } else if (name == "newiterator") {
3510 method = &Benchmark::IteratorCreation;
3511 } else if (name == "newiteratorwhilewriting") {
3512 num_threads++; // Add extra thread for writing
3513 method = &Benchmark::IteratorCreationWhileWriting;
3514 } else if (name == "seekrandom") {
3515 method = &Benchmark::SeekRandom;
3516 } else if (name == "seekrandomwhilewriting") {
3517 num_threads++; // Add extra thread for writing
3518 method = &Benchmark::SeekRandomWhileWriting;
3519 } else if (name == "seekrandomwhilemerging") {
3520 num_threads++; // Add extra thread for merging
3521 method = &Benchmark::SeekRandomWhileMerging;
3522 } else if (name == "readrandomsmall") {
3523 reads_ /= 1000;
3524 method = &Benchmark::ReadRandom;
3525 } else if (name == "deleteseq") {
3526 method = &Benchmark::DeleteSeq;
3527 } else if (name == "deleterandom") {
3528 method = &Benchmark::DeleteRandom;
3529 } else if (name == "readwhilewriting") {
3530 num_threads++; // Add extra thread for writing
3531 method = &Benchmark::ReadWhileWriting;
3532 } else if (name == "readwhilemerging") {
3533 num_threads++; // Add extra thread for writing
3534 method = &Benchmark::ReadWhileMerging;
11fdf7f2
TL
3535 } else if (name == "readwhilescanning") {
3536 num_threads++; // Add extra thread for scaning
3537 method = &Benchmark::ReadWhileScanning;
7c673cae
FG
3538 } else if (name == "readrandomwriterandom") {
3539 method = &Benchmark::ReadRandomWriteRandom;
3540 } else if (name == "readrandommergerandom") {
3541 if (FLAGS_merge_operator.empty()) {
3542 fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
3543 name.c_str());
20effc67 3544 ErrorExit();
7c673cae
FG
3545 }
3546 method = &Benchmark::ReadRandomMergeRandom;
3547 } else if (name == "updaterandom") {
3548 method = &Benchmark::UpdateRandom;
11fdf7f2
TL
3549 } else if (name == "xorupdaterandom") {
3550 method = &Benchmark::XORUpdateRandom;
7c673cae
FG
3551 } else if (name == "appendrandom") {
3552 method = &Benchmark::AppendRandom;
3553 } else if (name == "mergerandom") {
3554 if (FLAGS_merge_operator.empty()) {
3555 fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
3556 name.c_str());
3557 exit(1);
3558 }
3559 method = &Benchmark::MergeRandom;
3560 } else if (name == "randomwithverify") {
3561 method = &Benchmark::RandomWithVerify;
3562 } else if (name == "fillseekseq") {
3563 method = &Benchmark::WriteSeqSeekSeq;
3564 } else if (name == "compact") {
3565 method = &Benchmark::Compact;
11fdf7f2
TL
3566 } else if (name == "compactall") {
3567 CompactAll();
1e59de90
TL
3568#ifndef ROCKSDB_LITE
3569 } else if (name == "compact0") {
3570 CompactLevel(0);
3571 } else if (name == "compact1") {
3572 CompactLevel(1);
3573 } else if (name == "waitforcompaction") {
3574 WaitForCompaction();
3575#endif
3576 } else if (name == "flush") {
3577 Flush();
7c673cae
FG
3578 } else if (name == "crc32c") {
3579 method = &Benchmark::Crc32c;
3580 } else if (name == "xxhash") {
3581 method = &Benchmark::xxHash;
1e59de90
TL
3582 } else if (name == "xxhash64") {
3583 method = &Benchmark::xxHash64;
3584 } else if (name == "xxh3") {
3585 method = &Benchmark::xxh3;
7c673cae
FG
3586 } else if (name == "acquireload") {
3587 method = &Benchmark::AcquireLoad;
3588 } else if (name == "compress") {
3589 method = &Benchmark::Compress;
3590 } else if (name == "uncompress") {
3591 method = &Benchmark::Uncompress;
3592#ifndef ROCKSDB_LITE
3593 } else if (name == "randomtransaction") {
3594 method = &Benchmark::RandomTransaction;
3595 post_process_method = &Benchmark::RandomTransactionVerify;
3596#endif // ROCKSDB_LITE
3597 } else if (name == "randomreplacekeys") {
3598 fresh_db = true;
3599 method = &Benchmark::RandomReplaceKeys;
3600 } else if (name == "timeseries") {
3601 timestamp_emulator_.reset(new TimestampEmulator());
3602 if (FLAGS_expire_style == "compaction_filter") {
3603 filter.reset(new ExpiredTimeFilter(timestamp_emulator_));
3604 fprintf(stdout, "Compaction filter is used to remove expired data");
3605 open_options_.compaction_filter = filter.get();
3606 }
3607 fresh_db = true;
3608 method = &Benchmark::TimeSeries;
1e59de90
TL
3609 } else if (name == "block_cache_entry_stats") {
3610 // DB::Properties::kBlockCacheEntryStats
3611 PrintStats("rocksdb.block-cache-entry-stats");
7c673cae
FG
3612 } else if (name == "stats") {
3613 PrintStats("rocksdb.stats");
3614 } else if (name == "resetstats") {
3615 ResetStats();
3616 } else if (name == "verify") {
3617 VerifyDBFromDB(FLAGS_truth_db);
3618 } else if (name == "levelstats") {
3619 PrintStats("rocksdb.levelstats");
1e59de90
TL
3620 } else if (name == "memstats") {
3621 std::vector<std::string> keys{"rocksdb.num-immutable-mem-table",
3622 "rocksdb.cur-size-active-mem-table",
3623 "rocksdb.cur-size-all-mem-tables",
3624 "rocksdb.size-all-mem-tables",
3625 "rocksdb.num-entries-active-mem-table",
3626 "rocksdb.num-entries-imm-mem-tables"};
3627 PrintStats(keys);
7c673cae
FG
3628 } else if (name == "sstables") {
3629 PrintStats("rocksdb.sstables");
f67539c2
TL
3630 } else if (name == "stats_history") {
3631 PrintStatsHistory();
1e59de90 3632#ifndef ROCKSDB_LITE
11fdf7f2
TL
3633 } else if (name == "replay") {
3634 if (num_threads > 1) {
3635 fprintf(stderr, "Multi-threaded replay is not yet supported\n");
20effc67 3636 ErrorExit();
11fdf7f2
TL
3637 }
3638 if (FLAGS_trace_file == "") {
3639 fprintf(stderr, "Please set --trace_file to be replayed from\n");
20effc67 3640 ErrorExit();
11fdf7f2
TL
3641 }
3642 method = &Benchmark::Replay;
1e59de90 3643#endif // ROCKSDB_LITE
f67539c2
TL
3644 } else if (name == "getmergeoperands") {
3645 method = &Benchmark::GetMergeOperands;
1e59de90
TL
3646#ifndef ROCKSDB_LITE
3647 } else if (name == "verifychecksum") {
3648 method = &Benchmark::VerifyChecksum;
3649 } else if (name == "verifyfilechecksums") {
3650 method = &Benchmark::VerifyFileChecksums;
3651#endif // ROCKSDB_LITE
3652 } else if (name == "readrandomoperands") {
3653 read_operands_ = true;
3654 method = &Benchmark::ReadRandom;
3655#ifndef ROCKSDB_LITE
3656 } else if (name == "backup") {
3657 method = &Benchmark::Backup;
3658 } else if (name == "restore") {
3659 method = &Benchmark::Restore;
3660#endif
7c673cae
FG
3661 } else if (!name.empty()) { // No error message for empty name
3662 fprintf(stderr, "unknown benchmark '%s'\n", name.c_str());
20effc67 3663 ErrorExit();
7c673cae
FG
3664 }
3665
3666 if (fresh_db) {
3667 if (FLAGS_use_existing_db) {
3668 fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
3669 name.c_str());
3670 method = nullptr;
3671 } else {
3672 if (db_.db != nullptr) {
3673 db_.DeleteDBs();
3674 DestroyDB(FLAGS_db, open_options_);
3675 }
3676 Options options = open_options_;
3677 for (size_t i = 0; i < multi_dbs_.size(); i++) {
3678 delete multi_dbs_[i].db;
3679 if (!open_options_.wal_dir.empty()) {
3680 options.wal_dir = GetPathForMultiple(open_options_.wal_dir, i);
3681 }
3682 DestroyDB(GetPathForMultiple(FLAGS_db, i), options);
3683 }
3684 multi_dbs_.clear();
3685 }
3686 Open(&open_options_); // use open_options for the last accessed
3687 }
3688
3689 if (method != nullptr) {
3690 fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
11fdf7f2
TL
3691
3692#ifndef ROCKSDB_LITE
1e59de90
TL
3693 if (name == "backup") {
3694 std::cout << "Backup path: [" << FLAGS_backup_dir << "]" << std::endl;
3695 } else if (name == "restore") {
3696 std::cout << "Backup path: [" << FLAGS_backup_dir << "]" << std::endl;
3697 std::cout << "Restore path: [" << FLAGS_restore_dir << "]"
3698 << std::endl;
3699 }
11fdf7f2
TL
3700 // A trace_file option can be provided both for trace and replay
3701 // operations. But db_bench does not support tracing and replaying at
3702 // the same time, for now. So, start tracing only when it is not a
3703 // replay.
3704 if (FLAGS_trace_file != "" && name != "replay") {
3705 std::unique_ptr<TraceWriter> trace_writer;
3706 Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
3707 FLAGS_trace_file, &trace_writer);
3708 if (!s.ok()) {
3709 fprintf(stderr, "Encountered an error starting a trace, %s\n",
3710 s.ToString().c_str());
20effc67 3711 ErrorExit();
11fdf7f2
TL
3712 }
3713 s = db_.db->StartTrace(trace_options_, std::move(trace_writer));
3714 if (!s.ok()) {
3715 fprintf(stderr, "Encountered an error starting a trace, %s\n",
3716 s.ToString().c_str());
20effc67 3717 ErrorExit();
11fdf7f2
TL
3718 }
3719 fprintf(stdout, "Tracing the workload to: [%s]\n",
3720 FLAGS_trace_file.c_str());
3721 }
f67539c2
TL
3722 // Start block cache tracing.
3723 if (!FLAGS_block_cache_trace_file.empty()) {
3724 // Sanity checks.
3725 if (FLAGS_block_cache_trace_sampling_frequency <= 0) {
3726 fprintf(stderr,
3727 "Block cache trace sampling frequency must be higher than "
3728 "0.\n");
20effc67 3729 ErrorExit();
f67539c2
TL
3730 }
3731 if (FLAGS_block_cache_trace_max_trace_file_size_in_bytes <= 0) {
3732 fprintf(stderr,
3733 "The maximum file size for block cache tracing must be "
3734 "higher than 0.\n");
20effc67 3735 ErrorExit();
f67539c2
TL
3736 }
3737 block_cache_trace_options_.max_trace_file_size =
3738 FLAGS_block_cache_trace_max_trace_file_size_in_bytes;
3739 block_cache_trace_options_.sampling_frequency =
3740 FLAGS_block_cache_trace_sampling_frequency;
3741 std::unique_ptr<TraceWriter> block_cache_trace_writer;
3742 Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
3743 FLAGS_block_cache_trace_file,
3744 &block_cache_trace_writer);
3745 if (!s.ok()) {
3746 fprintf(stderr,
3747 "Encountered an error when creating trace writer, %s\n",
3748 s.ToString().c_str());
20effc67 3749 ErrorExit();
f67539c2
TL
3750 }
3751 s = db_.db->StartBlockCacheTrace(block_cache_trace_options_,
3752 std::move(block_cache_trace_writer));
3753 if (!s.ok()) {
3754 fprintf(
3755 stderr,
3756 "Encountered an error when starting block cache tracing, %s\n",
3757 s.ToString().c_str());
20effc67 3758 ErrorExit();
f67539c2
TL
3759 }
3760 fprintf(stdout, "Tracing block cache accesses to: [%s]\n",
3761 FLAGS_block_cache_trace_file.c_str());
3762 }
11fdf7f2
TL
3763#endif // ROCKSDB_LITE
3764
7c673cae
FG
3765 if (num_warmup > 0) {
3766 printf("Warming up benchmark by running %d times\n", num_warmup);
3767 }
3768
3769 for (int i = 0; i < num_warmup; i++) {
3770 RunBenchmark(num_threads, name, method);
3771 }
3772
3773 if (num_repeat > 1) {
3774 printf("Running benchmark for %d times\n", num_repeat);
3775 }
3776
3777 CombinedStats combined_stats;
3778 for (int i = 0; i < num_repeat; i++) {
3779 Stats stats = RunBenchmark(num_threads, name, method);
3780 combined_stats.AddStats(stats);
1e59de90
TL
3781 if (FLAGS_confidence_interval_only) {
3782 combined_stats.ReportWithConfidenceIntervals(name);
3783 } else {
3784 combined_stats.Report(name);
3785 }
7c673cae
FG
3786 }
3787 if (num_repeat > 1) {
1e59de90 3788 combined_stats.ReportFinal(name);
7c673cae
FG
3789 }
3790 }
3791 if (post_process_method != nullptr) {
3792 (this->*post_process_method)();
3793 }
3794 }
11fdf7f2 3795
f67539c2
TL
3796 if (secondary_update_thread_) {
3797 secondary_update_stopped_.store(1, std::memory_order_relaxed);
3798 secondary_update_thread_->join();
3799 secondary_update_thread_.reset();
3800 }
3801
11fdf7f2
TL
3802#ifndef ROCKSDB_LITE
3803 if (name != "replay" && FLAGS_trace_file != "") {
3804 Status s = db_.db->EndTrace();
3805 if (!s.ok()) {
3806 fprintf(stderr, "Encountered an error ending the trace, %s\n",
3807 s.ToString().c_str());
3808 }
3809 }
f67539c2
TL
3810 if (!FLAGS_block_cache_trace_file.empty()) {
3811 Status s = db_.db->EndBlockCacheTrace();
3812 if (!s.ok()) {
3813 fprintf(stderr,
3814 "Encountered an error ending the block cache tracing, %s\n",
3815 s.ToString().c_str());
3816 }
3817 }
11fdf7f2
TL
3818#endif // ROCKSDB_LITE
3819
7c673cae
FG
3820 if (FLAGS_statistics) {
3821 fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
3822 }
3823 if (FLAGS_simcache_size >= 0) {
20effc67
TL
3824 fprintf(
3825 stdout, "SIMULATOR CACHE STATISTICS:\n%s\n",
3826 static_cast_with_check<SimCache>(cache_.get())->ToString().c_str());
7c673cae 3827 }
f67539c2
TL
3828
3829#ifndef ROCKSDB_LITE
3830 if (FLAGS_use_secondary_db) {
3831 fprintf(stdout, "Secondary instance updated %" PRIu64 " times.\n",
3832 secondary_db_updates_);
3833 }
3834#endif // ROCKSDB_LITE
7c673cae
FG
3835 }
3836
3837 private:
3838 std::shared_ptr<TimestampEmulator> timestamp_emulator_;
f67539c2
TL
3839 std::unique_ptr<port::Thread> secondary_update_thread_;
3840 std::atomic<int> secondary_update_stopped_{0};
3841#ifndef ROCKSDB_LITE
3842 uint64_t secondary_db_updates_ = 0;
3843#endif // ROCKSDB_LITE
7c673cae
FG
3844 struct ThreadArg {
3845 Benchmark* bm;
3846 SharedState* shared;
3847 ThreadState* thread;
3848 void (Benchmark::*method)(ThreadState*);
3849 };
3850
3851 static void ThreadBody(void* v) {
3852 ThreadArg* arg = reinterpret_cast<ThreadArg*>(v);
3853 SharedState* shared = arg->shared;
3854 ThreadState* thread = arg->thread;
3855 {
3856 MutexLock l(&shared->mu);
3857 shared->num_initialized++;
3858 if (shared->num_initialized >= shared->total) {
3859 shared->cv.SignalAll();
3860 }
3861 while (!shared->start) {
3862 shared->cv.Wait();
3863 }
3864 }
3865
1e59de90 3866 SetPerfLevel(static_cast<PerfLevel>(shared->perf_level));
494da23a 3867 perf_context.EnablePerLevelPerfContext();
7c673cae
FG
3868 thread->stats.Start(thread->tid);
3869 (arg->bm->*(arg->method))(thread);
1e59de90
TL
3870 if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
3871 thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
3872 get_perf_context()->ToString());
3873 }
7c673cae
FG
3874 thread->stats.Stop();
3875
3876 {
3877 MutexLock l(&shared->mu);
3878 shared->num_done++;
3879 if (shared->num_done >= shared->total) {
3880 shared->cv.SignalAll();
3881 }
3882 }
3883 }
3884
3885 Stats RunBenchmark(int n, Slice name,
3886 void (Benchmark::*method)(ThreadState*)) {
3887 SharedState shared;
3888 shared.total = n;
3889 shared.num_initialized = 0;
3890 shared.num_done = 0;
3891 shared.start = false;
3892 if (FLAGS_benchmark_write_rate_limit > 0) {
3893 shared.write_rate_limiter.reset(
3894 NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
3895 }
3896 if (FLAGS_benchmark_read_rate_limit > 0) {
11fdf7f2
TL
3897 shared.read_rate_limiter.reset(NewGenericRateLimiter(
3898 FLAGS_benchmark_read_rate_limit, 100000 /* refill_period_us */,
3899 10 /* fairness */, RateLimiter::Mode::kReadsOnly));
7c673cae
FG
3900 }
3901
3902 std::unique_ptr<ReporterAgent> reporter_agent;
3903 if (FLAGS_report_interval_seconds > 0) {
3904 reporter_agent.reset(new ReporterAgent(FLAGS_env, FLAGS_report_file,
3905 FLAGS_report_interval_seconds));
3906 }
3907
3908 ThreadArg* arg = new ThreadArg[n];
3909
3910 for (int i = 0; i < n; i++) {
3911#ifdef NUMA
3912 if (FLAGS_enable_numa) {
3913 // Performs a local allocation of memory to threads in numa node.
3914 int n_nodes = numa_num_task_nodes(); // Number of nodes in NUMA.
3915 numa_exit_on_error = 1;
3916 int numa_node = i % n_nodes;
3917 bitmask* nodes = numa_allocate_nodemask();
3918 numa_bitmask_clearall(nodes);
3919 numa_bitmask_setbit(nodes, numa_node);
3920 // numa_bind() call binds the process to the node and these
3921 // properties are passed on to the thread that is created in
3922 // StartThread method called later in the loop.
3923 numa_bind(nodes);
3924 numa_set_strict(1);
3925 numa_free_nodemask(nodes);
3926 }
3927#endif
3928 arg[i].bm = this;
3929 arg[i].method = method;
3930 arg[i].shared = &shared;
1e59de90
TL
3931 total_thread_count_++;
3932 arg[i].thread = new ThreadState(i, total_thread_count_);
7c673cae
FG
3933 arg[i].thread->stats.SetReporterAgent(reporter_agent.get());
3934 arg[i].thread->shared = &shared;
3935 FLAGS_env->StartThread(ThreadBody, &arg[i]);
3936 }
3937
3938 shared.mu.Lock();
3939 while (shared.num_initialized < n) {
3940 shared.cv.Wait();
3941 }
3942
3943 shared.start = true;
3944 shared.cv.SignalAll();
3945 while (shared.num_done < n) {
3946 shared.cv.Wait();
3947 }
3948 shared.mu.Unlock();
3949
3950 // Stats for some threads can be excluded.
3951 Stats merge_stats;
3952 for (int i = 0; i < n; i++) {
3953 merge_stats.Merge(arg[i].thread->stats);
3954 }
3955 merge_stats.Report(name);
3956
3957 for (int i = 0; i < n; i++) {
3958 delete arg[i].thread;
3959 }
3960 delete[] arg;
3961
3962 return merge_stats;
3963 }
3964
1e59de90
TL
3965 template <OperationType kOpType, typename FnType, typename... Args>
3966 static inline void ChecksumBenchmark(FnType fn, ThreadState* thread,
3967 Args... args) {
3968 const int size = FLAGS_block_size; // use --block_size option for db_bench
3969 std::string labels = "(" + std::to_string(FLAGS_block_size) + " per op)";
11fdf7f2
TL
3970 const char* label = labels.c_str();
3971
7c673cae 3972 std::string data(size, 'x');
1e59de90
TL
3973 uint64_t bytes = 0;
3974 uint32_t val = 0;
3975 while (bytes < 5000U * uint64_t{1048576}) { // ~5GB
3976 val += static_cast<uint32_t>(fn(data.data(), size, args...));
3977 thread->stats.FinishedOps(nullptr, nullptr, 1, kOpType);
7c673cae
FG
3978 bytes += size;
3979 }
3980 // Print so result is not dead
1e59de90 3981 fprintf(stderr, "... val=0x%x\r", static_cast<unsigned int>(val));
7c673cae
FG
3982
3983 thread->stats.AddBytes(bytes);
3984 thread->stats.AddMessage(label);
3985 }
3986
1e59de90
TL
3987 void Crc32c(ThreadState* thread) {
3988 ChecksumBenchmark<kCrc>(crc32c::Value, thread);
3989 }
3990
7c673cae 3991 void xxHash(ThreadState* thread) {
1e59de90
TL
3992 ChecksumBenchmark<kHash>(XXH32, thread, /*seed*/ 0);
3993 }
7c673cae 3994
1e59de90
TL
3995 void xxHash64(ThreadState* thread) {
3996 ChecksumBenchmark<kHash>(XXH64, thread, /*seed*/ 0);
3997 }
3998
3999 void xxh3(ThreadState* thread) {
4000 ChecksumBenchmark<kHash>(XXH3_64bits, thread);
7c673cae
FG
4001 }
4002
4003 void AcquireLoad(ThreadState* thread) {
4004 int dummy;
4005 std::atomic<void*> ap(&dummy);
4006 int count = 0;
1e59de90 4007 void* ptr = nullptr;
7c673cae
FG
4008 thread->stats.AddMessage("(each op is 1000 loads)");
4009 while (count < 100000) {
4010 for (int i = 0; i < 1000; i++) {
4011 ptr = ap.load(std::memory_order_acquire);
4012 }
4013 count++;
4014 thread->stats.FinishedOps(nullptr, nullptr, 1, kOthers);
4015 }
4016 if (ptr == nullptr) exit(1); // Disable unused variable warning.
4017 }
4018
1e59de90 4019 void Compress(ThreadState* thread) {
7c673cae
FG
4020 RandomGenerator gen;
4021 Slice input = gen.Generate(FLAGS_block_size);
4022 int64_t bytes = 0;
4023 int64_t produced = 0;
4024 bool ok = true;
4025 std::string compressed;
494da23a
TL
4026 CompressionOptions opts;
4027 CompressionContext context(FLAGS_compression_type_e);
4028 CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
4029 FLAGS_compression_type_e,
4030 FLAGS_sample_for_compression);
7c673cae
FG
4031 // Compress 1G
4032 while (ok && bytes < int64_t(1) << 30) {
4033 compressed.clear();
494da23a 4034 ok = CompressSlice(info, input, &compressed);
7c673cae
FG
4035 produced += compressed.size();
4036 bytes += input.size();
4037 thread->stats.FinishedOps(nullptr, nullptr, 1, kCompress);
4038 }
4039
4040 if (!ok) {
4041 thread->stats.AddMessage("(compression failure)");
4042 } else {
4043 char buf[340];
4044 snprintf(buf, sizeof(buf), "(output: %.1f%%)",
4045 (produced * 100.0) / bytes);
4046 thread->stats.AddMessage(buf);
4047 thread->stats.AddBytes(bytes);
4048 }
4049 }
4050
1e59de90 4051 void Uncompress(ThreadState* thread) {
7c673cae
FG
4052 RandomGenerator gen;
4053 Slice input = gen.Generate(FLAGS_block_size);
4054 std::string compressed;
4055
494da23a
TL
4056 CompressionContext compression_ctx(FLAGS_compression_type_e);
4057 CompressionOptions compression_opts;
4058 CompressionInfo compression_info(
4059 compression_opts, compression_ctx, CompressionDict::GetEmptyDict(),
4060 FLAGS_compression_type_e, FLAGS_sample_for_compression);
11fdf7f2 4061 UncompressionContext uncompression_ctx(FLAGS_compression_type_e);
494da23a
TL
4062 UncompressionInfo uncompression_info(uncompression_ctx,
4063 UncompressionDict::GetEmptyDict(),
4064 FLAGS_compression_type_e);
11fdf7f2 4065
494da23a 4066 bool ok = CompressSlice(compression_info, input, &compressed);
7c673cae 4067 int64_t bytes = 0;
20effc67 4068 size_t uncompressed_size = 0;
7c673cae 4069 while (ok && bytes < 1024 * 1048576) {
20effc67
TL
4070 constexpr uint32_t compress_format_version = 2;
4071
4072 CacheAllocationPtr uncompressed = UncompressData(
4073 uncompression_info, compressed.data(), compressed.size(),
4074 &uncompressed_size, compress_format_version);
4075
4076 ok = uncompressed.get() != nullptr;
7c673cae
FG
4077 bytes += input.size();
4078 thread->stats.FinishedOps(nullptr, nullptr, 1, kUncompress);
4079 }
4080
4081 if (!ok) {
4082 thread->stats.AddMessage("(compression failure)");
4083 } else {
4084 thread->stats.AddBytes(bytes);
4085 }
4086 }
4087
4088 // Returns true if the options is initialized from the specified
4089 // options file.
4090 bool InitializeOptionsFromFile(Options* opts) {
4091#ifndef ROCKSDB_LITE
4092 printf("Initializing RocksDB Options from the specified file\n");
4093 DBOptions db_opts;
4094 std::vector<ColumnFamilyDescriptor> cf_descs;
4095 if (FLAGS_options_file != "") {
f67539c2 4096 auto s = LoadOptionsFromFile(FLAGS_options_file, FLAGS_env, &db_opts,
7c673cae 4097 &cf_descs);
f67539c2 4098 db_opts.env = FLAGS_env;
7c673cae
FG
4099 if (s.ok()) {
4100 *opts = Options(db_opts, cf_descs[0].options);
4101 return true;
4102 }
4103 fprintf(stderr, "Unable to load options file %s --- %s\n",
4104 FLAGS_options_file.c_str(), s.ToString().c_str());
4105 exit(1);
4106 }
11fdf7f2
TL
4107#else
4108 (void)opts;
7c673cae
FG
4109#endif
4110 return false;
4111 }
4112
  // Populate *opts from the db_bench command-line flags. Called only when no
  // --options_file was supplied (see Open()). Wires in shared objects owned
  // by this Benchmark (cache_, compressed_cache_, prefix_extractor_) and
  // validates flag combinations, calling exit(1) on fatal misconfiguration.
  // Some validations below intentionally print a warning without exiting.
  void InitializeOptionsFromFlags(Options* opts) {
    printf("Initializing RocksDB Options from command-line flags\n");
    Options& options = *opts;
    ConfigOptions config_options(options);
    config_options.ignore_unsupported_options = false;

    // The DB must not have been opened yet; this function only builds options.
    assert(db_.db == nullptr);

    options.env = FLAGS_env;
    options.wal_dir = FLAGS_wal_dir;
    options.dump_malloc_stats = FLAGS_dump_malloc_stats;
    // Statistics dump/persist cadence and history buffer.
    options.stats_dump_period_sec =
        static_cast<unsigned int>(FLAGS_stats_dump_period_sec);
    options.stats_persist_period_sec =
        static_cast<unsigned int>(FLAGS_stats_persist_period_sec);
    options.persist_stats_to_disk = FLAGS_persist_stats_to_disk;
    options.stats_history_buffer_size =
        static_cast<size_t>(FLAGS_stats_history_buffer_size);
    options.avoid_flush_during_recovery = FLAGS_avoid_flush_during_recovery;

    // Compression tuning (dictionary sizes, zstd training, parallelism).
    options.compression_opts.level = FLAGS_compression_level;
    options.compression_opts.max_dict_bytes = FLAGS_compression_max_dict_bytes;
    options.compression_opts.zstd_max_train_bytes =
        FLAGS_compression_zstd_max_train_bytes;
    options.compression_opts.parallel_threads =
        FLAGS_compression_parallel_threads;
    options.compression_opts.max_dict_buffer_bytes =
        FLAGS_compression_max_dict_buffer_bytes;
    options.compression_opts.use_zstd_dict_trainer =
        FLAGS_compression_use_zstd_dict_trainer;

    options.max_open_files = FLAGS_open_files;
    // A WriteBufferManager is created either to enforce a global write
    // buffer limit or to charge memtable memory to the block cache.
    if (FLAGS_cost_write_buffer_to_cache || FLAGS_db_write_buffer_size != 0) {
      options.write_buffer_manager.reset(
          new WriteBufferManager(FLAGS_db_write_buffer_size, cache_));
    }
    // Memtable / write-buffer sizing and background work limits.
    options.arena_block_size = FLAGS_arena_block_size;
    options.write_buffer_size = FLAGS_write_buffer_size;
    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
    options.min_write_buffer_number_to_merge =
        FLAGS_min_write_buffer_number_to_merge;
    options.max_write_buffer_number_to_maintain =
        FLAGS_max_write_buffer_number_to_maintain;
    options.max_write_buffer_size_to_maintain =
        FLAGS_max_write_buffer_size_to_maintain;
    options.max_background_jobs = FLAGS_max_background_jobs;
    options.max_background_compactions = FLAGS_max_background_compactions;
    options.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
    options.max_background_flushes = FLAGS_max_background_flushes;
    options.compaction_style = FLAGS_compaction_style_e;
    options.compaction_pri = FLAGS_compaction_pri_e;
    options.allow_mmap_reads = FLAGS_mmap_read;
    options.allow_mmap_writes = FLAGS_mmap_write;
    options.use_direct_reads = FLAGS_use_direct_reads;
    options.use_direct_io_for_flush_and_compaction =
        FLAGS_use_direct_io_for_flush_and_compaction;
    options.manual_wal_flush = FLAGS_manual_wal_flush;
    options.wal_compression = FLAGS_wal_compression_e;
#ifndef ROCKSDB_LITE
    // FIFO compaction configuration (not available in LITE builds).
    options.ttl = FLAGS_fifo_compaction_ttl;
    options.compaction_options_fifo = CompactionOptionsFIFO(
        FLAGS_fifo_compaction_max_table_files_size_mb * 1024 * 1024,
        FLAGS_fifo_compaction_allow_compaction);
    options.compaction_options_fifo.age_for_warm = FLAGS_fifo_age_for_warm;
#endif  // ROCKSDB_LITE
    options.prefix_extractor = prefix_extractor_;
    if (FLAGS_use_uint64_comparator) {
      options.comparator = test::Uint64Comparator();
      // The uint64 comparator interprets keys as 8-byte integers.
      if (FLAGS_key_size != 8) {
        fprintf(stderr, "Using Uint64 comparator but key size is not 8.\n");
        exit(1);
      }
    }
    if (FLAGS_use_stderr_info_logger) {
      options.info_log.reset(new StderrLogger());
    }
    // 2MB huge pages for memtables when requested; 0 disables.
    options.memtable_huge_page_size = FLAGS_memtable_use_huge_page ? 2048 : 0;
    options.memtable_prefix_bloom_size_ratio = FLAGS_memtable_bloom_size_ratio;
    options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering;
    if (FLAGS_memtable_insert_with_hint_prefix_size > 0) {
      options.memtable_insert_with_hint_prefix_extractor.reset(
          NewCappedPrefixTransform(
              FLAGS_memtable_insert_with_hint_prefix_size));
    }
    options.bloom_locality = FLAGS_bloom_locality;
    options.max_file_opening_threads = FLAGS_file_opening_threads;
    options.compaction_readahead_size = FLAGS_compaction_readahead_size;
    options.log_readahead_size = FLAGS_log_readahead_size;
    options.random_access_max_buffer_size = FLAGS_random_access_max_buffer_size;
    options.writable_file_max_buffer_size = FLAGS_writable_file_max_buffer_size;
    options.use_fsync = FLAGS_use_fsync;
    // LSM shape: level count, file sizes, level size targets.
    options.num_levels = FLAGS_num_levels;
    options.target_file_size_base = FLAGS_target_file_size_base;
    options.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
    options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
    options.level_compaction_dynamic_level_bytes =
        FLAGS_level_compaction_dynamic_level_bytes;
    options.max_bytes_for_level_multiplier =
        FLAGS_max_bytes_for_level_multiplier;
    // Build the memtable factory from --memtablerep; some reps require a
    // prefix extractor to be configured.
    Status s =
        CreateMemTableRepFactory(config_options, &options.memtable_factory);
    if (!s.ok()) {
      fprintf(stderr, "Could not create memtable factory: %s\n",
              s.ToString().c_str());
      exit(1);
    } else if ((FLAGS_prefix_size == 0) &&
               (options.memtable_factory->IsInstanceOf("prefix_hash") ||
                options.memtable_factory->IsInstanceOf("hash_linkedlist"))) {
      fprintf(stderr,
              "prefix_size should be non-zero if PrefixHash or "
              "HashLinkedList memtablerep is used\n");
      exit(1);
    }
    // Table factory selection: plain table, cuckoo table, or (default)
    // block-based table.
    if (FLAGS_use_plain_table) {
#ifndef ROCKSDB_LITE
      if (!options.memtable_factory->IsInstanceOf("prefix_hash") &&
          !options.memtable_factory->IsInstanceOf("hash_linkedlist")) {
        fprintf(stderr, "Warning: plain table is used with %s\n",
                options.memtable_factory->Name());
      }

      // Negative --bloom_bits means "use the PlainTableOptions default".
      int bloom_bits_per_key = FLAGS_bloom_bits;
      if (bloom_bits_per_key < 0) {
        bloom_bits_per_key = PlainTableOptions().bloom_bits_per_key;
      }

      PlainTableOptions plain_table_options;
      plain_table_options.user_key_len = FLAGS_key_size;
      plain_table_options.bloom_bits_per_key = bloom_bits_per_key;
      plain_table_options.hash_table_ratio = 0.75;
      options.table_factory = std::shared_ptr<TableFactory>(
          NewPlainTableFactory(plain_table_options));
#else
      fprintf(stderr, "Plain table is not supported in lite mode\n");
      exit(1);
#endif  // ROCKSDB_LITE
    } else if (FLAGS_use_cuckoo_table) {
#ifndef ROCKSDB_LITE
      if (FLAGS_cuckoo_hash_ratio > 1 || FLAGS_cuckoo_hash_ratio < 0) {
        fprintf(stderr, "Invalid cuckoo_hash_ratio\n");
        exit(1);
      }

      if (!FLAGS_mmap_read) {
        fprintf(stderr, "cuckoo table format requires mmap read to operate\n");
        exit(1);
      }

      ROCKSDB_NAMESPACE::CuckooTableOptions table_options;
      table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio;
      table_options.identity_as_first_hash = FLAGS_identity_as_first_hash;
      options.table_factory =
          std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
#else
      fprintf(stderr, "Cuckoo table is not supported in lite mode\n");
      exit(1);
#endif  // ROCKSDB_LITE
    } else {
      // Default: block-based table.
      BlockBasedTableOptions block_based_options;
      block_based_options.checksum =
          static_cast<ChecksumType>(FLAGS_checksum_type);
      // Index type: hash search requires a prefix extractor.
      if (FLAGS_use_hash_search) {
        if (FLAGS_prefix_size == 0) {
          fprintf(stderr,
                  "prefix_size not assigned when enable use_hash_search \n");
          exit(1);
        }
        block_based_options.index_type = BlockBasedTableOptions::kHashSearch;
      } else {
        block_based_options.index_type = BlockBasedTableOptions::kBinarySearch;
      }
      // Partitioned index (optionally with partitioned filters) overrides
      // the index type chosen above.
      if (FLAGS_partition_index_and_filters || FLAGS_partition_index) {
        // NOTE(review): these two incompatibility checks only warn; they do
        // not exit(1) like the other validation failures in this function.
        if (FLAGS_index_with_first_key) {
          fprintf(stderr,
                  "--index_with_first_key is not compatible with"
                  " partition index.");
        }
        if (FLAGS_use_hash_search) {
          fprintf(stderr,
                  "use_hash_search is incompatible with "
                  "partition index and is ignored");
        }
        block_based_options.index_type =
            BlockBasedTableOptions::kTwoLevelIndexSearch;
        block_based_options.metadata_block_size = FLAGS_metadata_block_size;
        if (FLAGS_partition_index_and_filters) {
          block_based_options.partition_filters = true;
        }
      } else if (FLAGS_index_with_first_key) {
        block_based_options.index_type =
            BlockBasedTableOptions::kBinarySearchWithFirstKey;
      }
      // Map --index_shortening_mode (0/1/2) onto the enum; an unknown value
      // warns and keeps the BlockBasedTableOptions default.
      BlockBasedTableOptions::IndexShorteningMode index_shortening =
          block_based_options.index_shortening;
      switch (FLAGS_index_shortening_mode) {
        case 0:
          index_shortening =
              BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
          break;
        case 1:
          index_shortening =
              BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators;
          break;
        case 2:
          index_shortening = BlockBasedTableOptions::IndexShorteningMode::
              kShortenSeparatorsAndSuccessor;
          break;
        default:
          fprintf(stderr, "Unknown key shortening mode\n");
      }
      block_based_options.optimize_filters_for_memory =
          FLAGS_optimize_filters_for_memory;
      block_based_options.index_shortening = index_shortening;
      if (cache_ == nullptr) {
        block_based_options.no_block_cache = true;
      }
      block_based_options.cache_index_and_filter_blocks =
          FLAGS_cache_index_and_filter_blocks;
      block_based_options.pin_l0_filter_and_index_blocks_in_cache =
          FLAGS_pin_l0_filter_and_index_blocks_in_cache;
      block_based_options.pin_top_level_index_and_filter =
          FLAGS_pin_top_level_index_and_filter;
      if (FLAGS_cache_high_pri_pool_ratio > 1e-6) {  // > 0.0 + eps
        block_based_options.cache_index_and_filter_blocks_with_high_priority =
            true;
      }
      // NOTE(review): warning only — execution continues even though the
      // combined pool ratios are invalid.
      if (FLAGS_cache_high_pri_pool_ratio + FLAGS_cache_low_pri_pool_ratio >
          1.0) {
        fprintf(stderr,
                "Sum of high_pri_pool_ratio and low_pri_pool_ratio "
                "cannot exceed 1.0.\n");
      }
      block_based_options.block_cache = cache_;
      // Per-role cache charging: each --charge_* flag decides whether that
      // entry role's memory is charged against the block cache.
      block_based_options.cache_usage_options.options_overrides.insert(
          {CacheEntryRole::kCompressionDictionaryBuildingBuffer,
           {/*.charged = */ FLAGS_charge_compression_dictionary_building_buffer
                ? CacheEntryRoleOptions::Decision::kEnabled
                : CacheEntryRoleOptions::Decision::kDisabled}});
      block_based_options.cache_usage_options.options_overrides.insert(
          {CacheEntryRole::kFilterConstruction,
           {/*.charged = */ FLAGS_charge_filter_construction
                ? CacheEntryRoleOptions::Decision::kEnabled
                : CacheEntryRoleOptions::Decision::kDisabled}});
      block_based_options.cache_usage_options.options_overrides.insert(
          {CacheEntryRole::kBlockBasedTableReader,
           {/*.charged = */ FLAGS_charge_table_reader
                ? CacheEntryRoleOptions::Decision::kEnabled
                : CacheEntryRoleOptions::Decision::kDisabled}});
      block_based_options.cache_usage_options.options_overrides.insert(
          {CacheEntryRole::kFileMetadata,
           {/*.charged = */ FLAGS_charge_file_metadata
                ? CacheEntryRoleOptions::Decision::kEnabled
                : CacheEntryRoleOptions::Decision::kDisabled}});
      block_based_options.cache_usage_options.options_overrides.insert(
          {CacheEntryRole::kBlobCache,
           {/*.charged = */ FLAGS_charge_blob_cache
                ? CacheEntryRoleOptions::Decision::kEnabled
                : CacheEntryRoleOptions::Decision::kDisabled}});
      block_based_options.block_cache_compressed = compressed_cache_;
      block_based_options.block_size = FLAGS_block_size;
      block_based_options.block_restart_interval = FLAGS_block_restart_interval;
      block_based_options.index_block_restart_interval =
          FLAGS_index_block_restart_interval;
      block_based_options.format_version =
          static_cast<uint32_t>(FLAGS_format_version);
      block_based_options.read_amp_bytes_per_bit = FLAGS_read_amp_bytes_per_bit;
      block_based_options.enable_index_compression =
          FLAGS_enable_index_compression;
      block_based_options.block_align = FLAGS_block_align;
      block_based_options.whole_key_filtering = FLAGS_whole_key_filtering;
      block_based_options.max_auto_readahead_size =
          FLAGS_max_auto_readahead_size;
      block_based_options.initial_auto_readahead_size =
          FLAGS_initial_auto_readahead_size;
      block_based_options.num_file_reads_for_auto_readahead =
          FLAGS_num_file_reads_for_auto_readahead;
      // Map --prepopulate_block_cache (0/1) onto the enum; an unknown value
      // warns and keeps the default.
      BlockBasedTableOptions::PrepopulateBlockCache prepopulate_block_cache =
          block_based_options.prepopulate_block_cache;
      switch (FLAGS_prepopulate_block_cache) {
        case 0:
          prepopulate_block_cache =
              BlockBasedTableOptions::PrepopulateBlockCache::kDisable;
          break;
        case 1:
          prepopulate_block_cache =
              BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
          break;
        default:
          fprintf(stderr, "Unknown prepopulate block cache mode\n");
      }
      block_based_options.prepopulate_block_cache = prepopulate_block_cache;
      if (FLAGS_use_data_block_hash_index) {
        block_based_options.data_block_index_type =
            ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinaryAndHash;
      } else {
        block_based_options.data_block_index_type =
            ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinarySearch;
      }
      block_based_options.data_block_hash_table_util_ratio =
          FLAGS_data_block_hash_table_util_ratio;
      if (FLAGS_read_cache_path != "") {
#ifndef ROCKSDB_LITE
        Status rc_status;

        // The persistent read cache needs a Logger; all read-cache logs go
        // to a file named rc_LOG inside the read cache path.
        rc_status = FLAGS_env->CreateDirIfMissing(FLAGS_read_cache_path);
        std::shared_ptr<Logger> read_cache_logger;
        if (rc_status.ok()) {
          rc_status = FLAGS_env->NewLogger(FLAGS_read_cache_path + "/rc_LOG",
                                           &read_cache_logger);
        }

        if (rc_status.ok()) {
          PersistentCacheConfig rc_cfg(FLAGS_env, FLAGS_read_cache_path,
                                       FLAGS_read_cache_size,
                                       read_cache_logger);

          rc_cfg.enable_direct_reads = FLAGS_read_cache_direct_read;
          rc_cfg.enable_direct_writes = FLAGS_read_cache_direct_write;
          // Fixed writer queue depth / dispatch size for the benchmark.
          rc_cfg.writer_qdepth = 4;
          rc_cfg.writer_dispatch_size = 4 * 1024;

          auto pcache = std::make_shared<BlockCacheTier>(rc_cfg);
          block_based_options.persistent_cache = pcache;
          rc_status = pcache->Open();
        }

        if (!rc_status.ok()) {
          fprintf(stderr, "Error initializing read cache, %s\n",
                  rc_status.ToString().c_str());
          exit(1);
        }
#else
        fprintf(stderr, "Read cache is not supported in LITE\n");
        exit(1);

#endif
      }

      // Integrated BlobDB blob cache: either share the block cache or build
      // a dedicated LRU cache of --blob_cache_size.
      if (FLAGS_use_blob_cache) {
        if (FLAGS_use_shared_block_and_blob_cache) {
          options.blob_cache = cache_;
        } else {
          if (FLAGS_blob_cache_size > 0) {
            LRUCacheOptions co;
            co.capacity = FLAGS_blob_cache_size;
            co.num_shard_bits = FLAGS_blob_cache_numshardbits;
            co.memory_allocator = GetCacheAllocator();

            options.blob_cache = NewLRUCache(co);
          } else {
            fprintf(
                stderr,
                "Unable to create a standalone blob cache if blob_cache_size "
                "<= 0.\n");
            exit(1);
          }
        }
        // Unlike the block-cache switch above, an unknown blob prepopulate
        // mode is fatal here.
        switch (FLAGS_prepopulate_blob_cache) {
          case 0:
            options.prepopulate_blob_cache = PrepopulateBlobCache::kDisable;
            break;
          case 1:
            options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
            break;
          default:
            fprintf(stderr, "Unknown prepopulate blob cache mode\n");
            exit(1);
        }

        fprintf(stdout,
                "Integrated BlobDB: blob cache enabled"
                ", block and blob caches shared: %d",
                FLAGS_use_shared_block_and_blob_cache);
        if (!FLAGS_use_shared_block_and_blob_cache) {
          fprintf(stdout,
                  ", blob cache size %" PRIu64
                  ", blob cache num shard bits: %d",
                  FLAGS_blob_cache_size, FLAGS_blob_cache_numshardbits);
        }
        fprintf(stdout, ", blob cache prepopulated: %d\n",
                FLAGS_prepopulate_blob_cache);
      } else {
        fprintf(stdout, "Integrated BlobDB: blob cache disabled\n");
      }

      options.table_factory.reset(
          NewBlockBasedTableFactory(block_based_options));
    }
    // Per-level fanouts, if given, must cover every level.
    if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) {
      if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() !=
          static_cast<unsigned int>(FLAGS_num_levels)) {
        fprintf(stderr, "Insufficient number of fanouts specified %d\n",
                static_cast<int>(
                    FLAGS_max_bytes_for_level_multiplier_additional_v.size()));
        exit(1);
      }
      options.max_bytes_for_level_multiplier_additional =
          FLAGS_max_bytes_for_level_multiplier_additional_v;
    }
    options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
    options.level0_file_num_compaction_trigger =
        FLAGS_level0_file_num_compaction_trigger;
    options.level0_slowdown_writes_trigger =
        FLAGS_level0_slowdown_writes_trigger;
    options.compression = FLAGS_compression_type_e;
    if (FLAGS_simulate_hybrid_fs_file != "") {
      options.bottommost_temperature = Temperature::kWarm;
    }
    options.preclude_last_level_data_seconds =
        FLAGS_preclude_last_level_data_seconds;
    options.preserve_internal_time_seconds =
        FLAGS_preserve_internal_time_seconds;
    options.sample_for_compression = FLAGS_sample_for_compression;
    options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
    options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
    options.max_total_wal_size = FLAGS_max_total_wal_size;

    // Per-level compression: levels below --min_level_to_compress are left
    // uncompressed; the rest use the configured compression type.
    if (FLAGS_min_level_to_compress >= 0) {
      assert(FLAGS_min_level_to_compress <= FLAGS_num_levels);
      options.compression_per_level.resize(FLAGS_num_levels);
      for (int i = 0; i < FLAGS_min_level_to_compress; i++) {
        options.compression_per_level[i] = kNoCompression;
      }
      for (int i = FLAGS_min_level_to_compress; i < FLAGS_num_levels; i++) {
        options.compression_per_level[i] = FLAGS_compression_type_e;
      }
    }
    options.soft_pending_compaction_bytes_limit =
        FLAGS_soft_pending_compaction_bytes_limit;
    options.hard_pending_compaction_bytes_limit =
        FLAGS_hard_pending_compaction_bytes_limit;
    options.delayed_write_rate = FLAGS_delayed_write_rate;
    options.allow_concurrent_memtable_write =
        FLAGS_allow_concurrent_memtable_write;
    options.experimental_mempurge_threshold =
        FLAGS_experimental_mempurge_threshold;
    options.inplace_update_support = FLAGS_inplace_update_support;
    options.inplace_update_num_locks = FLAGS_inplace_update_num_locks;
    options.enable_write_thread_adaptive_yield =
        FLAGS_enable_write_thread_adaptive_yield;
    options.enable_pipelined_write = FLAGS_enable_pipelined_write;
    options.unordered_write = FLAGS_unordered_write;
    options.write_thread_max_yield_usec = FLAGS_write_thread_max_yield_usec;
    options.write_thread_slow_yield_usec = FLAGS_write_thread_slow_yield_usec;
    options.table_cache_numshardbits = FLAGS_table_cache_numshardbits;
    options.max_compaction_bytes = FLAGS_max_compaction_bytes;
    options.disable_auto_compactions = FLAGS_disable_auto_compactions;
    options.optimize_filters_for_hits = FLAGS_optimize_filters_for_hits;
    options.paranoid_checks = FLAGS_paranoid_checks;
    options.force_consistency_checks = FLAGS_force_consistency_checks;
    options.check_flush_compaction_key_order =
        FLAGS_check_flush_compaction_key_order;
    options.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds;
    options.ttl = FLAGS_ttl_seconds;
    // fill storage options
    options.advise_random_on_open = FLAGS_advise_random_on_open;
    options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e;
    options.use_adaptive_mutex = FLAGS_use_adaptive_mutex;
    options.bytes_per_sync = FLAGS_bytes_per_sync;
    options.wal_bytes_per_sync = FLAGS_wal_bytes_per_sync;

    // merge operator options
    if (!FLAGS_merge_operator.empty()) {
      s = MergeOperator::CreateFromString(config_options, FLAGS_merge_operator,
                                          &options.merge_operator);
      if (!s.ok()) {
        fprintf(stderr, "invalid merge operator[%s]: %s\n",
                FLAGS_merge_operator.c_str(), s.ToString().c_str());
        exit(1);
      }
    }
    options.max_successive_merges = FLAGS_max_successive_merges;
    options.report_bg_io_stats = FLAGS_report_bg_io_stats;

    // set universal style compaction configurations, if applicable
    // (zero / -1 flag values mean "keep the library default").
    if (FLAGS_universal_size_ratio != 0) {
      options.compaction_options_universal.size_ratio =
          FLAGS_universal_size_ratio;
    }
    if (FLAGS_universal_min_merge_width != 0) {
      options.compaction_options_universal.min_merge_width =
          FLAGS_universal_min_merge_width;
    }
    if (FLAGS_universal_max_merge_width != 0) {
      options.compaction_options_universal.max_merge_width =
          FLAGS_universal_max_merge_width;
    }
    if (FLAGS_universal_max_size_amplification_percent != 0) {
      options.compaction_options_universal.max_size_amplification_percent =
          FLAGS_universal_max_size_amplification_percent;
    }
    if (FLAGS_universal_compression_size_percent != -1) {
      options.compaction_options_universal.compression_size_percent =
          FLAGS_universal_compression_size_percent;
    }
    options.compaction_options_universal.allow_trivial_move =
        FLAGS_universal_allow_trivial_move;
    options.compaction_options_universal.incremental =
        FLAGS_universal_incremental;
    if (FLAGS_thread_status_per_interval > 0) {
      options.enable_thread_tracking = true;
    }

    // User-defined timestamps: only the 64-bit bytewise wrapper is supported.
    if (FLAGS_user_timestamp_size > 0) {
      if (FLAGS_user_timestamp_size != 8) {
        fprintf(stderr, "Only 64 bits timestamps are supported.\n");
        exit(1);
      }
      options.comparator = test::BytewiseComparatorWithU64TsWrapper();
    }

    options.allow_data_in_errors = FLAGS_allow_data_in_errors;
    options.track_and_verify_wals_in_manifest =
        FLAGS_track_and_verify_wals_in_manifest;

    // Integrated BlobDB
    options.enable_blob_files = FLAGS_enable_blob_files;
    options.min_blob_size = FLAGS_min_blob_size;
    options.blob_file_size = FLAGS_blob_file_size;
    options.blob_compression_type =
        StringToCompressionType(FLAGS_blob_compression_type.c_str());
    options.enable_blob_garbage_collection =
        FLAGS_enable_blob_garbage_collection;
    options.blob_garbage_collection_age_cutoff =
        FLAGS_blob_garbage_collection_age_cutoff;
    options.blob_garbage_collection_force_threshold =
        FLAGS_blob_garbage_collection_force_threshold;
    options.blob_compaction_readahead_size =
        FLAGS_blob_compaction_readahead_size;
    options.blob_file_starting_level = FLAGS_blob_file_starting_level;

#ifndef ROCKSDB_LITE
    // Mutually exclusive DB-opening modes.
    if (FLAGS_readonly && FLAGS_transaction_db) {
      fprintf(stderr, "Cannot use readonly flag with transaction_db\n");
      exit(1);
    }
    if (FLAGS_use_secondary_db &&
        (FLAGS_transaction_db || FLAGS_optimistic_transaction_db)) {
      fprintf(stderr, "Cannot use use_secondary_db flag with transaction_db\n");
      exit(1);
    }
#endif  // ROCKSDB_LITE
    options.memtable_protection_bytes_per_key =
        FLAGS_memtable_protection_bytes_per_key;
  }
4660
  // Final, source-independent setup after *opts has been populated either
  // from an OPTIONS file or from command-line flags: fills in shared objects
  // that are still unset, applies settings that require runtime API calls,
  // and opens the database(s).
  void InitializeOptionsGeneral(Options* opts) {
    // Be careful about what is set here to avoid accidentally overwriting
    // settings already configured by OPTIONS file. Only configure settings that
    // are needed for the benchmark to run, settings for shared objects that
    // were not configured already, settings that require dynamically invoking
    // APIs, and settings for the benchmark itself.
    Options& options = *opts;

    // Always set these since they are harmless when not needed and prevent
    // a guaranteed failure when they are needed.
    options.create_missing_column_families = true;
    options.create_if_missing = true;

    if (options.statistics == nullptr) {
      options.statistics = dbstats;
    }

    // Only applies when the table factory is block-based (GetOptions returns
    // nullptr otherwise, e.g. for plain or cuckoo tables).
    auto table_options =
        options.table_factory->GetOptions<BlockBasedTableOptions>();
    if (table_options != nullptr) {
      if (FLAGS_cache_size > 0) {
        // This violates this function's rules on when to set options. But we
        // have to do it because the case of unconfigured block cache in OPTIONS
        // file is indistinguishable (it is sanitized to 8MB by this point, not
        // nullptr), and our regression tests assume this will be the shared
        // block cache, even with OPTIONS file provided.
        table_options->block_cache = cache_;
      }
      if (table_options->filter_policy == nullptr) {
        if (FLAGS_bloom_bits < 0) {
          // Negative --bloom_bits: keep the library's default filter policy.
          table_options->filter_policy = BlockBasedTableOptions().filter_policy;
        } else if (FLAGS_bloom_bits == 0) {
          // Zero explicitly disables filters.
          table_options->filter_policy.reset();
        } else {
          table_options->filter_policy.reset(
              FLAGS_use_ribbon_filter ? NewRibbonFilterPolicy(FLAGS_bloom_bits)
                                      : NewBloomFilterPolicy(FLAGS_bloom_bits));
        }
      }
    }

    if (options.row_cache == nullptr) {
      if (FLAGS_row_cache_size) {
        if (FLAGS_cache_numshardbits >= 1) {
          options.row_cache =
              NewLRUCache(FLAGS_row_cache_size, FLAGS_cache_numshardbits);
        } else {
          options.row_cache = NewLRUCache(FLAGS_row_cache_size);
        }
      }
    }

    // Replace the default Env only if it was not customized (e.g. by the
    // OPTIONS file path above).
    if (options.env == Env::Default()) {
      options.env = FLAGS_env;
    }
    if (FLAGS_enable_io_prio) {
      options.env->LowerThreadPoolIOPriority(Env::LOW);
      options.env->LowerThreadPoolIOPriority(Env::HIGH);
    }
    if (FLAGS_enable_cpu_prio) {
      options.env->LowerThreadPoolCPUPriority(Env::LOW);
      options.env->LowerThreadPoolCPUPriority(Env::HIGH);
    }

    // Sine-modulated write rate starts from SineRate(0).
    if (FLAGS_sine_write_rate) {
      FLAGS_benchmark_write_rate_limit = static_cast<uint64_t>(SineRate(0));
    }

    if (options.rate_limiter == nullptr) {
      if (FLAGS_rate_limiter_bytes_per_sec > 0) {
        options.rate_limiter.reset(NewGenericRateLimiter(
            FLAGS_rate_limiter_bytes_per_sec,
            FLAGS_rate_limiter_refill_period_us, 10 /* fairness */,
            // TODO: replace this with a more general FLAG for deciding
            // RateLimiter::Mode as now we also rate-limit foreground reads e.g,
            // Get()/MultiGet()
            FLAGS_rate_limit_bg_reads ? RateLimiter::Mode::kReadsOnly
                                      : RateLimiter::Mode::kWritesOnly,
            FLAGS_rate_limiter_auto_tuned));
      }
    }

    options.listeners.emplace_back(listener_);

    if (options.file_checksum_gen_factory == nullptr) {
      if (FLAGS_file_checksum) {
        options.file_checksum_gen_factory.reset(
            new FileChecksumGenCrc32cFactory());
      }
    }

    // Open one DB, or --num_multi_db DBs each with its own path (and WAL
    // dir, when one was configured).
    if (FLAGS_num_multi_db <= 1) {
      OpenDb(options, FLAGS_db, &db_);
    } else {
      multi_dbs_.clear();
      multi_dbs_.resize(FLAGS_num_multi_db);
      auto wal_dir = options.wal_dir;
      for (int i = 0; i < FLAGS_num_multi_db; i++) {
        if (!wal_dir.empty()) {
          options.wal_dir = GetPathForMultiple(wal_dir, i);
        }
        OpenDb(options, GetPathForMultiple(FLAGS_db, i), &multi_dbs_[i]);
      }
      // Restore the original wal_dir after the per-DB overrides.
      options.wal_dir = wal_dir;
    }

    // KeepFilter is a noop filter, this can be used to test compaction filter
    if (options.compaction_filter == nullptr) {
      if (FLAGS_use_keep_filter) {
        options.compaction_filter = new KeepFilter();
        fprintf(stdout, "A noop compaction filter is used\n");
      }
    }

    // --use_existing_keys: scan every key already in the DB into keys_ and
    // size the benchmark (FLAGS_num) to that key count.
    if (FLAGS_use_existing_keys) {
      // Only work on single database
      assert(db_.db != nullptr);
      ReadOptions read_opts;  // before read_options_ initialized
      read_opts.total_order_seek = true;
      Iterator* iter = db_.db->NewIterator(read_opts);
      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
        keys_.emplace_back(iter->key().ToString());
      }
      delete iter;
      FLAGS_num = keys_.size();
    }
  }
4788
4789 void Open(Options* opts) {
4790 if (!InitializeOptionsFromFile(opts)) {
4791 InitializeOptionsFromFlags(opts);
4792 }
4793
4794 InitializeOptionsGeneral(opts);
4795 }
4796
  // Opens the database (or one of several DB variants selected by flags) at
  // `db_name` and fills in `db` with the resulting handle(s).
  //
  // `options` is taken by value on purpose: several branches below mutate it
  // (e.g. forcing two_write_queues for unordered-write TransactionDBs, or
  // installing an info_log) without affecting the caller's copy.
  //
  // On any open failure this prints the error and exits the process — callers
  // never see a bad handle. When FLAGS_report_open_timing is set, the elapsed
  // wall time of the open is printed to stdout.
  void OpenDb(Options options, const std::string& db_name,
              DBWithColumnFamilies* db) {
    uint64_t open_start = FLAGS_report_open_timing ? FLAGS_env->NowNanos() : 0;
    Status s;
    // Open with column families if necessary.
    if (FLAGS_num_column_families > 1) {
      // Only the first `num_hot` column families are created up front; the
      // remainder are created lazily as the benchmark advances stages.
      size_t num_hot = FLAGS_num_column_families;
      if (FLAGS_num_hot_column_families > 0 &&
          FLAGS_num_hot_column_families < FLAGS_num_column_families) {
        num_hot = FLAGS_num_hot_column_families;
      } else {
        FLAGS_num_hot_column_families = FLAGS_num_column_families;
      }
      std::vector<ColumnFamilyDescriptor> column_families;
      for (size_t i = 0; i < num_hot; i++) {
        column_families.push_back(ColumnFamilyDescriptor(
            ColumnFamilyName(i), ColumnFamilyOptions(options)));
      }
      // Optional per-CF write-probability distribution, given as a
      // comma-separated list of integer percentages that must sum to 100
      // and contain exactly one entry per hot column family.
      std::vector<int> cfh_idx_to_prob;
      if (!FLAGS_column_family_distribution.empty()) {
        std::stringstream cf_prob_stream(FLAGS_column_family_distribution);
        std::string cf_prob;
        int sum = 0;
        while (std::getline(cf_prob_stream, cf_prob, ',')) {
          cfh_idx_to_prob.push_back(std::stoi(cf_prob));
          sum += cfh_idx_to_prob.back();
        }
        if (sum != 100) {
          fprintf(stderr, "column_family_distribution items must sum to 100\n");
          exit(1);
        }
        if (cfh_idx_to_prob.size() != num_hot) {
          fprintf(stderr,
                  "got %" ROCKSDB_PRIszt
                  " column_family_distribution items; expected "
                  "%" ROCKSDB_PRIszt "\n",
                  cfh_idx_to_prob.size(), num_hot);
          exit(1);
        }
      }
#ifndef ROCKSDB_LITE
      if (FLAGS_readonly) {
        s = DB::OpenForReadOnly(options, db_name, column_families, &db->cfh,
                                &db->db);
      } else if (FLAGS_optimistic_transaction_db) {
        s = OptimisticTransactionDB::Open(options, db_name, column_families,
                                          &db->cfh, &db->opt_txn_db);
        if (s.ok()) {
          db->db = db->opt_txn_db->GetBaseDB();
        }
      } else if (FLAGS_transaction_db) {
        TransactionDB* ptr;
        TransactionDBOptions txn_db_options;
        // unordered_write requires a two-write-queue, write-prepared setup
        // with concurrency control handled by the caller.
        if (options.unordered_write) {
          options.two_write_queues = true;
          txn_db_options.skip_concurrency_control = true;
          txn_db_options.write_policy = WRITE_PREPARED;
        }
        s = TransactionDB::Open(options, txn_db_options, db_name,
                                column_families, &db->cfh, &ptr);
        if (s.ok()) {
          db->db = ptr;
        }
      } else {
        s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
      }
#else
      s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
#endif  // ROCKSDB_LITE
      // Reserve slots for all CFs; only the hot ones have live handles now.
      db->cfh.resize(FLAGS_num_column_families);
      db->num_created = num_hot;
      db->num_hot = num_hot;
      db->cfh_idx_to_prob = std::move(cfh_idx_to_prob);
#ifndef ROCKSDB_LITE
    } else if (FLAGS_readonly) {
      s = DB::OpenForReadOnly(options, db_name, &db->db);
    } else if (FLAGS_optimistic_transaction_db) {
      s = OptimisticTransactionDB::Open(options, db_name, &db->opt_txn_db);
      if (s.ok()) {
        db->db = db->opt_txn_db->GetBaseDB();
      }
    } else if (FLAGS_transaction_db) {
      TransactionDB* ptr = nullptr;
      TransactionDBOptions txn_db_options;
      if (options.unordered_write) {
        options.two_write_queues = true;
        txn_db_options.skip_concurrency_control = true;
        txn_db_options.write_policy = WRITE_PREPARED;
      }
      // NOTE(review): only this single-CF TransactionDB path creates an
      // explicit info_log before opening — presumably so Open failures are
      // still logged; the multi-CF path above does not. Verify intent.
      s = CreateLoggerFromOptions(db_name, options, &options.info_log);
      if (s.ok()) {
        s = TransactionDB::Open(options, txn_db_options, db_name, &ptr);
      }
      if (s.ok()) {
        db->db = ptr;
      }
    } else if (FLAGS_use_blob_db) {
      // Stacked BlobDB (the separate blob_db::BlobDB wrapper, not the
      // integrated BlobDB configured through ColumnFamilyOptions).
      blob_db::BlobDBOptions blob_db_options;
      blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
      blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff;
      blob_db_options.is_fifo = FLAGS_blob_db_is_fifo;
      blob_db_options.max_db_size = FLAGS_blob_db_max_db_size;
      blob_db_options.ttl_range_secs = FLAGS_blob_db_ttl_range_secs;
      blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size;
      blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
      blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
      blob_db_options.compression = FLAGS_blob_db_compression_type_e;
      blob_db::BlobDB* ptr = nullptr;
      s = blob_db::BlobDB::Open(options, blob_db_options, db_name, &ptr);
      if (s.ok()) {
        db->db = ptr;
      }
    } else if (FLAGS_use_secondary_db) {
      if (FLAGS_secondary_path.empty()) {
        std::string default_secondary_path;
        FLAGS_env->GetTestDirectory(&default_secondary_path);
        default_secondary_path += "/dbbench_secondary";
        FLAGS_secondary_path = default_secondary_path;
      }
      s = DB::OpenAsSecondary(options, db_name, FLAGS_secondary_path, &db->db);
      // Optionally spawn a background thread that periodically catches the
      // secondary instance up with the primary until told to stop.
      if (s.ok() && FLAGS_secondary_update_interval > 0) {
        secondary_update_thread_.reset(new port::Thread(
            [this](int interval, DBWithColumnFamilies* _db) {
              while (0 == secondary_update_stopped_.load(
                              std::memory_order_relaxed)) {
                Status secondary_update_status =
                    _db->db->TryCatchUpWithPrimary();
                if (!secondary_update_status.ok()) {
                  fprintf(stderr, "Failed to catch up with primary: %s\n",
                          secondary_update_status.ToString().c_str());
                  break;
                }
                ++secondary_db_updates_;
                FLAGS_env->SleepForMicroseconds(interval * 1000000);
              }
            },
            FLAGS_secondary_update_interval, db));
      }
#endif  // ROCKSDB_LITE
    } else {
      s = DB::Open(options, db_name, &db->db);
    }
    if (FLAGS_report_open_timing) {
      std::cout << "OpenDb: "
                << (FLAGS_env->NowNanos() - open_start) / 1000000.0
                << " milliseconds\n";
    }
    if (!s.ok()) {
      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
      exit(1);
    }
  }
  // Key-selection strategy for the write benchmarks: RANDOM picks uniformly
  // (with repeats), SEQUENTIAL counts up from zero, and UNIQUE_RANDOM walks a
  // random permutation so every key in [0, num) is written exactly once.
  enum WriteMode { RANDOM, SEQUENTIAL, UNIQUE_RANDOM };
4953 void WriteSeqDeterministic(ThreadState* thread) {
4954 DoDeterministicCompact(thread, open_options_.compaction_style, SEQUENTIAL);
4955 }
4956
4957 void WriteUniqueRandomDeterministic(ThreadState* thread) {
4958 DoDeterministicCompact(thread, open_options_.compaction_style,
4959 UNIQUE_RANDOM);
4960 }
4961
1e59de90 4962 void WriteSeq(ThreadState* thread) { DoWrite(thread, SEQUENTIAL); }
7c673cae 4963
1e59de90 4964 void WriteRandom(ThreadState* thread) { DoWrite(thread, RANDOM); }
7c673cae
FG
4965
4966 void WriteUniqueRandom(ThreadState* thread) {
4967 DoWrite(thread, UNIQUE_RANDOM);
4968 }
4969
4970 class KeyGenerator {
4971 public:
11fdf7f2
TL
4972 KeyGenerator(Random64* rand, WriteMode mode, uint64_t num,
4973 uint64_t /*num_per_set*/ = 64 * 1024)
4974 : rand_(rand), mode_(mode), num_(num), next_(0) {
7c673cae
FG
4975 if (mode_ == UNIQUE_RANDOM) {
4976 // NOTE: if memory consumption of this approach becomes a concern,
4977 // we can either break it into pieces and only random shuffle a section
4978 // each time. Alternatively, use a bit map implementation
4979 // (https://reviews.facebook.net/differential/diff/54627/)
4980 values_.resize(num_);
4981 for (uint64_t i = 0; i < num_; ++i) {
4982 values_[i] = i;
4983 }
20effc67 4984 RandomShuffle(values_.begin(), values_.end(),
1e59de90 4985 static_cast<uint32_t>(seed_base));
7c673cae
FG
4986 }
4987 }
4988
4989 uint64_t Next() {
4990 switch (mode_) {
4991 case SEQUENTIAL:
4992 return next_++;
4993 case RANDOM:
4994 return rand_->Next() % num_;
4995 case UNIQUE_RANDOM:
11fdf7f2 4996 assert(next_ < num_);
7c673cae
FG
4997 return values_[next_++];
4998 }
4999 assert(false);
5000 return std::numeric_limits<uint64_t>::max();
5001 }
5002
1e59de90
TL
5003 // Only available for UNIQUE_RANDOM mode.
5004 uint64_t Fetch(uint64_t index) {
5005 assert(mode_ == UNIQUE_RANDOM);
5006 assert(index < values_.size());
5007 return values_[index];
5008 }
5009
7c673cae
FG
5010 private:
5011 Random64* rand_;
5012 WriteMode mode_;
5013 const uint64_t num_;
5014 uint64_t next_;
5015 std::vector<uint64_t> values_;
5016 };
5017
1e59de90 5018 DB* SelectDB(ThreadState* thread) { return SelectDBWithCfh(thread)->db; }
7c673cae
FG
5019
5020 DBWithColumnFamilies* SelectDBWithCfh(ThreadState* thread) {
5021 return SelectDBWithCfh(thread->rand.Next());
5022 }
5023
5024 DBWithColumnFamilies* SelectDBWithCfh(uint64_t rand_int) {
5025 if (db_.db != nullptr) {
5026 return &db_;
1e59de90 5027 } else {
7c673cae
FG
5028 return &multi_dbs_[rand_int % multi_dbs_.size()];
5029 }
5030 }
5031
11fdf7f2 5032 double SineRate(double x) {
1e59de90 5033 return FLAGS_sine_a * sin((FLAGS_sine_b * x) + FLAGS_sine_c) + FLAGS_sine_d;
11fdf7f2
TL
5034 }
5035
7c673cae
FG
  // Core write-benchmark loop shared by fillseq/fillrandom/filluniquerandom.
  //
  // Per batch it: (1) advances the column-family "stage" if needed,
  // (2) selects a target DB, (3) builds a WriteBatch of entries_per_batch_
  // puts (plus optional range tombstones and disposable/persistent-entry
  // deletes), (4) applies rate limiting and user timestamps, and (5) writes
  // the batch. Stacked BlobDB writes bypass the batch and go directly to the
  // DB. Exits the process via ErrorExit() on any unrecoverable error.
  void DoWrite(ThreadState* thread, WriteMode write_mode) {
    // Only RANDOM mode honors -duration; the fill modes run to completion.
    const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0;
    const int64_t num_ops = writes_ == 0 ? num_ : writes_;

    // One key generator per DB in multi-DB mode.
    size_t num_key_gens = 1;
    if (db_.db == nullptr) {
      num_key_gens = multi_dbs_.size();
    }
    std::vector<std::unique_ptr<KeyGenerator>> key_gens(num_key_gens);
    int64_t max_ops = num_ops * num_key_gens;
    int64_t ops_per_stage = max_ops;
    if (FLAGS_num_column_families > 1 && FLAGS_num_hot_column_families > 0) {
      ops_per_stage = (max_ops - 1) / (FLAGS_num_column_families /
                                       FLAGS_num_hot_column_families) +
                      1;
    }

    Duration duration(test_duration, max_ops, ops_per_stage);
    // Extra headroom so range-tombstone begin keys drawn from the generator
    // don't exhaust a UNIQUE_RANDOM permutation early.
    const uint64_t num_per_key_gen = num_ + max_num_range_tombstones_;
    for (size_t i = 0; i < num_key_gens; i++) {
      key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode,
                                         num_per_key_gen, ops_per_stage));
    }

    if (num_ != FLAGS_num) {
      char msg[100];
      snprintf(msg, sizeof(msg), "(%" PRIu64 " ops)", num_);
      thread->stats.AddMessage(msg);
    }

    RandomGenerator gen;
    WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
                     FLAGS_write_batch_protection_bytes_per_key,
                     user_timestamp_size_);
    Status s;
    int64_t bytes = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<const char[]> begin_key_guard;
    Slice begin_key = AllocateKey(&begin_key_guard);
    std::unique_ptr<const char[]> end_key_guard;
    Slice end_key = AllocateKey(&end_key_guard);
    double p = 0.0;
    uint64_t num_overwrites = 0, num_unique_keys = 0, num_selective_deletes = 0;
    // If user set overwrite_probability flag,
    // check if value is in [0.0,1.0].
    if (FLAGS_overwrite_probability > 0.0) {
      p = FLAGS_overwrite_probability > 1.0 ? 1.0 : FLAGS_overwrite_probability;
      // If overwrite set by user, and UNIQUE_RANDOM mode on,
      // the overwrite_window_size must be > 0.
      if (write_mode == UNIQUE_RANDOM && FLAGS_overwrite_window_size == 0) {
        fprintf(stderr,
                "Overwrite_window_size must be strictly greater than 0.\n");
        ErrorExit();
      }
    }

    // Default_random_engine provides slightly
    // improved throughput over mt19937.
    std::default_random_engine overwrite_gen{
        static_cast<unsigned int>(seed_base)};
    std::bernoulli_distribution overwrite_decider(p);

    // Inserted key window is filled with the last N
    // keys previously inserted into the DB (with
    // N=FLAGS_overwrite_window_size).
    // We use a deque struct because:
    // - random access is O(1)
    // - insertion/removal at beginning/end is also O(1).
    std::deque<int64_t> inserted_key_window;
    Random64 reservoir_id_gen(seed_base);

    // --- Variables used in disposable/persistent keys simulation:
    // The following variables are used when
    // disposable_entries_batch_size is >0. We simulate a workload
    // where the following sequence is repeated multiple times:
    // "A set of keys S1 is inserted ('disposable entries'), then after
    // some delay another set of keys S2 is inserted ('persistent entries')
    // and the first set of keys S1 is deleted. S2 artificially represents
    // the insertion of hypothetical results from some undefined computation
    // done on the first set of keys S1. The next sequence can start as soon
    // as the last disposable entry in the set S1 of this sequence is
    // inserted, if the delay is non negligible"
    bool skip_for_loop = false, is_disposable_entry = true;
    std::vector<uint64_t> disposable_entries_index(num_key_gens, 0);
    std::vector<uint64_t> persistent_ent_and_del_index(num_key_gens, 0);
    const uint64_t kNumDispAndPersEntries =
        FLAGS_disposable_entries_batch_size +
        FLAGS_persistent_entries_batch_size;
    if (kNumDispAndPersEntries > 0) {
      if ((write_mode != UNIQUE_RANDOM) || (writes_per_range_tombstone_ > 0) ||
          (p > 0.0)) {
        fprintf(
            stderr,
            "Disposable/persistent deletes are not compatible with overwrites "
            "and DeleteRanges; and are only supported in filluniquerandom.\n");
        ErrorExit();
      }
      if (FLAGS_disposable_entries_value_size < 0 ||
          FLAGS_persistent_entries_value_size < 0) {
        fprintf(
            stderr,
            "disposable_entries_value_size and persistent_entries_value_size"
            "have to be positive.\n");
        ErrorExit();
      }
    }
    Random rnd_disposable_entry(static_cast<uint32_t>(seed_base));
    std::string random_value;
    // Queue that stores scheduled timestamp of disposable entries deletes,
    // along with starting index of disposable entry keys to delete.
    std::vector<std::queue<std::pair<uint64_t, uint64_t>>> disposable_entries_q(
        num_key_gens);
    // --- End of variables used in disposable/persistent keys simulation.

    std::vector<std::unique_ptr<const char[]>> expanded_key_guards;
    std::vector<Slice> expanded_keys;
    if (FLAGS_expand_range_tombstones) {
      expanded_key_guards.resize(range_tombstone_width_);
      for (auto& expanded_key_guard : expanded_key_guards) {
        expanded_keys.emplace_back(AllocateKey(&expanded_key_guard));
      }
    }

    // Scratch buffer reused for each batch's user timestamp.
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

    int64_t stage = 0;
    int64_t num_written = 0;
    int64_t next_seq_db_at = num_ops;
    size_t id = 0;
    int64_t num_range_deletions = 0;

    while ((num_per_key_gen != 0) && !duration.Done(entries_per_batch_)) {
      // Stage change => create the next column family on every DB.
      if (duration.GetStage() != stage) {
        stage = duration.GetStage();
        if (db_.db != nullptr) {
          db_.CreateNewCf(open_options_, stage);
        } else {
          for (auto& db : multi_dbs_) {
            db.CreateNewCf(open_options_, stage);
          }
        }
      }

      if (write_mode != SEQUENTIAL) {
        id = thread->rand.Next() % num_key_gens;
      } else {
        // When doing a sequential load with multiple databases, load them in
        // order rather than all at the same time to avoid:
        // 1) long delays between flushing memtables
        // 2) flushing memtables for all of them at the same point in time
        // 3) not putting the same number of keys in each database
        if (num_written >= next_seq_db_at) {
          next_seq_db_at += num_ops;
          id++;
          if (id >= num_key_gens) {
            fprintf(stderr, "Logic error. Filled all databases\n");
            ErrorExit();
          }
        }
      }
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(id);

      batch.Clear();
      int64_t batch_bytes = 0;

      for (int64_t j = 0; j < entries_per_batch_; j++) {
        int64_t rand_num = 0;
        if ((write_mode == UNIQUE_RANDOM) && (p > 0.0)) {
          // With probability p, overwrite a key from the sliding window of
          // recently inserted keys; otherwise insert a fresh unique key.
          if ((inserted_key_window.size() > 0) &&
              overwrite_decider(overwrite_gen)) {
            num_overwrites++;
            rand_num = inserted_key_window[reservoir_id_gen.Next() %
                                           inserted_key_window.size()];
          } else {
            num_unique_keys++;
            rand_num = key_gens[id]->Next();
            if (inserted_key_window.size() < FLAGS_overwrite_window_size) {
              inserted_key_window.push_back(rand_num);
            } else {
              inserted_key_window.pop_front();
              inserted_key_window.push_back(rand_num);
            }
          }
        } else if (kNumDispAndPersEntries > 0) {
          // Check if queue is non-empty and if we need to insert
          // 'persistent' KV entries (KV entries that are never deleted)
          // and delete disposable entries previously inserted.
          if (!disposable_entries_q[id].empty() &&
              (disposable_entries_q[id].front().first <
               FLAGS_env->NowMicros())) {
            // If we need to perform a "merge op" pattern,
            // we first write all the persistent KV entries not targeted
            // by deletes, and then we write the disposable entries deletes.
            if (persistent_ent_and_del_index[id] <
                FLAGS_persistent_entries_batch_size) {
              // Generate key to insert.
              rand_num =
                  key_gens[id]->Fetch(disposable_entries_q[id].front().second +
                                      FLAGS_disposable_entries_batch_size +
                                      persistent_ent_and_del_index[id]);
              persistent_ent_and_del_index[id]++;
              is_disposable_entry = false;
              skip_for_loop = false;
            } else if (persistent_ent_and_del_index[id] <
                       kNumDispAndPersEntries) {
              // Find key of the entry to delete.
              rand_num =
                  key_gens[id]->Fetch(disposable_entries_q[id].front().second +
                                      (persistent_ent_and_del_index[id] -
                                       FLAGS_persistent_entries_batch_size));
              persistent_ent_and_del_index[id]++;
              GenerateKeyFromInt(rand_num, FLAGS_num, &key);
              // For the delete operation, everything happens here and we
              // skip the rest of the for-loop, which is designed for
              // inserts.
              if (FLAGS_num_column_families <= 1) {
                batch.Delete(key);
              } else {
                // We use same rand_num as seed for key and column family so
                // that we can deterministically find the cfh corresponding to a
                // particular key while reading the key.
                batch.Delete(db_with_cfh->GetCfh(rand_num), key);
              }
              // A delete only includes Key+Timestamp (no value).
              batch_bytes += key_size_ + user_timestamp_size_;
              bytes += key_size_ + user_timestamp_size_;
              num_selective_deletes++;
              // Skip rest of the for-loop (j=0, j<entries_per_batch_,j++).
              skip_for_loop = true;
            } else {
              assert(false);  // should never reach this point.
            }
            // If disposable_entries_q needs to be updated (ie: when a selective
            // insert+delete was successfully completed, pop the job out of the
            // queue).
            if (!disposable_entries_q[id].empty() &&
                (disposable_entries_q[id].front().first <
                 FLAGS_env->NowMicros()) &&
                persistent_ent_and_del_index[id] == kNumDispAndPersEntries) {
              disposable_entries_q[id].pop();
              persistent_ent_and_del_index[id] = 0;
            }

            // If we are deleting disposable entries, skip the rest of the
            // for-loop since there is no key-value inserts at this moment in
            // time.
            if (skip_for_loop) {
              continue;
            }

          }
          // If no job is in the queue, then we keep inserting disposable KV
          // entries that will be deleted later by a series of deletes.
          else {
            rand_num = key_gens[id]->Fetch(disposable_entries_index[id]);
            disposable_entries_index[id]++;
            is_disposable_entry = true;
            if ((disposable_entries_index[id] %
                 FLAGS_disposable_entries_batch_size) == 0) {
              // Skip the persistent KV entries inserts for now
              disposable_entries_index[id] +=
                  FLAGS_persistent_entries_batch_size;
            }
          }
        } else {
          rand_num = key_gens[id]->Next();
        }
        GenerateKeyFromInt(rand_num, FLAGS_num, &key);
        Slice val;
        if (kNumDispAndPersEntries > 0) {
          random_value = rnd_disposable_entry.RandomString(
              is_disposable_entry ? FLAGS_disposable_entries_value_size
                                  : FLAGS_persistent_entries_value_size);
          val = Slice(random_value);
          num_unique_keys++;
        } else {
          val = gen.Generate();
        }
        if (use_blob_db_) {
#ifndef ROCKSDB_LITE
          // Stacked BlobDB writes go straight to the DB, not into `batch`.
          blob_db::BlobDB* blobdb =
              static_cast<blob_db::BlobDB*>(db_with_cfh->db);
          if (FLAGS_blob_db_max_ttl_range > 0) {
            int ttl = rand() % FLAGS_blob_db_max_ttl_range;
            s = blobdb->PutWithTTL(write_options_, key, val, ttl);
          } else {
            s = blobdb->Put(write_options_, key, val);
          }
#endif  //  ROCKSDB_LITE
        } else if (FLAGS_num_column_families <= 1) {
          batch.Put(key, val);
        } else {
          // We use same rand_num as seed for key and column family so that we
          // can deterministically find the cfh corresponding to a particular
          // key while reading the key.
          batch.Put(db_with_cfh->GetCfh(rand_num), key, val);
        }
        batch_bytes += val.size() + key_size_ + user_timestamp_size_;
        bytes += val.size() + key_size_ + user_timestamp_size_;
        ++num_written;

        // If all disposable entries have been inserted, then we need to
        // add in the job queue a call for 'persistent entry insertions +
        // disposable entry deletions'.
        if (kNumDispAndPersEntries > 0 && is_disposable_entry &&
            ((disposable_entries_index[id] % kNumDispAndPersEntries) == 0)) {
          // Queue contains [timestamp, starting_idx],
          // timestamp = current_time + delay (minimum absolute time when to
          // start inserting the selective deletes) starting_idx = index in the
          // keygen of the rand_num to generate the key of the first KV entry to
          // delete (= key of the first selective delete).
          disposable_entries_q[id].push(std::make_pair(
              FLAGS_env->NowMicros() +
                  FLAGS_disposable_entries_delete_delay /* timestamp */,
              disposable_entries_index[id] - kNumDispAndPersEntries
              /*starting idx*/));
        }
        // Periodically emit a range tombstone (or, with
        // -expand_range_tombstones, the equivalent point deletes).
        if (writes_per_range_tombstone_ > 0 &&
            num_written > writes_before_delete_range_ &&
            (num_written - writes_before_delete_range_) /
                    writes_per_range_tombstone_ <=
                max_num_range_tombstones_ &&
            (num_written - writes_before_delete_range_) %
                    writes_per_range_tombstone_ ==
                0) {
          num_range_deletions++;
          int64_t begin_num = key_gens[id]->Next();
          if (FLAGS_expand_range_tombstones) {
            for (int64_t offset = 0; offset < range_tombstone_width_;
                 ++offset) {
              GenerateKeyFromInt(begin_num + offset, FLAGS_num,
                                 &expanded_keys[offset]);
              if (use_blob_db_) {
#ifndef ROCKSDB_LITE
                // Stacked BlobDB
                s = db_with_cfh->db->Delete(write_options_,
                                            expanded_keys[offset]);
#endif  //  ROCKSDB_LITE
              } else if (FLAGS_num_column_families <= 1) {
                batch.Delete(expanded_keys[offset]);
              } else {
                batch.Delete(db_with_cfh->GetCfh(rand_num),
                             expanded_keys[offset]);
              }
            }
          } else {
            GenerateKeyFromInt(begin_num, FLAGS_num, &begin_key);
            GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num,
                               &end_key);
            if (use_blob_db_) {
#ifndef ROCKSDB_LITE
              // Stacked BlobDB
              s = db_with_cfh->db->DeleteRange(
                  write_options_, db_with_cfh->db->DefaultColumnFamily(),
                  begin_key, end_key);
#endif  //  ROCKSDB_LITE
            } else if (FLAGS_num_column_families <= 1) {
              batch.DeleteRange(begin_key, end_key);
            } else {
              batch.DeleteRange(db_with_cfh->GetCfh(rand_num), begin_key,
                                end_key);
            }
          }
        }
      }
      if (thread->shared->write_rate_limiter.get() != nullptr) {
        thread->shared->write_rate_limiter->Request(
            batch_bytes, Env::IO_HIGH, nullptr /* stats */,
            RateLimiter::OpType::kWrite);
        // Set time at which last op finished to Now() to hide latency and
        // sleep from rate limiter. Also, do the check once per batch, not
        // once per write.
        thread->stats.ResetLastOpTime();
      }
      if (user_timestamp_size_ > 0) {
        Slice user_ts = mock_app_clock_->Allocate(ts_guard.get());
        s = batch.UpdateTimestamps(
            user_ts, [this](uint32_t) { return user_timestamp_size_; });
        if (!s.ok()) {
          fprintf(stderr, "assign timestamp to write batch: %s\n",
                  s.ToString().c_str());
          ErrorExit();
        }
      }
      if (!use_blob_db_) {
        // Not stacked BlobDB: the accumulated batch is committed here.
        s = db_with_cfh->db->Write(write_options_, &batch);
      }
      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db,
                                entries_per_batch_, kWrite);
      if (FLAGS_sine_write_rate) {
        uint64_t now = FLAGS_env->NowMicros();

        uint64_t usecs_since_last;
        if (now > thread->stats.GetSineInterval()) {
          usecs_since_last = now - thread->stats.GetSineInterval();
        } else {
          usecs_since_last = 0;
        }

        // Re-derive the target rate from the sine schedule once per
        // interval and install a fresh rate limiter at that rate.
        if (usecs_since_last >
            (FLAGS_sine_write_rate_interval_milliseconds * uint64_t{1000})) {
          double usecs_since_start =
              static_cast<double>(now - thread->stats.GetStart());
          thread->stats.ResetSineInterval();
          uint64_t write_rate =
              static_cast<uint64_t>(SineRate(usecs_since_start / 1000000.0));
          thread->shared->write_rate_limiter.reset(
              NewGenericRateLimiter(write_rate));
        }
      }
      // Give the recovery listener a chance to turn a transient failure
      // into success before treating the write error as fatal.
      if (!s.ok()) {
        s = listener_->WaitForRecovery(600000000) ? Status::OK() : s;
      }

      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        ErrorExit();
      }
    }
    if ((write_mode == UNIQUE_RANDOM) && (p > 0.0)) {
      fprintf(stdout,
              "Number of unique keys inserted: %" PRIu64
              ".\nNumber of overwrites: %" PRIu64 "\n",
              num_unique_keys, num_overwrites);
    } else if (kNumDispAndPersEntries > 0) {
      fprintf(stdout,
              "Number of unique keys inserted (disposable+persistent): %" PRIu64
              ".\nNumber of 'disposable entry delete': %" PRIu64 "\n",
              num_written, num_selective_deletes);
    }
    if (num_range_deletions > 0) {
      std::cout << "Number of range deletions: " << num_range_deletions
                << std::endl;
    }
    thread->stats.AddBytes(bytes);
  }
5479
5480 Status DoDeterministicCompact(ThreadState* thread,
5481 CompactionStyle compaction_style,
5482 WriteMode write_mode) {
5483#ifndef ROCKSDB_LITE
5484 ColumnFamilyMetaData meta;
5485 std::vector<DB*> db_list;
5486 if (db_.db != nullptr) {
5487 db_list.push_back(db_.db);
5488 } else {
5489 for (auto& db : multi_dbs_) {
5490 db_list.push_back(db.db);
5491 }
5492 }
5493 std::vector<Options> options_list;
5494 for (auto db : db_list) {
5495 options_list.push_back(db->GetOptions());
5496 if (compaction_style != kCompactionStyleFIFO) {
5497 db->SetOptions({{"disable_auto_compactions", "1"},
5498 {"level0_slowdown_writes_trigger", "400000000"},
5499 {"level0_stop_writes_trigger", "400000000"}});
5500 } else {
5501 db->SetOptions({{"disable_auto_compactions", "1"}});
5502 }
5503 }
5504
5505 assert(!db_list.empty());
5506 auto num_db = db_list.size();
5507 size_t num_levels = static_cast<size_t>(open_options_.num_levels);
5508 size_t output_level = open_options_.num_levels - 1;
5509 std::vector<std::vector<std::vector<SstFileMetaData>>> sorted_runs(num_db);
5510 std::vector<size_t> num_files_at_level0(num_db, 0);
5511 if (compaction_style == kCompactionStyleLevel) {
5512 if (num_levels == 0) {
5513 return Status::InvalidArgument("num_levels should be larger than 1");
5514 }
5515 bool should_stop = false;
5516 while (!should_stop) {
5517 if (sorted_runs[0].empty()) {
5518 DoWrite(thread, write_mode);
5519 } else {
5520 DoWrite(thread, UNIQUE_RANDOM);
5521 }
5522 for (size_t i = 0; i < num_db; i++) {
5523 auto db = db_list[i];
5524 db->Flush(FlushOptions());
5525 db->GetColumnFamilyMetaData(&meta);
5526 if (num_files_at_level0[i] == meta.levels[0].files.size() ||
5527 writes_ == 0) {
5528 should_stop = true;
5529 continue;
5530 }
5531 sorted_runs[i].emplace_back(
5532 meta.levels[0].files.begin(),
5533 meta.levels[0].files.end() - num_files_at_level0[i]);
5534 num_files_at_level0[i] = meta.levels[0].files.size();
5535 if (sorted_runs[i].back().size() == 1) {
5536 should_stop = true;
5537 continue;
5538 }
5539 if (sorted_runs[i].size() == output_level) {
5540 auto& L1 = sorted_runs[i].back();
5541 L1.erase(L1.begin(), L1.begin() + L1.size() / 3);
5542 should_stop = true;
5543 continue;
5544 }
5545 }
1e59de90
TL
5546 writes_ /=
5547 static_cast<int64_t>(open_options_.max_bytes_for_level_multiplier);
7c673cae
FG
5548 }
5549 for (size_t i = 0; i < num_db; i++) {
5550 if (sorted_runs[i].size() < num_levels - 1) {
1e59de90
TL
5551 fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
5552 num_levels);
7c673cae
FG
5553 exit(1);
5554 }
5555 }
5556 for (size_t i = 0; i < num_db; i++) {
5557 auto db = db_list[i];
5558 auto compactionOptions = CompactionOptions();
11fdf7f2 5559 compactionOptions.compression = FLAGS_compression_type_e;
7c673cae
FG
5560 auto options = db->GetOptions();
5561 MutableCFOptions mutable_cf_options(options);
5562 for (size_t j = 0; j < sorted_runs[i].size(); j++) {
1e59de90
TL
5563 compactionOptions.output_file_size_limit = MaxFileSizeForLevel(
5564 mutable_cf_options, static_cast<int>(output_level),
5565 compaction_style);
7c673cae 5566 std::cout << sorted_runs[i][j].size() << std::endl;
1e59de90
TL
5567 db->CompactFiles(
5568 compactionOptions,
5569 {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name},
5570 static_cast<int>(output_level - j) /*level*/);
7c673cae
FG
5571 }
5572 }
5573 } else if (compaction_style == kCompactionStyleUniversal) {
5574 auto ratio = open_options_.compaction_options_universal.size_ratio;
5575 bool should_stop = false;
5576 while (!should_stop) {
5577 if (sorted_runs[0].empty()) {
5578 DoWrite(thread, write_mode);
5579 } else {
5580 DoWrite(thread, UNIQUE_RANDOM);
5581 }
5582 for (size_t i = 0; i < num_db; i++) {
5583 auto db = db_list[i];
5584 db->Flush(FlushOptions());
5585 db->GetColumnFamilyMetaData(&meta);
5586 if (num_files_at_level0[i] == meta.levels[0].files.size() ||
5587 writes_ == 0) {
5588 should_stop = true;
5589 continue;
5590 }
5591 sorted_runs[i].emplace_back(
5592 meta.levels[0].files.begin(),
5593 meta.levels[0].files.end() - num_files_at_level0[i]);
5594 num_files_at_level0[i] = meta.levels[0].files.size();
5595 if (sorted_runs[i].back().size() == 1) {
5596 should_stop = true;
5597 continue;
5598 }
5599 num_files_at_level0[i] = meta.levels[0].files.size();
5600 }
1e59de90
TL
5601 writes_ = static_cast<int64_t>(writes_ * static_cast<double>(100) /
5602 (ratio + 200));
7c673cae
FG
5603 }
5604 for (size_t i = 0; i < num_db; i++) {
5605 if (sorted_runs[i].size() < num_levels) {
1e59de90
TL
5606 fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
5607 num_levels);
7c673cae
FG
5608 exit(1);
5609 }
5610 }
5611 for (size_t i = 0; i < num_db; i++) {
5612 auto db = db_list[i];
5613 auto compactionOptions = CompactionOptions();
11fdf7f2 5614 compactionOptions.compression = FLAGS_compression_type_e;
7c673cae
FG
5615 auto options = db->GetOptions();
5616 MutableCFOptions mutable_cf_options(options);
5617 for (size_t j = 0; j < sorted_runs[i].size(); j++) {
1e59de90
TL
5618 compactionOptions.output_file_size_limit = MaxFileSizeForLevel(
5619 mutable_cf_options, static_cast<int>(output_level),
5620 compaction_style);
7c673cae
FG
5621 db->CompactFiles(
5622 compactionOptions,
5623 {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name},
5624 (output_level > j ? static_cast<int>(output_level - j)
5625 : 0) /*level*/);
5626 }
5627 }
5628 } else if (compaction_style == kCompactionStyleFIFO) {
5629 if (num_levels != 1) {
5630 return Status::InvalidArgument(
1e59de90 5631 "num_levels should be 1 for FIFO compaction");
7c673cae
FG
5632 }
5633 if (FLAGS_num_multi_db != 0) {
5634 return Status::InvalidArgument("Doesn't support multiDB");
5635 }
5636 auto db = db_list[0];
5637 std::vector<std::string> file_names;
5638 while (true) {
5639 if (sorted_runs[0].empty()) {
5640 DoWrite(thread, write_mode);
5641 } else {
5642 DoWrite(thread, UNIQUE_RANDOM);
5643 }
5644 db->Flush(FlushOptions());
5645 db->GetColumnFamilyMetaData(&meta);
5646 auto total_size = meta.levels[0].size;
5647 if (total_size >=
1e59de90 5648 db->GetOptions().compaction_options_fifo.max_table_files_size) {
7c673cae
FG
5649 for (auto file_meta : meta.levels[0].files) {
5650 file_names.emplace_back(file_meta.name);
5651 }
5652 break;
5653 }
5654 }
5655 // TODO(shuzhang1989): Investigate why CompactFiles not working
5656 // auto compactionOptions = CompactionOptions();
5657 // db->CompactFiles(compactionOptions, file_names, 0);
5658 auto compactionOptions = CompactRangeOptions();
5659 db->CompactRange(compactionOptions, nullptr, nullptr);
5660 } else {
5661 fprintf(stdout,
5662 "%-12s : skipped (-compaction_stype=kCompactionStyleNone)\n",
5663 "filldeterministic");
5664 return Status::InvalidArgument("None compaction is not supported");
5665 }
5666
5667// Verify seqno and key range
5668// Note: the seqno get changed at the max level by implementation
5669// optimization, so skip the check of the max level.
5670#ifndef NDEBUG
5671 for (size_t k = 0; k < num_db; k++) {
5672 auto db = db_list[k];
5673 db->GetColumnFamilyMetaData(&meta);
5674 // verify the number of sorted runs
5675 if (compaction_style == kCompactionStyleLevel) {
5676 assert(num_levels - 1 == sorted_runs[k].size());
5677 } else if (compaction_style == kCompactionStyleUniversal) {
5678 assert(meta.levels[0].files.size() + num_levels - 1 ==
5679 sorted_runs[k].size());
5680 } else if (compaction_style == kCompactionStyleFIFO) {
5681 // TODO(gzh): FIFO compaction
5682 db->GetColumnFamilyMetaData(&meta);
5683 auto total_size = meta.levels[0].size;
5684 assert(total_size <=
1e59de90
TL
5685 db->GetOptions().compaction_options_fifo.max_table_files_size);
5686 break;
7c673cae
FG
5687 }
5688
5689 // verify smallest/largest seqno and key range of each sorted run
5690 auto max_level = num_levels - 1;
5691 int level;
5692 for (size_t i = 0; i < sorted_runs[k].size(); i++) {
5693 level = static_cast<int>(max_level - i);
5694 SequenceNumber sorted_run_smallest_seqno = kMaxSequenceNumber;
5695 SequenceNumber sorted_run_largest_seqno = 0;
5696 std::string sorted_run_smallest_key, sorted_run_largest_key;
5697 bool first_key = true;
5698 for (auto fileMeta : sorted_runs[k][i]) {
5699 sorted_run_smallest_seqno =
5700 std::min(sorted_run_smallest_seqno, fileMeta.smallest_seqno);
5701 sorted_run_largest_seqno =
5702 std::max(sorted_run_largest_seqno, fileMeta.largest_seqno);
5703 if (first_key ||
5704 db->DefaultColumnFamily()->GetComparator()->Compare(
5705 fileMeta.smallestkey, sorted_run_smallest_key) < 0) {
5706 sorted_run_smallest_key = fileMeta.smallestkey;
5707 }
5708 if (first_key ||
5709 db->DefaultColumnFamily()->GetComparator()->Compare(
5710 fileMeta.largestkey, sorted_run_largest_key) > 0) {
5711 sorted_run_largest_key = fileMeta.largestkey;
5712 }
5713 first_key = false;
5714 }
5715 if (compaction_style == kCompactionStyleLevel ||
5716 (compaction_style == kCompactionStyleUniversal && level > 0)) {
5717 SequenceNumber level_smallest_seqno = kMaxSequenceNumber;
5718 SequenceNumber level_largest_seqno = 0;
5719 for (auto fileMeta : meta.levels[level].files) {
5720 level_smallest_seqno =
5721 std::min(level_smallest_seqno, fileMeta.smallest_seqno);
5722 level_largest_seqno =
5723 std::max(level_largest_seqno, fileMeta.largest_seqno);
5724 }
5725 assert(sorted_run_smallest_key ==
5726 meta.levels[level].files.front().smallestkey);
5727 assert(sorted_run_largest_key ==
5728 meta.levels[level].files.back().largestkey);
5729 if (level != static_cast<int>(max_level)) {
5730 // compaction at max_level would change sequence number
5731 assert(sorted_run_smallest_seqno == level_smallest_seqno);
5732 assert(sorted_run_largest_seqno == level_largest_seqno);
5733 }
5734 } else if (compaction_style == kCompactionStyleUniversal) {
5735 // level <= 0 means sorted runs on level 0
5736 auto level0_file =
5737 meta.levels[0].files[sorted_runs[k].size() - 1 - i];
5738 assert(sorted_run_smallest_key == level0_file.smallestkey);
5739 assert(sorted_run_largest_key == level0_file.largestkey);
5740 if (level != static_cast<int>(max_level)) {
5741 assert(sorted_run_smallest_seqno == level0_file.smallest_seqno);
5742 assert(sorted_run_largest_seqno == level0_file.largest_seqno);
5743 }
5744 }
5745 }
5746 }
5747#endif
5748 // print the size of each sorted_run
5749 for (size_t k = 0; k < num_db; k++) {
5750 auto db = db_list[k];
5751 fprintf(stdout,
1e59de90
TL
5752 "---------------------- DB %" ROCKSDB_PRIszt
5753 " LSM ---------------------\n",
5754 k);
7c673cae
FG
5755 db->GetColumnFamilyMetaData(&meta);
5756 for (auto& levelMeta : meta.levels) {
5757 if (levelMeta.files.empty()) {
5758 continue;
5759 }
5760 if (levelMeta.level == 0) {
5761 for (auto& fileMeta : levelMeta.files) {
1e59de90 5762 fprintf(stdout, "Level[%d]: %s(size: %" PRIi64 " bytes)\n",
7c673cae
FG
5763 levelMeta.level, fileMeta.name.c_str(), fileMeta.size);
5764 }
5765 } else {
5766 fprintf(stdout, "Level[%d]: %s - %s(total size: %" PRIi64 " bytes)\n",
5767 levelMeta.level, levelMeta.files.front().name.c_str(),
5768 levelMeta.files.back().name.c_str(), levelMeta.size);
5769 }
5770 }
5771 }
5772 for (size_t i = 0; i < num_db; i++) {
5773 db_list[i]->SetOptions(
5774 {{"disable_auto_compactions",
5775 std::to_string(options_list[i].disable_auto_compactions)},
5776 {"level0_slowdown_writes_trigger",
5777 std::to_string(options_list[i].level0_slowdown_writes_trigger)},
5778 {"level0_stop_writes_trigger",
5779 std::to_string(options_list[i].level0_stop_writes_trigger)}});
5780 }
5781 return Status::OK();
5782#else
11fdf7f2
TL
5783 (void)thread;
5784 (void)compaction_style;
5785 (void)write_mode;
7c673cae
FG
5786 fprintf(stderr, "Rocksdb Lite doesn't support filldeterministic\n");
5787 return Status::NotSupported(
5788 "Rocksdb Lite doesn't support filldeterministic");
5789#endif // ROCKSDB_LITE
5790 }
5791
5792 void ReadSequential(ThreadState* thread) {
5793 if (db_.db != nullptr) {
5794 ReadSequential(thread, db_.db);
5795 } else {
5796 for (const auto& db_with_cfh : multi_dbs_) {
5797 ReadSequential(thread, db_with_cfh.db);
5798 }
5799 }
5800 }
5801
5802 void ReadSequential(ThreadState* thread, DB* db) {
1e59de90 5803 ReadOptions options = read_options_;
20effc67
TL
5804 std::unique_ptr<char[]> ts_guard;
5805 Slice ts;
5806 if (user_timestamp_size_ > 0) {
5807 ts_guard.reset(new char[user_timestamp_size_]);
5808 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
5809 options.timestamp = &ts;
5810 }
7c673cae 5811
1e59de90
TL
5812 options.adaptive_readahead = FLAGS_adaptive_readahead;
5813 options.async_io = FLAGS_async_io;
5814
7c673cae
FG
5815 Iterator* iter = db->NewIterator(options);
5816 int64_t i = 0;
5817 int64_t bytes = 0;
5818 for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
5819 bytes += iter->key().size() + iter->value().size();
5820 thread->stats.FinishedOps(nullptr, db, 1, kRead);
5821 ++i;
5822
5823 if (thread->shared->read_rate_limiter.get() != nullptr &&
5824 i % 1024 == 1023) {
5825 thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
11fdf7f2
TL
5826 nullptr /* stats */,
5827 RateLimiter::OpType::kRead);
7c673cae
FG
5828 }
5829 }
5830
5831 delete iter;
5832 thread->stats.AddBytes(bytes);
f67539c2
TL
5833 }
5834
5835 void ReadToRowCache(ThreadState* thread) {
5836 int64_t read = 0;
5837 int64_t found = 0;
5838 int64_t bytes = 0;
5839 int64_t key_rand = 0;
f67539c2
TL
5840 std::unique_ptr<const char[]> key_guard;
5841 Slice key = AllocateKey(&key_guard);
5842 PinnableSlice pinnable_val;
5843
5844 while (key_rand < FLAGS_num) {
5845 DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
5846 // We use same key_rand as seed for key and column family so that we can
5847 // deterministically find the cfh corresponding to a particular key, as it
5848 // is done in DoWrite method.
5849 GenerateKeyFromInt(key_rand, FLAGS_num, &key);
5850 key_rand++;
5851 read++;
5852 Status s;
5853 if (FLAGS_num_column_families > 1) {
1e59de90
TL
5854 s = db_with_cfh->db->Get(read_options_, db_with_cfh->GetCfh(key_rand),
5855 key, &pinnable_val);
f67539c2
TL
5856 } else {
5857 pinnable_val.Reset();
1e59de90 5858 s = db_with_cfh->db->Get(read_options_,
f67539c2
TL
5859 db_with_cfh->db->DefaultColumnFamily(), key,
5860 &pinnable_val);
5861 }
5862
5863 if (s.ok()) {
5864 found++;
5865 bytes += key.size() + pinnable_val.size();
5866 } else if (!s.IsNotFound()) {
5867 fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
5868 abort();
5869 }
5870
5871 if (thread->shared->read_rate_limiter.get() != nullptr &&
5872 read % 256 == 255) {
5873 thread->shared->read_rate_limiter->Request(
5874 256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
5875 }
5876
5877 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
5878 }
5879
5880 char msg[100];
5881 snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
5882 read);
5883
5884 thread->stats.AddBytes(bytes);
5885 thread->stats.AddMessage(msg);
7c673cae
FG
5886 }
5887
5888 void ReadReverse(ThreadState* thread) {
5889 if (db_.db != nullptr) {
5890 ReadReverse(thread, db_.db);
5891 } else {
5892 for (const auto& db_with_cfh : multi_dbs_) {
5893 ReadReverse(thread, db_with_cfh.db);
5894 }
5895 }
5896 }
5897
5898 void ReadReverse(ThreadState* thread, DB* db) {
1e59de90 5899 Iterator* iter = db->NewIterator(read_options_);
7c673cae
FG
5900 int64_t i = 0;
5901 int64_t bytes = 0;
5902 for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
5903 bytes += iter->key().size() + iter->value().size();
5904 thread->stats.FinishedOps(nullptr, db, 1, kRead);
5905 ++i;
5906 if (thread->shared->read_rate_limiter.get() != nullptr &&
5907 i % 1024 == 1023) {
5908 thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
11fdf7f2
TL
5909 nullptr /* stats */,
5910 RateLimiter::OpType::kRead);
7c673cae
FG
5911 }
5912 }
5913 delete iter;
5914 thread->stats.AddBytes(bytes);
5915 }
5916
  // Measures raw point-lookup throughput: issues Gets in batches of 100 with
  // keys drawn uniformly from [0, pot) where pot is FLAGS_num rounded up to a
  // power of two, so some looked-up keys intentionally do not exist.
  void ReadRandomFast(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t nonexist = 0;
    ReadOptions options = read_options_;
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::string value;
    Slice ts;
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    DB* db = SelectDBWithCfh(thread)->db;

    // Round FLAGS_num up to a power of two so key generation can use a
    // cheap bit mask instead of a modulo.
    int64_t pot = 1;
    while (pot < FLAGS_num) {
      pot <<= 1;
    }

    Duration duration(FLAGS_duration, reads_);
    do {
      for (int i = 0; i < 100; ++i) {
        int64_t key_rand = thread->rand.Next() & (pot - 1);
        GenerateKeyFromInt(key_rand, FLAGS_num, &key);
        ++read;
        std::string ts_ret;
        std::string* ts_ptr = nullptr;
        if (user_timestamp_size_ > 0) {
          ts = mock_app_clock_->GetTimestampForRead(thread->rand,
                                                    ts_guard.get());
          options.timestamp = &ts;
          ts_ptr = &ts_ret;
        }
        auto status = db->Get(options, key, &value, ts_ptr);
        if (status.ok()) {
          ++found;
        } else if (!status.IsNotFound()) {
          fprintf(stderr, "Get returned an error: %s\n",
                  status.ToString().c_str());
          abort();
        }
        // Keys at or beyond FLAGS_num can never have been written.
        if (key_rand >= FLAGS_num) {
          ++nonexist;
        }
      }
      if (thread->shared->read_rate_limiter.get() != nullptr) {
        thread->shared->read_rate_limiter->Request(
            100, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(nullptr, db, 100, kRead);
    } while (!duration.Done(100));

    char msg[100];
    snprintf(msg, sizeof(msg),
             "(%" PRIu64 " of %" PRIu64
             " found, "
             "issued %" PRIu64 " non-exist keys)\n",
             found, read, nonexist);

    thread->stats.AddMessage(msg);
  }
5980
  // Draws a random key id in [0, FLAGS_num). With read_random_exp_range_ == 0
  // the distribution is uniform; otherwise ids are skewed exponentially and
  // then scrambled with a multiplicative constant to break up locality.
  int64_t GetRandomKey(Random64* rand) {
    uint64_t rand_int = rand->Next();
    int64_t key_rand;
    if (read_random_exp_range_ == 0) {
      key_rand = rand_int % FLAGS_num;
    } else {
      const uint64_t kBigInt = static_cast<uint64_t>(1U) << 62;
      // Map the raw 62 random bits onto [-read_random_exp_range_, 0] so that
      // exp(order) lands in (0, 1].
      long double order = -static_cast<long double>(rand_int % kBigInt) /
                          static_cast<long double>(kBigInt) *
                          read_random_exp_range_;
      long double exp_ran = std::exp(order);
      uint64_t rand_num =
          static_cast<int64_t>(exp_ran * static_cast<long double>(FLAGS_num));
      // Map to a different number to avoid locality.
      const uint64_t kBigPrime = 0x5bd1e995;
      // Overflow is like %(2^64). Will have little impact of results.
      key_rand = static_cast<int64_t>((rand_num * kBigPrime) % FLAGS_num);
    }
    return key_rand;
  }
6001
  // Random point-lookup benchmark. Keys come from GetRandomKey() (optionally
  // strided when FLAGS_multiread_stride is set); each lookup is either a Get
  // or, when read_operands_ is set, a GetMergeOperands. Runs for
  // FLAGS_duration seconds or reads_ operations.
  void ReadRandom(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t bytes = 0;
    int num_keys = 0;
    int64_t key_rand = 0;
    ReadOptions options = read_options_;
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    PinnableSlice pinnable_val;
    std::vector<PinnableSlice> pinnable_vals;
    if (read_operands_) {
      // Start off with a small-ish value that'll be increased later if
      // `GetMergeOperands()` tells us it is not large enough.
      pinnable_vals.resize(8);
    }
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(1)) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      // We use same key_rand as seed for key and column family so that we can
      // deterministically find the cfh corresponding to a particular key, as it
      // is done in DoWrite method.
      if (entries_per_batch_ > 1 && FLAGS_multiread_stride) {
        // Strided mode: pick a fresh base every entries_per_batch_ reads and
        // step by FLAGS_multiread_stride in between, clamping the base so the
        // whole stride fits inside [0, FLAGS_num).
        if (++num_keys == entries_per_batch_) {
          num_keys = 0;
          key_rand = GetRandomKey(&thread->rand);
          if ((key_rand + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
              FLAGS_num) {
            key_rand = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
          }
        } else {
          key_rand += FLAGS_multiread_stride;
        }
      } else {
        key_rand = GetRandomKey(&thread->rand);
      }
      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
      read++;
      std::string ts_ret;
      std::string* ts_ptr = nullptr;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
        options.timestamp = &ts;
        ts_ptr = &ts_ret;
      }
      Status s;
      pinnable_val.Reset();
      for (size_t i = 0; i < pinnable_vals.size(); ++i) {
        pinnable_vals[i].Reset();
      }
      ColumnFamilyHandle* cfh;
      if (FLAGS_num_column_families > 1) {
        cfh = db_with_cfh->GetCfh(key_rand);
      } else {
        cfh = db_with_cfh->db->DefaultColumnFamily();
      }
      if (read_operands_) {
        GetMergeOperandsOptions get_merge_operands_options;
        get_merge_operands_options.expected_max_number_of_operands =
            static_cast<int>(pinnable_vals.size());
        int number_of_operands;
        s = db_with_cfh->db->GetMergeOperands(
            options, cfh, key, pinnable_vals.data(),
            &get_merge_operands_options, &number_of_operands);
        if (s.IsIncomplete()) {
          // Should only happen a few times when we encounter a key that had
          // more merge operands than any key seen so far. Production use case
          // would typically retry in such event to get all the operands so do
          // that here.
          pinnable_vals.resize(number_of_operands);
          get_merge_operands_options.expected_max_number_of_operands =
              static_cast<int>(pinnable_vals.size());
          s = db_with_cfh->db->GetMergeOperands(
              options, cfh, key, pinnable_vals.data(),
              &get_merge_operands_options, &number_of_operands);
        }
      } else {
        s = db_with_cfh->db->Get(options, cfh, key, &pinnable_val, ts_ptr);
      }

      if (s.ok()) {
        found++;
        bytes += key.size() + pinnable_val.size() + user_timestamp_size_;
        // In operand mode the result lives in pinnable_vals instead.
        for (size_t i = 0; i < pinnable_vals.size(); ++i) {
          bytes += pinnable_vals[i].size();
          pinnable_vals[i].Reset();
        }
      } else if (!s.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
        abort();
      }

      // Request limiter credit in 256-op chunks to amortize limiter overhead.
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          read % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
             read);

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
6116
  // Calls MultiGet over a list of keys from a random distribution.
  // Returns the total number of keys found.
  // Two modes: the vector-based MultiGet overload (default) or the batched
  // C-array overload when FLAGS_multiread_batched is set.
  void MultiReadRandom(ThreadState* thread) {
    int64_t read = 0;
    int64_t bytes = 0;
    int64_t num_multireads = 0;
    int64_t found = 0;
    ReadOptions options = read_options_;
    std::vector<Slice> keys;
    std::vector<std::unique_ptr<const char[]>> key_guards;
    std::vector<std::string> values(entries_per_batch_);
    PinnableSlice* pin_values = new PinnableSlice[entries_per_batch_];
    std::unique_ptr<PinnableSlice[]> pin_values_guard(pin_values);
    std::vector<Status> stat_list(entries_per_batch_);
    while (static_cast<int64_t>(keys.size()) < entries_per_batch_) {
      key_guards.push_back(std::unique_ptr<const char[]>());
      keys.push_back(AllocateKey(&key_guards.back()));
    }

    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(entries_per_batch_)) {
      DB* db = SelectDB(thread);
      if (FLAGS_multiread_stride) {
        // Strided batch: consecutive keys within a batch are
        // FLAGS_multiread_stride apart, clamped so the batch fits in range.
        int64_t key = GetRandomKey(&thread->rand);
        if ((key + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
            static_cast<int64_t>(FLAGS_num)) {
          key = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
        }
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          GenerateKeyFromInt(key, FLAGS_num, &keys[i]);
          key += FLAGS_multiread_stride;
        }
      } else {
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          GenerateKeyFromInt(GetRandomKey(&thread->rand), FLAGS_num, &keys[i]);
        }
      }
      Slice ts;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
        options.timestamp = &ts;
      }
      if (!FLAGS_multiread_batched) {
        std::vector<Status> statuses = db->MultiGet(options, keys, &values);
        assert(static_cast<int64_t>(statuses.size()) == entries_per_batch_);

        read += entries_per_batch_;
        num_multireads++;
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          if (statuses[i].ok()) {
            bytes += keys[i].size() + values[i].size() + user_timestamp_size_;
            ++found;
          } else if (!statuses[i].IsNotFound()) {
            fprintf(stderr, "MultiGet returned an error: %s\n",
                    statuses[i].ToString().c_str());
            abort();
          }
        }
      } else {
        db->MultiGet(options, db->DefaultColumnFamily(), keys.size(),
                     keys.data(), pin_values, stat_list.data());

        read += entries_per_batch_;
        num_multireads++;
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          if (stat_list[i].ok()) {
            bytes +=
                keys[i].size() + pin_values[i].size() + user_timestamp_size_;
            ++found;
          } else if (!stat_list[i].IsNotFound()) {
            fprintf(stderr, "MultiGet returned an error: %s\n",
                    stat_list[i].ToString().c_str());
            abort();
          }
          // Reset status/pin for reuse by the next batch.
          stat_list[i] = Status::OK();
          pin_values[i].Reset();
        }
      }
      // Request limiter credit every 256 batches to amortize limiter overhead.
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          num_multireads % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256 * entries_per_batch_, Env::IO_HIGH, nullptr /* stats */,
            RateLimiter::OpType::kRead);
      }
      thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kRead);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found,
             read);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
6215
  // Calls ApproximateSize over random key ranges.
  // Each iteration builds entries_per_batch_ ranges with random endpoints
  // (swapped so start <= end) and reports the average approximate size.
  void ApproximateSizeRandom(ThreadState* thread) {
    int64_t size_sum = 0;
    int64_t num_sizes = 0;
    const size_t batch_size = entries_per_batch_;
    std::vector<Range> ranges;
    std::vector<Slice> lkeys;
    std::vector<std::unique_ptr<const char[]>> lkey_guards;
    std::vector<Slice> rkeys;
    std::vector<std::unique_ptr<const char[]>> rkey_guards;
    std::vector<uint64_t> sizes;
    while (ranges.size() < batch_size) {
      // Ugly without C++17 return from emplace_back
      lkey_guards.emplace_back();
      rkey_guards.emplace_back();
      lkeys.emplace_back(AllocateKey(&lkey_guards.back()));
      rkeys.emplace_back(AllocateKey(&rkey_guards.back()));
      ranges.emplace_back(lkeys.back(), rkeys.back());
      sizes.push_back(0);
    }
    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      for (size_t i = 0; i < batch_size; ++i) {
        int64_t lkey = GetRandomKey(&thread->rand);
        int64_t rkey = GetRandomKey(&thread->rand);
        // Ensure each range is well-formed (start key <= limit key).
        if (lkey > rkey) {
          std::swap(lkey, rkey);
        }
        GenerateKeyFromInt(lkey, FLAGS_num, &lkeys[i]);
        GenerateKeyFromInt(rkey, FLAGS_num, &rkeys[i]);
      }
      db->GetApproximateSizes(&ranges[0], static_cast<int>(entries_per_batch_),
                              &sizes[0]);
      num_sizes += entries_per_batch_;
      for (int64_t size : sizes) {
        size_sum += size;
      }
      thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kOthers);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(Avg approx size=%g)",
             static_cast<double>(size_sum) / static_cast<double>(num_sizes));
    thread->stats.AddMessage(msg);
  }
6262
f67539c2 6263 // The inverse function of Pareto distribution
494da23a
TL
6264 int64_t ParetoCdfInversion(double u, double theta, double k, double sigma) {
6265 double ret;
6266 if (k == 0.0) {
6267 ret = theta - sigma * std::log(u);
6268 } else {
6269 ret = theta + sigma * (std::pow(u, -1 * k) - 1) / k;
6270 }
6271 return static_cast<int64_t>(ceil(ret));
6272 }
f67539c2 6273 // The inverse function of power distribution (y=ax^b)
494da23a
TL
6274 int64_t PowerCdfInversion(double u, double a, double b) {
6275 double ret;
6276 ret = std::pow((u / a), (1 / b));
6277 return static_cast<int64_t>(ceil(ret));
6278 }
6279
6280 // Add the noice to the QPS
6281 double AddNoise(double origin, double noise_ratio) {
6282 if (noise_ratio < 0.0 || noise_ratio > 1.0) {
6283 return origin;
6284 }
6285 int band_int = static_cast<int>(FLAGS_sine_a);
6286 double delta = (rand() % band_int - band_int / 2) * noise_ratio;
6287 if (origin + delta < 0) {
6288 return origin;
6289 } else {
6290 return (origin + delta);
6291 }
6292 }
6293
f67539c2 6294 // Decide the ratio of different query types
494da23a
TL
6295 // 0 Get, 1 Put, 2 Seek, 3 SeekForPrev, 4 Delete, 5 SingleDelete, 6 merge
6296 class QueryDecider {
6297 public:
6298 std::vector<int> type_;
6299 std::vector<double> ratio_;
6300 int range_;
6301
6302 QueryDecider() {}
6303 ~QueryDecider() {}
6304
6305 Status Initiate(std::vector<double> ratio_input) {
6306 int range_max = 1000;
6307 double sum = 0.0;
6308 for (auto& ratio : ratio_input) {
6309 sum += ratio;
6310 }
6311 range_ = 0;
6312 for (auto& ratio : ratio_input) {
6313 range_ += static_cast<int>(ceil(range_max * (ratio / sum)));
6314 type_.push_back(range_);
6315 ratio_.push_back(ratio / sum);
6316 }
6317 return Status::OK();
6318 }
6319
6320 int GetType(int64_t rand_num) {
6321 if (rand_num < 0) {
6322 rand_num = rand_num * (-1);
6323 }
6324 assert(range_ != 0);
6325 int pos = static_cast<int>(rand_num % range_);
6326 for (int i = 0; i < static_cast<int>(type_.size()); i++) {
6327 if (pos < type_[i]) {
6328 return i;
6329 }
6330 }
6331 return 0;
6332 }
6333 };
6334
  // KeyrangeUnit is the struct of a keyrange. It is used in a keyrange vector
  // to transfer a random value to one keyrange based on the hotness.
  struct KeyrangeUnit {
    // Start offset of this key-range's share within the amplified
    // [0, keyrange_rand_max_) selection space.
    int64_t keyrange_start;
    // Size of this key-range's share; proportional to its access hotness.
    int64_t keyrange_access;
    // Number of keys contained in this key-range.
    int64_t keyrange_keys;
  };
6342
  // From our observations, the prefix hotness (key-range hotness) follows
  // the two-term-exponential distribution: f(x) = a*exp(b*x) + c*exp(d*x).
  // However, we cannot directly use the inverse function to decide a
  // key-range from a random distribution. To achieve it, we create a list of
  // KeyrangeUnit, each KeyrangeUnit occupies a range of integers whose size is
  // decided based on the hotness of the key-range. When a random value is
  // generated based on uniform distribution, we map it to the KeyrangeUnit Vec
  // and one KeyrangeUnit is selected. The probability of a KeyrangeUnit being
  // selected is the same as the hotness of this KeyrangeUnit. After that, the
  // key can be randomly allocated to the key-range of this KeyrangeUnit, or we
  // can based on the power distribution (y=ax^b) to generate the offset of
  // the key in the selected key-range. In this way, we generate the keyID
  // based on the hotness of the prefix and also the key hotness distribution.
  class GenerateTwoTermExpKeys {
   public:
    // Avoid uninitialized warning-as-error in some compilers
    int64_t keyrange_rand_max_ = 0;  // total span of the selection space
    int64_t keyrange_size_ = 0;      // number of keys per key-range
    int64_t keyrange_num_ = 0;       // number of key-ranges
    std::vector<KeyrangeUnit> keyrange_set_;

    // Initiate the KeyrangeUnit vector and calculate the size of each
    // KeyrangeUnit.
    Status InitiateExpDistribution(int64_t total_keys, double prefix_a,
                                   double prefix_b, double prefix_c,
                                   double prefix_d) {
      int64_t amplify = 0;
      int64_t keyrange_start = 0;
      if (FLAGS_keyrange_num <= 0) {
        keyrange_num_ = 1;
      } else {
        keyrange_num_ = FLAGS_keyrange_num;
      }
      keyrange_size_ = total_keys / keyrange_num_;

      // Calculate the key-range shares size based on the input parameters
      for (int64_t pfx = keyrange_num_; pfx >= 1; pfx--) {
        // Step 1. Calculate the probability that this key range will be
        // accessed in a query. It is based on the two-term expoential
        // distribution
        double keyrange_p = prefix_a * std::exp(prefix_b * pfx) +
                            prefix_c * std::exp(prefix_d * pfx);
        // Clamp vanishingly small probabilities to exactly zero.
        if (keyrange_p < std::pow(10.0, -16.0)) {
          keyrange_p = 0.0;
        }
        // Step 2. Calculate the amplify
        // In order to allocate a query to a key-range based on the random
        // number generated for this query, we need to extend the probability
        // of each key range from [0,1] to [0, amplify]. Amplify is calculated
        // by 1/(smallest key-range probability). In this way, we ensure that
        // all key-ranges are assigned with an Integer that >=0
        if (amplify == 0 && keyrange_p > 0) {
          amplify = static_cast<int64_t>(std::floor(1 / keyrange_p)) + 1;
        }

        // Step 3. For each key-range, we calculate its position in the
        // [0, amplify] range, including the start, the size (keyrange_access)
        KeyrangeUnit p_unit;
        p_unit.keyrange_start = keyrange_start;
        if (0.0 >= keyrange_p) {
          p_unit.keyrange_access = 0;
        } else {
          p_unit.keyrange_access =
              static_cast<int64_t>(std::floor(amplify * keyrange_p));
        }
        p_unit.keyrange_keys = keyrange_size_;
        keyrange_set_.push_back(p_unit);
        keyrange_start += p_unit.keyrange_access;
      }
      keyrange_rand_max_ = keyrange_start;

      // Step 4. Shuffle the key-ranges randomly
      // Since the access probability is calculated from small to large,
      // If we do not re-allocate them, hot key-ranges are always at the end
      // and cold key-ranges are at the begin of the key space. Therefore, the
      // key-ranges are shuffled and the rand seed is only decide by the
      // key-range hotness distribution. With the same distribution parameters
      // the shuffle results are the same.
      Random64 rand_loca(keyrange_rand_max_);
      for (int64_t i = 0; i < FLAGS_keyrange_num; i++) {
        int64_t pos = rand_loca.Next() % FLAGS_keyrange_num;
        assert(i >= 0 && i < static_cast<int64_t>(keyrange_set_.size()) &&
               pos >= 0 && pos < static_cast<int64_t>(keyrange_set_.size()));
        std::swap(keyrange_set_[i], keyrange_set_[pos]);
      }

      // Step 5. Recalculate the prefix start postion after shuffling
      int64_t offset = 0;
      for (auto& p_unit : keyrange_set_) {
        p_unit.keyrange_start = offset;
        offset += p_unit.keyrange_access;
      }

      return Status::OK();
    }

    // Generate the Key ID according to the input ini_rand and key distribution
    int64_t DistGetKeyID(int64_t ini_rand, double key_dist_a,
                         double key_dist_b) {
      int64_t keyrange_rand = ini_rand % keyrange_rand_max_;

      // Calculate and select one key-range that contains the new key
      // (binary search on the cumulative keyrange_start offsets).
      int64_t start = 0, end = static_cast<int64_t>(keyrange_set_.size());
      while (start + 1 < end) {
        int64_t mid = start + (end - start) / 2;
        assert(mid >= 0 && mid < static_cast<int64_t>(keyrange_set_.size()));
        if (keyrange_rand < keyrange_set_[mid].keyrange_start) {
          end = mid;
        } else {
          start = mid;
        }
      }
      int64_t keyrange_id = start;

      // Select one key in the key-range and compose the keyID
      int64_t key_offset = 0, key_seed;
      if (key_dist_a == 0.0 || key_dist_b == 0.0) {
        // No key-hotness model: pick the offset uniformly.
        key_offset = ini_rand % keyrange_size_;
      } else {
        // Power-distribution model: derive a seed from the inverse power
        // function, then draw the offset from a PRNG seeded with it.
        double u =
            static_cast<double>(ini_rand % keyrange_size_) / keyrange_size_;
        key_seed = static_cast<int64_t>(
            ceil(std::pow((u / key_dist_a), (1 / key_dist_b))));
        Random64 rand_key(key_seed);
        key_offset = rand_key.Next() % keyrange_size_;
      }
      return keyrange_size_ * keyrange_id + key_offset;
    }
  };
6472
1e59de90 6473 // The social graph workload mixed with Get, Put, Iterator queries.
f67539c2
TL
6474 // The value size and iterator length follow Pareto distribution.
6475 // The overall key access follow power distribution. If user models the
6476 // workload based on different key-ranges (or different prefixes), user
6477 // can use two-term-exponential distribution to fit the workload. User
1e59de90 6478 // needs to decide the ratio between Get, Put, Iterator queries before
f67539c2 6479 // starting the benchmark.
494da23a 6480 void MixGraph(ThreadState* thread) {
494da23a
TL
6481 int64_t gets = 0;
6482 int64_t puts = 0;
1e59de90 6483 int64_t get_found = 0;
494da23a
TL
6484 int64_t seek = 0;
6485 int64_t seek_found = 0;
6486 int64_t bytes = 0;
1e59de90
TL
6487 double total_scan_length = 0;
6488 double total_val_size = 0;
494da23a
TL
6489 const int64_t default_value_max = 1 * 1024 * 1024;
6490 int64_t value_max = default_value_max;
6491 int64_t scan_len_max = FLAGS_mix_max_scan_len;
6492 double write_rate = 1000000.0;
6493 double read_rate = 1000000.0;
f67539c2 6494 bool use_prefix_modeling = false;
20effc67 6495 bool use_random_modeling = false;
f67539c2 6496 GenerateTwoTermExpKeys gen_exp;
494da23a
TL
6497 std::vector<double> ratio{FLAGS_mix_get_ratio, FLAGS_mix_put_ratio,
6498 FLAGS_mix_seek_ratio};
6499 char value_buffer[default_value_max];
6500 QueryDecider query;
6501 RandomGenerator gen;
6502 Status s;
6503 if (value_max > FLAGS_mix_max_value_size) {
6504 value_max = FLAGS_mix_max_value_size;
6505 }
6506
494da23a
TL
6507 std::unique_ptr<const char[]> key_guard;
6508 Slice key = AllocateKey(&key_guard);
6509 PinnableSlice pinnable_val;
6510 query.Initiate(ratio);
6511
6512 // the limit of qps initiation
1e59de90
TL
6513 if (FLAGS_sine_mix_rate) {
6514 thread->shared->read_rate_limiter.reset(
6515 NewGenericRateLimiter(static_cast<int64_t>(read_rate)));
494da23a 6516 thread->shared->write_rate_limiter.reset(
f67539c2
TL
6517 NewGenericRateLimiter(static_cast<int64_t>(write_rate)));
6518 }
6519
6520 // Decide if user wants to use prefix based key generation
6521 if (FLAGS_keyrange_dist_a != 0.0 || FLAGS_keyrange_dist_b != 0.0 ||
6522 FLAGS_keyrange_dist_c != 0.0 || FLAGS_keyrange_dist_d != 0.0) {
6523 use_prefix_modeling = true;
6524 gen_exp.InitiateExpDistribution(
6525 FLAGS_num, FLAGS_keyrange_dist_a, FLAGS_keyrange_dist_b,
6526 FLAGS_keyrange_dist_c, FLAGS_keyrange_dist_d);
494da23a 6527 }
20effc67
TL
6528 if (FLAGS_key_dist_a == 0 || FLAGS_key_dist_b == 0) {
6529 use_random_modeling = true;
6530 }
494da23a
TL
6531
6532 Duration duration(FLAGS_duration, reads_);
6533 while (!duration.Done(1)) {
6534 DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
f67539c2
TL
6535 int64_t ini_rand, rand_v, key_rand, key_seed;
6536 ini_rand = GetRandomKey(&thread->rand);
6537 rand_v = ini_rand % FLAGS_num;
494da23a 6538 double u = static_cast<double>(rand_v) / FLAGS_num;
f67539c2
TL
6539
6540 // Generate the keyID based on the key hotness and prefix hotness
20effc67
TL
6541 if (use_random_modeling) {
6542 key_rand = ini_rand;
6543 } else if (use_prefix_modeling) {
f67539c2
TL
6544 key_rand =
6545 gen_exp.DistGetKeyID(ini_rand, FLAGS_key_dist_a, FLAGS_key_dist_b);
6546 } else {
6547 key_seed = PowerCdfInversion(u, FLAGS_key_dist_a, FLAGS_key_dist_b);
6548 Random64 rand(key_seed);
6549 key_rand = static_cast<int64_t>(rand.Next()) % FLAGS_num;
6550 }
494da23a
TL
6551 GenerateKeyFromInt(key_rand, FLAGS_num, &key);
6552 int query_type = query.GetType(rand_v);
6553
6554 // change the qps
6555 uint64_t now = FLAGS_env->NowMicros();
6556 uint64_t usecs_since_last;
6557 if (now > thread->stats.GetSineInterval()) {
6558 usecs_since_last = now - thread->stats.GetSineInterval();
6559 } else {
6560 usecs_since_last = 0;
6561 }
6562
1e59de90
TL
6563 if (FLAGS_sine_mix_rate &&
6564 usecs_since_last >
6565 (FLAGS_sine_mix_rate_interval_milliseconds * uint64_t{1000})) {
494da23a
TL
6566 double usecs_since_start =
6567 static_cast<double>(now - thread->stats.GetStart());
6568 thread->stats.ResetSineInterval();
6569 double mix_rate_with_noise = AddNoise(
6570 SineRate(usecs_since_start / 1000000.0), FLAGS_sine_mix_rate_noise);
6571 read_rate = mix_rate_with_noise * (query.ratio_[0] + query.ratio_[2]);
1e59de90 6572 write_rate = mix_rate_with_noise * query.ratio_[1];
494da23a 6573
1e59de90
TL
6574 if (read_rate > 0) {
6575 thread->shared->read_rate_limiter->SetBytesPerSecond(
6576 static_cast<int64_t>(read_rate));
6577 }
6578 if (write_rate > 0) {
6579 thread->shared->write_rate_limiter->SetBytesPerSecond(
6580 static_cast<int64_t>(write_rate));
6581 }
494da23a
TL
6582 }
6583 // Start the query
6584 if (query_type == 0) {
6585 // the Get query
6586 gets++;
494da23a 6587 if (FLAGS_num_column_families > 1) {
1e59de90
TL
6588 s = db_with_cfh->db->Get(read_options_, db_with_cfh->GetCfh(key_rand),
6589 key, &pinnable_val);
494da23a
TL
6590 } else {
6591 pinnable_val.Reset();
1e59de90 6592 s = db_with_cfh->db->Get(read_options_,
494da23a
TL
6593 db_with_cfh->db->DefaultColumnFamily(), key,
6594 &pinnable_val);
6595 }
6596
6597 if (s.ok()) {
1e59de90 6598 get_found++;
494da23a
TL
6599 bytes += key.size() + pinnable_val.size();
6600 } else if (!s.IsNotFound()) {
6601 fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
6602 abort();
6603 }
6604
1e59de90
TL
6605 if (thread->shared->read_rate_limiter && (gets + seek) % 100 == 0) {
6606 thread->shared->read_rate_limiter->Request(100, Env::IO_HIGH,
6607 nullptr /*stats*/);
494da23a
TL
6608 }
6609 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
6610 } else if (query_type == 1) {
6611 // the Put query
6612 puts++;
1e59de90
TL
6613 int64_t val_size = ParetoCdfInversion(u, FLAGS_value_theta,
6614 FLAGS_value_k, FLAGS_value_sigma);
6615 if (val_size < 10) {
f67539c2
TL
6616 val_size = 10;
6617 } else if (val_size > value_max) {
6618 val_size = val_size % value_max;
494da23a 6619 }
1e59de90
TL
6620 total_val_size += val_size;
6621
494da23a
TL
6622 s = db_with_cfh->db->Put(
6623 write_options_, key,
f67539c2 6624 gen.Generate(static_cast<unsigned int>(val_size)));
494da23a
TL
6625 if (!s.ok()) {
6626 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
20effc67 6627 ErrorExit();
494da23a
TL
6628 }
6629
1e59de90
TL
6630 if (thread->shared->write_rate_limiter && puts % 100 == 0) {
6631 thread->shared->write_rate_limiter->Request(100, Env::IO_HIGH,
6632 nullptr /*stats*/);
494da23a
TL
6633 }
6634 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kWrite);
6635 } else if (query_type == 2) {
6636 // Seek query
6637 if (db_with_cfh->db != nullptr) {
6638 Iterator* single_iter = nullptr;
1e59de90 6639 single_iter = db_with_cfh->db->NewIterator(read_options_);
494da23a
TL
6640 if (single_iter != nullptr) {
6641 single_iter->Seek(key);
6642 seek++;
494da23a
TL
6643 if (single_iter->Valid() && single_iter->key().compare(key) == 0) {
6644 seek_found++;
6645 }
6646 int64_t scan_length =
6647 ParetoCdfInversion(u, FLAGS_iter_theta, FLAGS_iter_k,
6648 FLAGS_iter_sigma) %
6649 scan_len_max;
6650 for (int64_t j = 0; j < scan_length && single_iter->Valid(); j++) {
6651 Slice value = single_iter->value();
6652 memcpy(value_buffer, value.data(),
6653 std::min(value.size(), sizeof(value_buffer)));
6654 bytes += single_iter->key().size() + single_iter->value().size();
6655 single_iter->Next();
6656 assert(single_iter->status().ok());
1e59de90 6657 total_scan_length++;
494da23a
TL
6658 }
6659 }
6660 delete single_iter;
6661 }
6662 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kSeek);
6663 }
6664 }
6665 char msg[256];
6666 snprintf(msg, sizeof(msg),
1e59de90
TL
6667 "( Gets:%" PRIu64 " Puts:%" PRIu64 " Seek:%" PRIu64
6668 ", reads %" PRIu64 " in %" PRIu64
6669 " found, "
6670 "avg size: %.1f value, %.1f scan)\n",
6671 gets, puts, seek, get_found + seek_found, gets + seek,
6672 total_val_size / puts, total_scan_length / seek);
494da23a
TL
6673
6674 thread->stats.AddBytes(bytes);
6675 thread->stats.AddMessage(msg);
494da23a
TL
6676 }
6677
7c673cae
FG
6678 void IteratorCreation(ThreadState* thread) {
6679 Duration duration(FLAGS_duration, reads_);
1e59de90 6680 ReadOptions options = read_options_;
20effc67
TL
6681 std::unique_ptr<char[]> ts_guard;
6682 if (user_timestamp_size_ > 0) {
6683 ts_guard.reset(new char[user_timestamp_size_]);
6684 }
7c673cae
FG
6685 while (!duration.Done(1)) {
6686 DB* db = SelectDB(thread);
20effc67
TL
6687 Slice ts;
6688 if (user_timestamp_size_ > 0) {
6689 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
6690 options.timestamp = &ts;
6691 }
7c673cae
FG
6692 Iterator* iter = db->NewIterator(options);
6693 delete iter;
6694 thread->stats.FinishedOps(nullptr, db, 1, kOthers);
6695 }
6696 }
6697
6698 void IteratorCreationWhileWriting(ThreadState* thread) {
6699 if (thread->tid > 0) {
6700 IteratorCreation(thread);
6701 } else {
6702 BGWriter(thread, kWrite);
6703 }
6704 }
6705
  // Benchmarks Seek() to random keys followed by FLAGS_seek_nexts Next()
  // (or Prev() with --reverse_iterator) calls. Optionally uses tailing
  // iterators, per-seek scan-distance bounds, or auto-prefix-mode upper
  // bounds. Reports "found" when the seek lands exactly on the target key.
  void SeekRandom(ThreadState* thread) {
    int64_t read = 0;   // seeks issued
    int64_t found = 0;  // seeks that hit the exact target key
    int64_t bytes = 0;  // key+value bytes touched while scanning
    ReadOptions options = read_options_;
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
      options.timestamp = &ts;
    }

    // Tailing iterators are created once up front (one per DB) and reused
    // for every seek; they are deleted at the end of the run.
    std::vector<Iterator*> tailing_iters;
    if (FLAGS_use_tailing_iterator) {
      if (db_.db != nullptr) {
        tailing_iters.push_back(db_.db->NewIterator(options));
      } else {
        for (const auto& db_with_cfh : multi_dbs_) {
          tailing_iters.push_back(db_with_cfh.db->NewIterator(options));
        }
      }
    }
    // Set after the tailing iterators are built, so auto_prefix_mode only
    // affects the per-loop (non-tailing) iterators below.
    options.auto_prefix_mode = FLAGS_auto_prefix_mode;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    // Reusable buffers for per-seek iterate bounds.
    std::unique_ptr<const char[]> upper_bound_key_guard;
    Slice upper_bound = AllocateKey(&upper_bound_key_guard);
    std::unique_ptr<const char[]> lower_bound_key_guard;
    Slice lower_bound = AllocateKey(&lower_bound_key_guard);

    Duration duration(FLAGS_duration, reads_);
    char value_buffer[256];
    while (!duration.Done(1)) {
      int64_t seek_pos = thread->rand.Next() % FLAGS_num;
      GenerateKeyFromIntForSeek(static_cast<uint64_t>(seek_pos), FLAGS_num,
                                &key);
      if (FLAGS_max_scan_distance != 0) {
        // Bound the scan to at most max_scan_distance keys from seek_pos,
        // on the side the iteration will move toward.
        if (FLAGS_reverse_iterator) {
          GenerateKeyFromInt(
              static_cast<uint64_t>(std::max(
                  static_cast<int64_t>(0), seek_pos - FLAGS_max_scan_distance)),
              FLAGS_num, &lower_bound);
          options.iterate_lower_bound = &lower_bound;
        } else {
          auto min_num =
              std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance);
          GenerateKeyFromInt(static_cast<uint64_t>(min_num), FLAGS_num,
                             &upper_bound);
          options.iterate_upper_bound = &upper_bound;
        }
      } else if (FLAGS_auto_prefix_mode && prefix_extractor_ &&
                 !FLAGS_reverse_iterator) {
        // Set upper bound to next prefix
        // (rewrites the upper_bound buffer in place; the Slice is then
        // narrowed to prefix_size_ bytes).
        auto mutable_upper_bound = const_cast<char*>(upper_bound.data());
        std::memcpy(mutable_upper_bound, key.data(), prefix_size_);
        mutable_upper_bound[prefix_size_ - 1]++;
        upper_bound = Slice(upper_bound.data(), prefix_size_);
        options.iterate_upper_bound = &upper_bound;
      }

      // Pick a Iterator to use
      uint64_t db_idx_to_use =
          (db_.db == nullptr)
              ? (uint64_t{thread->rand.Next()} % multi_dbs_.size())
              : 0;
      std::unique_ptr<Iterator> single_iter;
      Iterator* iter_to_use;
      if (FLAGS_use_tailing_iterator) {
        iter_to_use = tailing_iters[db_idx_to_use];
      } else {
        if (db_.db != nullptr) {
          single_iter.reset(db_.db->NewIterator(options));
        } else {
          single_iter.reset(multi_dbs_[db_idx_to_use].db->NewIterator(options));
        }
        iter_to_use = single_iter.get();
      }

      iter_to_use->Seek(key);
      read++;
      if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) {
        found++;
      }

      for (int j = 0; j < FLAGS_seek_nexts && iter_to_use->Valid(); ++j) {
        // Copy out iterator's value to make sure we read them.
        Slice value = iter_to_use->value();
        memcpy(value_buffer, value.data(),
               std::min(value.size(), sizeof(value_buffer)));
        bytes += iter_to_use->key().size() + iter_to_use->value().size();

        if (!FLAGS_reverse_iterator) {
          iter_to_use->Next();
        } else {
          iter_to_use->Prev();
        }
        assert(iter_to_use->status().ok());
      }

      // Request the limiter in 256-op batches to amortize its overhead.
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          read % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
    }
    for (auto iter : tailing_iters) {
      delete iter;
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
             read);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
6826
6827 void SeekRandomWhileWriting(ThreadState* thread) {
6828 if (thread->tid > 0) {
6829 SeekRandom(thread);
6830 } else {
6831 BGWriter(thread, kWrite);
6832 }
6833 }
6834
6835 void SeekRandomWhileMerging(ThreadState* thread) {
6836 if (thread->tid > 0) {
6837 SeekRandom(thread);
6838 } else {
6839 BGWriter(thread, kMerge);
6840 }
6841 }
6842
6843 void DoDelete(ThreadState* thread, bool seq) {
20effc67 6844 WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
1e59de90 6845 FLAGS_write_batch_protection_bytes_per_key,
20effc67 6846 user_timestamp_size_);
7c673cae
FG
6847 Duration duration(seq ? 0 : FLAGS_duration, deletes_);
6848 int64_t i = 0;
6849 std::unique_ptr<const char[]> key_guard;
6850 Slice key = AllocateKey(&key_guard);
20effc67
TL
6851 std::unique_ptr<char[]> ts_guard;
6852 Slice ts;
6853 if (user_timestamp_size_ > 0) {
6854 ts_guard.reset(new char[user_timestamp_size_]);
6855 }
7c673cae
FG
6856
6857 while (!duration.Done(entries_per_batch_)) {
6858 DB* db = SelectDB(thread);
6859 batch.Clear();
6860 for (int64_t j = 0; j < entries_per_batch_; ++j) {
6861 const int64_t k = seq ? i + j : (thread->rand.Next() % FLAGS_num);
6862 GenerateKeyFromInt(k, FLAGS_num, &key);
6863 batch.Delete(key);
6864 }
20effc67
TL
6865 Status s;
6866 if (user_timestamp_size_ > 0) {
6867 ts = mock_app_clock_->Allocate(ts_guard.get());
1e59de90
TL
6868 s = batch.UpdateTimestamps(
6869 ts, [this](uint32_t) { return user_timestamp_size_; });
20effc67
TL
6870 if (!s.ok()) {
6871 fprintf(stderr, "assign timestamp: %s\n", s.ToString().c_str());
6872 ErrorExit();
6873 }
6874 }
6875 s = db->Write(write_options_, &batch);
7c673cae
FG
6876 thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kDelete);
6877 if (!s.ok()) {
6878 fprintf(stderr, "del error: %s\n", s.ToString().c_str());
6879 exit(1);
6880 }
6881 i += entries_per_batch_;
6882 }
6883 }
6884
1e59de90 6885 void DeleteSeq(ThreadState* thread) { DoDelete(thread, true); }
7c673cae 6886
1e59de90 6887 void DeleteRandom(ThreadState* thread) { DoDelete(thread, false); }
7c673cae
FG
6888
6889 void ReadWhileWriting(ThreadState* thread) {
6890 if (thread->tid > 0) {
6891 ReadRandom(thread);
6892 } else {
6893 BGWriter(thread, kWrite);
6894 }
6895 }
6896
1e59de90
TL
6897 void MultiReadWhileWriting(ThreadState* thread) {
6898 if (thread->tid > 0) {
6899 MultiReadRandom(thread);
6900 } else {
6901 BGWriter(thread, kWrite);
6902 }
6903 }
6904
7c673cae
FG
6905 void ReadWhileMerging(ThreadState* thread) {
6906 if (thread->tid > 0) {
6907 ReadRandom(thread);
6908 } else {
6909 BGWriter(thread, kMerge);
6910 }
6911 }
6912
6913 void BGWriter(ThreadState* thread, enum OperationType write_merge) {
6914 // Special thread that keeps writing until other threads are done.
6915 RandomGenerator gen;
6916 int64_t bytes = 0;
6917
6918 std::unique_ptr<RateLimiter> write_rate_limiter;
6919 if (FLAGS_benchmark_write_rate_limit > 0) {
6920 write_rate_limiter.reset(
6921 NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
6922 }
6923
6924 // Don't merge stats from this thread with the readers.
6925 thread->stats.SetExcludeFromMerge();
6926
6927 std::unique_ptr<const char[]> key_guard;
6928 Slice key = AllocateKey(&key_guard);
20effc67 6929 std::unique_ptr<char[]> ts_guard;
1e59de90
TL
6930 std::unique_ptr<const char[]> begin_key_guard;
6931 Slice begin_key = AllocateKey(&begin_key_guard);
6932 std::unique_ptr<const char[]> end_key_guard;
6933 Slice end_key = AllocateKey(&end_key_guard);
6934 uint64_t num_range_deletions = 0;
6935 std::vector<std::unique_ptr<const char[]>> expanded_key_guards;
6936 std::vector<Slice> expanded_keys;
6937 if (FLAGS_expand_range_tombstones) {
6938 expanded_key_guards.resize(range_tombstone_width_);
6939 for (auto& expanded_key_guard : expanded_key_guards) {
6940 expanded_keys.emplace_back(AllocateKey(&expanded_key_guard));
6941 }
6942 }
20effc67
TL
6943 if (user_timestamp_size_ > 0) {
6944 ts_guard.reset(new char[user_timestamp_size_]);
6945 }
7c673cae
FG
6946 uint32_t written = 0;
6947 bool hint_printed = false;
6948
6949 while (true) {
6950 DB* db = SelectDB(thread);
6951 {
6952 MutexLock l(&thread->shared->mu);
6953 if (FLAGS_finish_after_writes && written == writes_) {
6954 fprintf(stderr, "Exiting the writer after %u writes...\n", written);
6955 break;
6956 }
6957 if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
6958 // Other threads have finished
6959 if (FLAGS_finish_after_writes) {
6960 // Wait for the writes to be finished
6961 if (!hint_printed) {
6962 fprintf(stderr, "Reads are finished. Have %d more writes to do\n",
f67539c2 6963 static_cast<int>(writes_) - written);
7c673cae
FG
6964 hint_printed = true;
6965 }
6966 } else {
6967 // Finish the write immediately
6968 break;
6969 }
6970 }
6971 }
6972
6973 GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
6974 Status s;
6975
f67539c2 6976 Slice val = gen.Generate();
20effc67
TL
6977 Slice ts;
6978 if (user_timestamp_size_ > 0) {
6979 ts = mock_app_clock_->Allocate(ts_guard.get());
20effc67 6980 }
7c673cae 6981 if (write_merge == kWrite) {
1e59de90
TL
6982 if (user_timestamp_size_ == 0) {
6983 s = db->Put(write_options_, key, val);
6984 } else {
6985 s = db->Put(write_options_, key, ts, val);
6986 }
7c673cae 6987 } else {
f67539c2 6988 s = db->Merge(write_options_, key, val);
7c673cae 6989 }
20effc67 6990 // Restore write_options_
7c673cae
FG
6991 written++;
6992
6993 if (!s.ok()) {
6994 fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
6995 exit(1);
6996 }
20effc67 6997 bytes += key.size() + val.size() + user_timestamp_size_;
7c673cae
FG
6998 thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
6999
7000 if (FLAGS_benchmark_write_rate_limit > 0) {
1e59de90
TL
7001 write_rate_limiter->Request(key.size() + val.size(), Env::IO_HIGH,
7002 nullptr /* stats */,
7003 RateLimiter::OpType::kWrite);
7004 }
7005
7006 if (writes_per_range_tombstone_ > 0 &&
7007 written > writes_before_delete_range_ &&
7008 (written - writes_before_delete_range_) /
7009 writes_per_range_tombstone_ <=
7010 max_num_range_tombstones_ &&
7011 (written - writes_before_delete_range_) %
7012 writes_per_range_tombstone_ ==
7013 0) {
7014 num_range_deletions++;
7015 int64_t begin_num = thread->rand.Next() % FLAGS_num;
7016 if (FLAGS_expand_range_tombstones) {
7017 for (int64_t offset = 0; offset < range_tombstone_width_; ++offset) {
7018 GenerateKeyFromInt(begin_num + offset, FLAGS_num,
7019 &expanded_keys[offset]);
7020 if (!db->Delete(write_options_, expanded_keys[offset]).ok()) {
7021 fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
7022 exit(1);
7023 }
7024 }
7025 } else {
7026 GenerateKeyFromInt(begin_num, FLAGS_num, &begin_key);
7027 GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num,
7028 &end_key);
7029 if (!db->DeleteRange(write_options_, db->DefaultColumnFamily(),
7030 begin_key, end_key)
7031 .ok()) {
7032 fprintf(stderr, "deleterange error: %s\n", s.ToString().c_str());
7033 exit(1);
7034 }
7035 }
7036 thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
7037 // TODO: DeleteRange is not included in calculcation of bytes/rate
7038 // limiter request
7c673cae
FG
7039 }
7040 }
1e59de90
TL
7041 if (num_range_deletions > 0) {
7042 std::cout << "Number of range deletions: " << num_range_deletions
7043 << std::endl;
7044 }
7c673cae
FG
7045 thread->stats.AddBytes(bytes);
7046 }
7047
11fdf7f2
TL
7048 void ReadWhileScanning(ThreadState* thread) {
7049 if (thread->tid > 0) {
7050 ReadRandom(thread);
7051 } else {
7052 BGScan(thread);
7053 }
7054 }
7055
  // Background scanner: walks a single iterator forward for the duration of
  // the run, wrapping back to the first key when it falls off the end.
  // Only supports a single DB (aborts under --num_multi_db).
  void BGScan(ThreadState* thread) {
    if (FLAGS_num_multi_db > 0) {
      fprintf(stderr, "Not supporting multiple DBs.\n");
      abort();
    }
    assert(db_.db != nullptr);
    ReadOptions read_options = read_options_;
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
      read_options.timestamp = &ts;
    }
    Iterator* iter = db_.db->NewIterator(read_options);

    fprintf(stderr, "num reads to do %" PRIu64 "\n", reads_);
    Duration duration(FLAGS_duration, reads_);
    uint64_t num_seek_to_first = 0;
    uint64_t num_next = 0;
    while (!duration.Done(1)) {
      // Order matters: an invalid iterator is first rewound; a valid one is
      // checked for an error status before advancing.
      if (!iter->Valid()) {
        iter->SeekToFirst();
        num_seek_to_first++;
      } else if (!iter->status().ok()) {
        fprintf(stderr, "Iterator error: %s\n",
                iter->status().ToString().c_str());
        abort();
      } else {
        iter->Next();
        num_next++;
      }

      thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
    }
    delete iter;
  }
7093
7c673cae
FG
7094 // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
7095 // in DB atomically i.e in a single batch. Also refer GetMany.
7096 Status PutMany(DB* db, const WriteOptions& writeoptions, const Slice& key,
7097 const Slice& value) {
7098 std::string suffixes[3] = {"2", "1", "0"};
7099 std::string keys[3];
7100
20effc67 7101 WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
1e59de90 7102 FLAGS_write_batch_protection_bytes_per_key,
20effc67 7103 user_timestamp_size_);
7c673cae
FG
7104 Status s;
7105 for (int i = 0; i < 3; i++) {
7106 keys[i] = key.ToString() + suffixes[i];
7107 batch.Put(keys[i], value);
7108 }
7109
20effc67
TL
7110 std::unique_ptr<char[]> ts_guard;
7111 if (user_timestamp_size_ > 0) {
7112 ts_guard.reset(new char[user_timestamp_size_]);
7113 Slice ts = mock_app_clock_->Allocate(ts_guard.get());
1e59de90
TL
7114 s = batch.UpdateTimestamps(
7115 ts, [this](uint32_t) { return user_timestamp_size_; });
20effc67
TL
7116 if (!s.ok()) {
7117 fprintf(stderr, "assign timestamp to batch: %s\n",
7118 s.ToString().c_str());
7119 ErrorExit();
7120 }
7121 }
7122
7c673cae
FG
7123 s = db->Write(writeoptions, &batch);
7124 return s;
7125 }
7126
7c673cae
FG
7127 // Given a key K, this deletes (K+"0", V), (K+"1", V), (K+"2", V)
7128 // in DB atomically i.e in a single batch. Also refer GetMany.
7129 Status DeleteMany(DB* db, const WriteOptions& writeoptions,
7130 const Slice& key) {
7131 std::string suffixes[3] = {"1", "2", "0"};
7132 std::string keys[3];
7133
1e59de90
TL
7134 WriteBatch batch(0, 0, FLAGS_write_batch_protection_bytes_per_key,
7135 user_timestamp_size_);
7c673cae
FG
7136 Status s;
7137 for (int i = 0; i < 3; i++) {
7138 keys[i] = key.ToString() + suffixes[i];
7139 batch.Delete(keys[i]);
7140 }
7141
20effc67
TL
7142 std::unique_ptr<char[]> ts_guard;
7143 if (user_timestamp_size_ > 0) {
7144 ts_guard.reset(new char[user_timestamp_size_]);
7145 Slice ts = mock_app_clock_->Allocate(ts_guard.get());
1e59de90
TL
7146 s = batch.UpdateTimestamps(
7147 ts, [this](uint32_t) { return user_timestamp_size_; });
20effc67
TL
7148 if (!s.ok()) {
7149 fprintf(stderr, "assign timestamp to batch: %s\n",
7150 s.ToString().c_str());
7151 ErrorExit();
7152 }
7153 }
7154
7c673cae
FG
7155 s = db->Write(writeoptions, &batch);
7156 return s;
7157 }
7158
  // Given a key K and value V, this gets values for K+"0", K+"1" and K+"2"
  // in the same snapshot, and verifies that all the values are identical.
  // ASSUMES that PutMany was used to put (K, V) into the DB.
  //
  // Returns the status of the LAST Get; on mismatch or per-key error it
  // prints to stderr and keeps going so the run can surface more errors.
  Status GetMany(DB* db, const Slice& key, std::string* value) {
    std::string suffixes[3] = {"0", "1", "2"};
    std::string keys[3];
    Slice key_slices[3];
    std::string values[3];
    ReadOptions readoptionscopy = read_options_;

    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->Allocate(ts_guard.get());
      readoptionscopy.timestamp = &ts;
    }

    // All three Gets read from the same snapshot so the consistency check
    // below is meaningful; the snapshot is released before returning.
    readoptionscopy.snapshot = db->GetSnapshot();
    Status s;
    for (int i = 0; i < 3; i++) {
      keys[i] = key.ToString() + suffixes[i];
      key_slices[i] = keys[i];
      s = db->Get(readoptionscopy, key_slices[i], value);
      if (!s.ok() && !s.IsNotFound()) {
        fprintf(stderr, "get error: %s\n", s.ToString().c_str());
        values[i] = "";
        // we continue after error rather than exiting so that we can
        // find more errors if any
      } else if (s.IsNotFound()) {
        values[i] = "";
      } else {
        values[i] = *value;
      }
    }
    db->ReleaseSnapshot(readoptionscopy.snapshot);

    if ((values[0] != values[1]) || (values[1] != values[2])) {
      fprintf(stderr, "inconsistent values for key %s: %s, %s, %s\n",
              key.ToString().c_str(), values[0].c_str(), values[1].c_str(),
              values[2].c_str());
      // we continue after error rather than exiting so that we can
      // find more errors if any
    }

    return s;
  }
7206
  // Differs from readrandomwriterandom in the following ways:
  // (a) Uses GetMany/PutMany to read/write key values. Refer to those funcs.
  // (b) Does deletes as well (per FLAGS_deletepercent)
  // (c) In order to achieve high % of 'found' during lookups, and to do
  // multiple writes (including puts and deletes) it uses upto
  // FLAGS_numdistinct distinct keys instead of FLAGS_num distinct keys.
  // (d) Does not have a MultiGet option.
  void RandomWithVerify(ThreadState* thread) {
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    // Per-100-op weights: each "batch" of 100 iterations is split into
    // gets, then puts, then deletes, in that order.
    int get_weight = 0;
    int put_weight = 0;
    int delete_weight = 0;
    int64_t gets_done = 0;
    int64_t puts_done = 0;
    int64_t deletes_done = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    // the number of iterations is the larger of read_ or write_
    for (int64_t i = 0; i < readwrites_; i++) {
      DB* db = SelectDB(thread);
      if (get_weight == 0 && put_weight == 0 && delete_weight == 0) {
        // one batch completed, reinitialize for next batch
        get_weight = FLAGS_readwritepercent;
        delete_weight = FLAGS_deletepercent;
        put_weight = 100 - get_weight - delete_weight;
      }
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_numdistinct,
                         FLAGS_numdistinct, &key);
      if (get_weight > 0) {
        // do all the gets first
        Status s = GetMany(db, key, &value);
        if (!s.ok() && !s.IsNotFound()) {
          fprintf(stderr, "getmany error: %s\n", s.ToString().c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          found++;
        }
        get_weight--;
        gets_done++;
        thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
      } else if (put_weight > 0) {
        // then do all the corresponding number of puts
        // for all the gets we have done earlier
        Status s = PutMany(db, write_options_, key, gen.Generate());
        if (!s.ok()) {
          fprintf(stderr, "putmany error: %s\n", s.ToString().c_str());
          exit(1);
        }
        put_weight--;
        puts_done++;
        thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
      } else if (delete_weight > 0) {
        Status s = DeleteMany(db, write_options_, key);
        if (!s.ok()) {
          fprintf(stderr, "deletemany error: %s\n", s.ToString().c_str());
          exit(1);
        }
        delete_weight--;
        deletes_done++;
        thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
      }
    }
    char msg[128];
    snprintf(msg, sizeof(msg),
             "( get:%" PRIu64 " put:%" PRIu64 " del:%" PRIu64 " total:%" PRIu64
             " found:%" PRIu64 ")",
             gets_done, puts_done, deletes_done, readwrites_, found);
    thread->stats.AddMessage(msg);
  }
7281
  // This is different from ReadWhileWriting because it does not use
  // an extra thread.
  //
  // Interleaves Gets and Puts on random keys within a single thread,
  // splitting each batch of 100 ops per FLAGS_readwritepercent (gets first,
  // then puts). Supports user-defined timestamps for both paths.
  void ReadRandomWriteRandom(ThreadState* thread) {
    ReadOptions options = read_options_;
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int get_weight = 0;
    int put_weight = 0;
    int64_t reads_done = 0;
    int64_t writes_done = 0;
    Duration duration(FLAGS_duration, readwrites_);

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

    // the number of iterations is the larger of read_ or write_
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      if (get_weight == 0 && put_weight == 0) {
        // one batch completed, reinitialize for next batch
        get_weight = FLAGS_readwritepercent;
        put_weight = 100 - get_weight;
      }
      if (get_weight > 0) {
        // do all the gets first
        Slice ts;
        if (user_timestamp_size_ > 0) {
          ts = mock_app_clock_->GetTimestampForRead(thread->rand,
                                                    ts_guard.get());
          options.timestamp = &ts;
        }
        Status s = db->Get(options, key, &value);
        if (!s.ok() && !s.IsNotFound()) {
          fprintf(stderr, "get error: %s\n", s.ToString().c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          found++;
        }
        get_weight--;
        reads_done++;
        thread->stats.FinishedOps(nullptr, db, 1, kRead);
      } else if (put_weight > 0) {
        // then do all the corresponding number of puts
        // for all the gets we have done earlier
        Status s;
        if (user_timestamp_size_ > 0) {
          Slice ts = mock_app_clock_->Allocate(ts_guard.get());
          s = db->Put(write_options_, key, ts, gen.Generate());
        } else {
          s = db->Put(write_options_, key, gen.Generate());
        }
        if (!s.ok()) {
          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
          ErrorExit();
        }
        put_weight--;
        writes_done++;
        thread->stats.FinishedOps(nullptr, db, 1, kWrite);
      }
    }
    char msg[100];
    snprintf(msg, sizeof(msg),
             "( reads:%" PRIu64 " writes:%" PRIu64 " total:%" PRIu64
             " found:%" PRIu64 ")",
             reads_done, writes_done, readwrites_, found);
    thread->stats.AddMessage(msg);
  }
7357
  //
  // Read-modify-write for random keys
  //
  // For each random key: Get the current value (counted in bytes/found),
  // then overwrite it with a freshly generated value. The read uses the
  // newest timestamp when user timestamps are enabled, since this is an RMW.
  void UpdateRandom(ThreadState* thread) {
    ReadOptions options = read_options_;
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int64_t bytes = 0;
    Duration duration(FLAGS_duration, readwrites_);

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    // the number of iterations is the larger of read_ or write_
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      Slice ts;
      if (user_timestamp_size_ > 0) {
        // Read with newest timestamp because we are doing rmw.
        ts = mock_app_clock_->Allocate(ts_guard.get());
        options.timestamp = &ts;
      }

      auto status = db->Get(options, key, &value);
      if (status.ok()) {
        ++found;
        bytes += key.size() + value.size() + user_timestamp_size_;
      } else if (!status.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n",
                status.ToString().c_str());
        abort();
      }

      // Charge the write limiter with the size of the pair just read.
      if (thread->shared->write_rate_limiter) {
        thread->shared->write_rate_limiter->Request(
            key.size() + value.size(), Env::IO_HIGH, nullptr /*stats*/,
            RateLimiter::OpType::kWrite);
      }

      Slice val = gen.Generate();
      Status s;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->Allocate(ts_guard.get());
        s = db->Put(write_options_, key, ts, val);
      } else {
        s = db->Put(write_options_, key, val);
      }
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes += key.size() + val.size() + user_timestamp_size_;
      thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
    }
    char msg[100];
    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
             readwrites_, found);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
7422
11fdf7f2
TL
  // Read-XOR-write for random keys. Xors the existing value with a randomly
  // generated value, and stores the result. Assuming A in the array of bytes
  // representing the existing value, we generate an array B of the same size,
  // then compute C = A^B as C[i]=A[i]^B[i], and store C
  void XORUpdateRandom(ThreadState* thread) {
    // Local copy of the shared read options so a per-iteration timestamp can
    // be attached without mutating read_options_.
    ReadOptions options = read_options_;
    RandomGenerator gen;
    std::string existing_value;
    int64_t found = 0;  // number of Gets that hit an existing key
    Duration duration(FLAGS_duration, readwrites_);

    BytesXOROperator xor_operator;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    // the number of iterations is the larger of read_ or write_
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      Slice ts;
      if (user_timestamp_size_ > 0) {
        // Read with the newest timestamp since this is a read-modify-write.
        ts = mock_app_clock_->Allocate(ts_guard.get());
        options.timestamp = &ts;
      }

      auto status = db->Get(options, key, &existing_value);
      if (status.ok()) {
        ++found;
      } else if (!status.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n",
                status.ToString().c_str());
        exit(1);
      }

      // Generate an operand of the same length as the current value so the
      // byte-wise XOR is well defined.
      Slice value =
          gen.Generate(static_cast<unsigned int>(existing_value.size()));
      std::string new_value;

      if (status.ok()) {
        Slice existing_value_slice = Slice(existing_value);
        xor_operator.XOR(&existing_value_slice, value, &new_value);
      } else {
        // Key absent: XOR against nothing, i.e. store the operand itself.
        xor_operator.XOR(nullptr, value, &new_value);
      }

      Status s;
      if (user_timestamp_size_ > 0) {
        // Write back with a fresh (newer) timestamp.
        ts = mock_app_clock_->Allocate(ts_guard.get());
        s = db->Put(write_options_, key, ts, Slice(new_value));
      } else {
        s = db->Put(write_options_, key, Slice(new_value));
      }
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        ErrorExit();
      }
      thread->stats.FinishedOps(nullptr, db, 1);
    }
    char msg[100];
    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
             readwrites_, found);
    thread->stats.AddMessage(msg);
  }
7490
7c673cae
FG
  // Read-modify-write for random keys.
  // Each operation causes the key grow by value_size (simulating an append).
  // Generally used for benchmarking against merges of similar type
  void AppendRandom(ThreadState* thread) {
    // Local copy so a per-iteration read timestamp can be attached.
    ReadOptions options = read_options_;
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;  // Gets that hit an existing key
    int64_t bytes = 0;  // total bytes read + written, reported at the end

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    // The number of iterations is the larger of read_ or write_
    Duration duration(FLAGS_duration, readwrites_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      Slice ts;
      if (user_timestamp_size_ > 0) {
        // Read with the newest timestamp since this is read-modify-write.
        ts = mock_app_clock_->Allocate(ts_guard.get());
        options.timestamp = &ts;
      }

      auto status = db->Get(options, key, &value);
      if (status.ok()) {
        ++found;
        bytes += key.size() + value.size() + user_timestamp_size_;
      } else if (!status.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n",
                status.ToString().c_str());
        abort();
      } else {
        // If not existing, then just assume an empty string of data
        value.clear();
      }

      // Update the value (by appending data)
      Slice operand = gen.Generate();
      if (value.size() > 0) {
        // Use a delimiter to match the semantics for StringAppendOperator
        value.append(1, ',');
      }
      value.append(operand.data(), operand.size());

      Status s;
      if (user_timestamp_size_ > 0) {
        // Write back with a fresh (newer) timestamp.
        ts = mock_app_clock_->Allocate(ts_guard.get());
        s = db->Put(write_options_, key, ts, value);
      } else {
        // Write back to the database
        s = db->Put(write_options_, key, value);
      }
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        ErrorExit();
      }
      bytes += key.size() + value.size() + user_timestamp_size_;
      thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
             readwrites_, found);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
7561
  // Read-modify-write for random keys (using MergeOperator)
  // The merge operator to use should be defined by FLAGS_merge_operator
  // Adjust FLAGS_value_size so that the keys are reasonable for this operator
  // Assumes that the merge operator is non-null (i.e.: is well-defined)
  //
  // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8
  // to simulate random additions over 64-bit integers using merge.
  //
  // The number of merges on the same key can be controlled by adjusting
  // FLAGS_merge_keys.
  void MergeRandom(ThreadState* thread) {
    RandomGenerator gen;
    int64_t bytes = 0;  // total key+value bytes merged, reported at the end
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    // The number of iterations is the larger of read_ or write_
    Duration duration(FLAGS_duration, readwrites_);
    while (!duration.Done(1)) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      // Keys are drawn from [0, merge_keys_), so the average merge fan-in per
      // key is readwrites_ / merge_keys_.
      int64_t key_rand = thread->rand.Next() % merge_keys_;
      GenerateKeyFromInt(key_rand, merge_keys_, &key);

      Status s;
      Slice val = gen.Generate();
      if (FLAGS_num_column_families > 1) {
        // Route the key to its column family when multiple CFs are in use.
        s = db_with_cfh->db->Merge(write_options_,
                                   db_with_cfh->GetCfh(key_rand), key, val);
      } else {
        s = db_with_cfh->db->Merge(
            write_options_, db_with_cfh->db->DefaultColumnFamily(), key, val);
      }

      if (!s.ok()) {
        fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes += key.size() + val.size();
      thread->stats.FinishedOps(nullptr, db_with_cfh->db, 1, kMerge);
    }

    // Print some statistics
    char msg[100];
    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 ")", readwrites_);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
7608
  // Read and merge random keys. The amount of reads and merges are controlled
  // by adjusting FLAGS_num and FLAGS_mergereadpercent. The number of distinct
  // keys (and thus also the number of reads and merges on the same key) can be
  // adjusted with FLAGS_merge_keys.
  //
  // As with MergeRandom, the merge operator to use should be defined by
  // FLAGS_merge_operator.
  void ReadRandomMergeRandom(ThreadState* thread) {
    RandomGenerator gen;
    std::string value;
    int64_t num_hits = 0;    // reads that found the key
    int64_t num_gets = 0;    // total read ops issued
    int64_t num_merges = 0;  // total merge ops issued
    size_t max_length = 0;   // longest value observed by any read

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    // the number of iterations is the larger of read_ or write_
    Duration duration(FLAGS_duration, readwrites_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);

      // FLAGS_mergereadpercent of the ops are merges; the rest are reads.
      bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent;

      if (do_merge) {
        Status s = db->Merge(write_options_, key, gen.Generate());
        if (!s.ok()) {
          fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
          exit(1);
        }
        num_merges++;
        thread->stats.FinishedOps(nullptr, db, 1, kMerge);
      } else {
        Status s = db->Get(read_options_, key, &value);
        if (value.length() > max_length) max_length = value.length();

        if (!s.ok() && !s.IsNotFound()) {
          fprintf(stderr, "get error: %s\n", s.ToString().c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          num_hits++;
        }
        num_gets++;
        thread->stats.FinishedOps(nullptr, db, 1, kRead);
      }
    }

    char msg[100];
    snprintf(msg, sizeof(msg),
             "(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64
             " hits:%" PRIu64 " maxlength:%" ROCKSDB_PRIszt ")",
             num_gets, num_merges, readwrites_, num_hits, max_length);
    thread->stats.AddMessage(msg);
  }
7665
  // Fills the DB sequentially, then for every key: seeks to it, walks
  // FLAGS_seek_nexts entries with Next()/Prev(), and re-seeks to the key the
  // scan ended on, asserting the iterator lands on the expected key each time.
  void WriteSeqSeekSeq(ThreadState* thread) {
    writes_ = FLAGS_num;
    DoWrite(thread, SEQUENTIAL);
    // exclude writes from the ops/sec calculation
    thread->stats.Start(thread->tid);

    DB* db = SelectDB(thread);
    // Local copy so a read timestamp can be attached when user timestamps
    // are enabled.
    ReadOptions read_opts = read_options_;
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
      read_opts.timestamp = &ts;
    }
    std::unique_ptr<Iterator> iter(db->NewIterator(read_opts));

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    for (int64_t i = 0; i < FLAGS_num; ++i) {
      GenerateKeyFromInt(i, FLAGS_num, &key);
      iter->Seek(key);
      assert(iter->Valid() && iter->key() == key);
      thread->stats.FinishedOps(nullptr, db, 1, kSeek);

      // `i` advances inside this inner loop too, so the outer loop does not
      // revisit keys already covered by Next()/Prev().
      for (int j = 0; j < FLAGS_seek_nexts && i + 1 < FLAGS_num; ++j) {
        if (!FLAGS_reverse_iterator) {
          iter->Next();
        } else {
          iter->Prev();
        }
        GenerateKeyFromInt(++i, FLAGS_num, &key);
        assert(iter->Valid() && iter->key() == key);
        thread->stats.FinishedOps(nullptr, db, 1, kSeek);
      }

      // Re-seek to the key the scan ended on.
      iter->Seek(key);
      assert(iter->Valid() && iter->key() == key);
      thread->stats.FinishedOps(nullptr, db, 1, kSeek);
    }
  }
7707
f67539c2
TL
7708 bool binary_search(std::vector<int>& data, int start, int end, int key) {
7709 if (data.empty()) return false;
7710 if (start > end) return false;
7711 int mid = start + (end - start) / 2;
7712 if (mid > static_cast<int>(data.size()) - 1) return false;
7713 if (data[mid] == key) {
7714 return true;
7715 } else if (data[mid] > key) {
7716 return binary_search(data, start, mid - 1, key);
7717 } else {
7718 return binary_search(data, mid + 1, end, key);
7719 }
7720 }
7721
  // Does a bunch of merge operations for a key(key1) where the merge operand
  // is a sorted list. Next performance comparison is done between doing a Get
  // for key1 followed by searching for another key(key2) in the large sorted
  // list vs calling GetMergeOperands for key1 and then searching for the key2
  // in all the sorted sub-lists. Later case is expected to be a lot faster.
  void GetMergeOperands(ThreadState* thread) {
    DB* db = SelectDB(thread);
    const int kTotalValues = 100000;
    const int kListSize = 100;  // each operand carries kListSize-1 values
    std::string key = "my_key";
    std::string value;

    // Build the merge chain: every kListSize-th iteration flushes the
    // accumulated comma-separated list as a single merge operand.
    // NOTE(review): the Merge status is ignored here — presumably acceptable
    // for a benchmark; confirm.
    for (int i = 1; i < kTotalValues; i++) {
      if (i % kListSize == 0) {
        // Remove trailing ','
        value.pop_back();
        db->Merge(WriteOptions(), key, value);
        value.clear();
      } else {
        value.append(std::to_string(i)).append(",");
      }
    }

    SortList s;
    std::vector<int> data;
    // This value can be experimented with and it will demonstrate the
    // perf difference between doing a Get and searching for lookup_key in the
    // resultant large sorted list vs doing GetMergeOperands and searching
    // for lookup_key within this resultant sorted sub-lists.
    int lookup_key = 1;

    // Get API call
    std::cout << "--- Get API call --- \n";
    PinnableSlice p_slice;
    uint64_t st = FLAGS_env->NowNanos();
    db->Get(ReadOptions(), db->DefaultColumnFamily(), key, &p_slice);
    s.MakeVector(data, p_slice);
    bool found =
        binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
    std::cout << "Found key? " << std::to_string(found) << "\n";
    uint64_t sp = FLAGS_env->NowNanos();
    std::cout << "Get: " << (sp - st) / 1000000000.0 << " seconds\n";
    std::string* dat_ = p_slice.GetSelf();
    std::cout << "Sample data from Get API call: " << dat_->substr(0, 10)
              << "\n";
    data.clear();

    // GetMergeOperands API call
    std::cout << "--- GetMergeOperands API --- \n";
    std::vector<PinnableSlice> a_slice((kTotalValues / kListSize) + 1);
    st = FLAGS_env->NowNanos();
    int number_of_operands = 0;
    GetMergeOperandsOptions get_merge_operands_options;
    get_merge_operands_options.expected_max_number_of_operands =
        (kTotalValues / 100) + 1;
    db->GetMergeOperands(ReadOptions(), db->DefaultColumnFamily(), key,
                         a_slice.data(), &get_merge_operands_options,
                         &number_of_operands);
    // Search each un-merged operand list separately; stop at the first hit.
    for (PinnableSlice& psl : a_slice) {
      s.MakeVector(data, psl);
      found =
          binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
      data.clear();
      if (found) break;
    }
    std::cout << "Found key? " << std::to_string(found) << "\n";
    sp = FLAGS_env->NowNanos();
    std::cout << "Get Merge operands: " << (sp - st) / 1000000000.0
              << " seconds \n";
    int to_print = 0;
    std::cout << "Sample data from GetMergeOperands API call: ";
    for (PinnableSlice& psl : a_slice) {
      std::cout << "List: " << to_print << " : " << *psl.GetSelf() << "\n";
      if (to_print++ > 2) break;
    }
  }
7798
7c673cae 7799#ifndef ROCKSDB_LITE
1e59de90
TL
7800 void VerifyChecksum(ThreadState* thread) {
7801 DB* db = SelectDB(thread);
7802 ReadOptions ro;
7803 ro.adaptive_readahead = FLAGS_adaptive_readahead;
7804 ro.async_io = FLAGS_async_io;
7805 ro.rate_limiter_priority =
7806 FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
7807 ro.readahead_size = FLAGS_readahead_size;
7808 Status s = db->VerifyChecksum(ro);
7809 if (!s.ok()) {
7810 fprintf(stderr, "VerifyChecksum() failed: %s\n", s.ToString().c_str());
7811 exit(1);
7812 }
7813 }
7814
7815 void VerifyFileChecksums(ThreadState* thread) {
7816 DB* db = SelectDB(thread);
7817 ReadOptions ro;
7818 ro.adaptive_readahead = FLAGS_adaptive_readahead;
7819 ro.async_io = FLAGS_async_io;
7820 ro.rate_limiter_priority =
7821 FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
7822 ro.readahead_size = FLAGS_readahead_size;
7823 Status s = db->VerifyFileChecksums(ro);
7824 if (!s.ok()) {
7825 fprintf(stderr, "VerifyFileChecksums() failed: %s\n",
7826 s.ToString().c_str());
7827 exit(1);
7828 }
7829 }
7830
7c673cae
FG
  // This benchmark stress tests Transactions. For a given --duration (or
  // total number of --writes, a Transaction will perform a read-modify-write
  // to increment the value of a key in each of N(--transaction-sets) sets of
  // keys (where each set has --num keys). If --threads is set, this will be
  // done in parallel.
  //
  // To test transactions, use --transaction_db=true. Not setting this
  // parameter
  // will run the same benchmark without transactions.
  //
  // RandomTransactionVerify() will then validate the correctness of the results
  // by checking if the sum of all keys in each set is the same.
  void RandomTransaction(ThreadState* thread) {
    Duration duration(FLAGS_duration, readwrites_);
    uint16_t num_prefix_ranges = static_cast<uint16_t>(FLAGS_transaction_sets);
    uint64_t transactions_done = 0;  // committed txns/batches by this thread

    if (num_prefix_ranges == 0 || num_prefix_ranges > 9999) {
      fprintf(stderr, "invalid value for transaction_sets\n");
      abort();
    }

    TransactionOptions txn_options;
    txn_options.lock_timeout = FLAGS_transaction_lock_timeout;
    txn_options.set_snapshot = FLAGS_transaction_set_snapshot;

    RandomTransactionInserter inserter(&thread->rand, write_options_,
                                       read_options_, FLAGS_num,
                                       num_prefix_ranges);

    // This benchmark only operates on a single DB.
    if (FLAGS_num_multi_db > 1) {
      fprintf(stderr,
              "Cannot run RandomTransaction benchmark with "
              "FLAGS_multi_db > 1.");
      abort();
    }

    while (!duration.Done(1)) {
      bool success;

      // RandomTransactionInserter will attempt to insert a key for each
      // # of FLAGS_transaction_sets
      if (FLAGS_optimistic_transaction_db) {
        success = inserter.OptimisticTransactionDBInsert(db_.opt_txn_db);
      } else if (FLAGS_transaction_db) {
        TransactionDB* txn_db = reinterpret_cast<TransactionDB*>(db_.db);
        success = inserter.TransactionDBInsert(txn_db, txn_options);
      } else {
        // Neither txn flag set: same workload via plain batched writes.
        success = inserter.DBInsert(db_.db);
      }

      if (!success) {
        fprintf(stderr, "Unexpected error: %s\n",
                inserter.GetLastStatus().ToString().c_str());
        abort();
      }

      thread->stats.FinishedOps(nullptr, db_.db, 1, kOthers);
      transactions_done++;
    }

    char msg[100];
    if (FLAGS_optimistic_transaction_db || FLAGS_transaction_db) {
      snprintf(msg, sizeof(msg),
               "( transactions:%" PRIu64 " aborts:%" PRIu64 ")",
               transactions_done, inserter.GetFailureCount());
    } else {
      snprintf(msg, sizeof(msg), "( batches:%" PRIu64 " )", transactions_done);
    }
    thread->stats.AddMessage(msg);
    thread->stats.AddBytes(static_cast<int64_t>(inserter.GetBytesInserted()));
  }
7903
7904 // Verifies consistency of data after RandomTransaction() has been run.
7905 // Since each iteration of RandomTransaction() incremented a key in each set
7906 // by the same value, the sum of the keys in each set should be the same.
7907 void RandomTransactionVerify() {
7908 if (!FLAGS_transaction_db && !FLAGS_optimistic_transaction_db) {
7909 // transactions not used, nothing to verify.
7910 return;
7911 }
7912
1e59de90
TL
7913 Status s = RandomTransactionInserter::Verify(
7914 db_.db, static_cast<uint16_t>(FLAGS_transaction_sets));
7c673cae
FG
7915
7916 if (s.ok()) {
7917 fprintf(stdout, "RandomTransactionVerify Success.\n");
7918 } else {
7919 fprintf(stdout, "RandomTransactionVerify FAILED!!\n");
7920 }
7921 }
7922#endif // ROCKSDB_LITE
7923
  // Writes and deletes random keys without overwriting keys.
  //
  // This benchmark is intended to partially replicate the behavior of MyRocks
  // secondary indices: All data is stored in keys and updates happen by
  // deleting the old version of the key and inserting the new version.
  void RandomReplaceKeys(ThreadState* thread) {
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    // counters[id] is the current "version" of key id; each id owns a window
    // of max_counter consecutive key slots.
    std::vector<uint32_t> counters(FLAGS_numdistinct, 0);
    size_t max_counter = 50;
    RandomGenerator gen;

    Status s;
    DB* db = SelectDB(thread);
    // Seed the DB with version 0 of every distinct key.
    for (int64_t i = 0; i < FLAGS_numdistinct; i++) {
      GenerateKeyFromInt(i * max_counter, FLAGS_num, &key);
      if (user_timestamp_size_ > 0) {
        Slice ts = mock_app_clock_->Allocate(ts_guard.get());
        s = db->Put(write_options_, key, ts, gen.Generate());
      } else {
        s = db->Put(write_options_, key, gen.Generate());
      }
      if (!s.ok()) {
        fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
        exit(1);
      }
    }

    // NOTE(review): the returned snapshot handle is never released —
    // presumably intentional for the benchmark's lifetime; confirm.
    db->GetSnapshot();

    // Replaced key ids follow a normal distribution centered on the middle
    // of the id range.
    std::default_random_engine generator;
    std::normal_distribution<double> distribution(FLAGS_numdistinct / 2.0,
                                                  FLAGS_stddev);
    Duration duration(FLAGS_duration, FLAGS_num);
    while (!duration.Done(1)) {
      int64_t rnd_id = static_cast<int64_t>(distribution(generator));
      // Clamp the sampled id into [0, FLAGS_numdistinct).
      int64_t key_id = std::max(std::min(FLAGS_numdistinct - 1, rnd_id),
                                static_cast<int64_t>(0));
      GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
                         &key);
      // Delete the current version of the key...
      if (user_timestamp_size_ > 0) {
        Slice ts = mock_app_clock_->Allocate(ts_guard.get());
        s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key, ts)
                                     : db->Delete(write_options_, key, ts);
      } else {
        s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key)
                                     : db->Delete(write_options_, key);
      }
      if (s.ok()) {
        // ...then insert the next version (empty value) at the next slot;
        // the slot index wraps around after max_counter versions.
        counters[key_id] = (counters[key_id] + 1) % max_counter;
        GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
                           &key);
        if (user_timestamp_size_ > 0) {
          Slice ts = mock_app_clock_->Allocate(ts_guard.get());
          s = db->Put(write_options_, key, ts, Slice());
        } else {
          s = db->Put(write_options_, key, Slice());
        }
      }

      if (!s.ok()) {
        fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
        exit(1);
      }

      thread->stats.FinishedOps(nullptr, db, 1, kOthers);
    }

    char msg[200];
    snprintf(msg, sizeof(msg),
             "use single deletes: %d, "
             "standard deviation: %lf\n",
             FLAGS_use_single_deletes, FLAGS_stddev);
    thread->stats.AddMessage(msg);
  }
8003
  // Reader/expiry side of the TimeSeries benchmark: repeatedly picks a random
  // key-id prefix and scans every timestamped entry under it, either reading
  // the values or (when do_deletion) deleting entries that have expired.
  // Runs until the write thread reports completion.
  void TimeSeriesReadOrDelete(ThreadState* thread, bool do_deletion) {
    int64_t read = 0;   // number of prefix scans issued
    int64_t found = 0;  // prefix scans that matched at least one entry
    int64_t bytes = 0;

    Iterator* iter = nullptr;
    // Only work on single database
    assert(db_.db != nullptr);
    iter = db_.db->NewIterator(read_options_);

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    char value_buffer[256];
    while (true) {
      {
        MutexLock l(&thread->shared->mu);
        if (thread->shared->num_done >= 1) {
          // Write thread have finished
          break;
        }
      }
      if (!FLAGS_use_tailing_iterator) {
        // A non-tailing iterator pins its creation-time view; recreate it so
        // new writes become visible.
        delete iter;
        iter = db_.db->NewIterator(read_options_);
      }
      // Pick a Iterator to use

      int64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
      GenerateKeyFromInt(key_id, FLAGS_num, &key);
      // Reset last 8 bytes to 0 (the timestamp portion), turning the key
      // into a prefix that covers all timestamps of this key id.
      char* start = const_cast<char*>(key.data());
      start += key.size() - 8;
      memset(start, 0, 8);
      ++read;

      bool key_found = false;
      // Seek the prefix
      for (iter->Seek(key); iter->Valid() && iter->key().starts_with(key);
           iter->Next()) {
        key_found = true;
        // Copy out iterator's value to make sure we read them.
        if (do_deletion) {
          bytes += iter->key().size();
          if (KeyExpired(timestamp_emulator_.get(), iter->key())) {
            thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
            // NOTE(review): Delete status is ignored — confirm acceptable
            // for this benchmark.
            db_.db->Delete(write_options_, iter->key());
          } else {
            // First unexpired entry ends the scan for this prefix.
            break;
          }
        } else {
          bytes += iter->key().size() + iter->value().size();
          thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
          Slice value = iter->value();
          memcpy(value_buffer, value.data(),
                 std::min(value.size(), sizeof(value_buffer)));

          assert(iter->status().ok());
        }
      }
      found += key_found;

      if (thread->shared->read_rate_limiter.get() != nullptr) {
        thread->shared->read_rate_limiter->Request(
            1, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }
    }
    delete iter;

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found,
             read);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
8079
  // Writer side of the TimeSeries benchmark: appends entries keyed by
  // (key id, monotonically increasing emulated timestamp) until the duration
  // or writes_ budget is exhausted.
  void TimeSeriesWrite(ThreadState* thread) {
    // Special thread that keeps writing until other threads are done.
    RandomGenerator gen;
    int64_t bytes = 0;

    // Don't merge stats from this thread with the readers.
    thread->stats.SetExcludeFromMerge();

    std::unique_ptr<RateLimiter> write_rate_limiter;
    if (FLAGS_benchmark_write_rate_limit > 0) {
      write_rate_limiter.reset(
          NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
    }

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    Duration duration(FLAGS_duration, writes_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);

      uint64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
      // Write key id
      GenerateKeyFromInt(key_id, FLAGS_num, &key);
      // Write timestamp

      // The timestamp occupies up to 8 bytes starting at offset 8, stored
      // most-significant byte first so keys sort by time within a key id.
      char* start = const_cast<char*>(key.data());
      char* pos = start + 8;
      int bytes_to_fill =
          std::min(key_size_ - static_cast<int>(pos - start), 8);
      uint64_t timestamp_value = timestamp_emulator_->Get();
      if (port::kLittleEndian) {
        // Byte-swap into big-endian order.
        for (int i = 0; i < bytes_to_fill; ++i) {
          pos[i] = (timestamp_value >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
        }
      } else {
        memcpy(pos, static_cast<void*>(&timestamp_value), bytes_to_fill);
      }

      timestamp_emulator_->Inc();

      Status s;
      Slice val = gen.Generate();
      s = db->Put(write_options_, key, val);

      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        ErrorExit();
      }
      // Per-iteration byte count: assigned (not accumulated) because
      // AddBytes is called on every loop iteration.
      bytes = key.size() + val.size();
      thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
      thread->stats.AddBytes(bytes);

      if (FLAGS_benchmark_write_rate_limit > 0) {
        write_rate_limiter->Request(key.size() + val.size(), Env::IO_HIGH,
                                    nullptr /* stats */,
                                    RateLimiter::OpType::kWrite);
      }
    }
  }
8140
8141 void TimeSeries(ThreadState* thread) {
8142 if (thread->tid > 0) {
8143 bool do_deletion = FLAGS_expire_style == "delete" &&
8144 thread->tid <= FLAGS_num_deletion_threads;
8145 TimeSeriesReadOrDelete(thread, do_deletion);
8146 } else {
8147 TimeSeriesWrite(thread);
8148 thread->stats.Stop();
8149 thread->stats.Report("timeseries write");
8150 }
8151 }
8152
8153 void Compact(ThreadState* thread) {
8154 DB* db = SelectDB(thread);
11fdf7f2 8155 CompactRangeOptions cro;
f67539c2
TL
8156 cro.bottommost_level_compaction =
8157 BottommostLevelCompaction::kForceOptimized;
11fdf7f2
TL
8158 db->CompactRange(cro, nullptr, nullptr);
8159 }
8160
8161 void CompactAll() {
8162 if (db_.db != nullptr) {
8163 db_.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
8164 }
8165 for (const auto& db_with_cfh : multi_dbs_) {
8166 db_with_cfh.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
8167 }
7c673cae
FG
8168 }
8169
1e59de90
TL
8170#ifndef ROCKSDB_LITE
  // Polls flush/compaction-related DB properties until all report zero,
  // sleeping 10 seconds between rounds; exits the process if a property
  // cannot be read.
  void WaitForCompactionHelper(DBWithColumnFamilies& db) {
    // This is an imperfect way of waiting for compaction. The loop and sleep
    // is done because a thread that finishes a compaction job should get a
    // chance to pickup a new compaction job.

    std::vector<std::string> keys = {DB::Properties::kMemTableFlushPending,
                                     DB::Properties::kNumRunningFlushes,
                                     DB::Properties::kCompactionPending,
                                     DB::Properties::kNumRunningCompactions};

    fprintf(stdout, "waitforcompaction(%s): started\n",
            db.db->GetName().c_str());

    while (true) {
      bool retry = false;

      for (const auto& k : keys) {
        uint64_t v;
        if (!db.db->GetIntProperty(k, &v)) {
          fprintf(stderr, "waitforcompaction(%s): GetIntProperty(%s) failed\n",
                  db.db->GetName().c_str(), k.c_str());
          exit(1);
        } else if (v > 0) {
          // Background work still active: sleep, then re-check every
          // property from scratch.
          fprintf(stdout,
                  "waitforcompaction(%s): active(%s). Sleep 10 seconds\n",
                  db.db->GetName().c_str(), k.c_str());
          FLAGS_env->SleepForMicroseconds(10 * 1000000);
          retry = true;
          break;
        }
      }

      if (!retry) {
        fprintf(stdout, "waitforcompaction(%s): finished\n",
                db.db->GetName().c_str());
        return;
      }
    }
  }
8210
8211 void WaitForCompaction() {
8212 // Give background threads a chance to wake
8213 FLAGS_env->SleepForMicroseconds(5 * 1000000);
8214
8215 // I am skeptical that this check race free. I hope that checking twice
8216 // reduces the chance.
8217 if (db_.db != nullptr) {
8218 WaitForCompactionHelper(db_);
8219 WaitForCompactionHelper(db_);
8220 } else {
8221 for (auto& db_with_cfh : multi_dbs_) {
8222 WaitForCompactionHelper(db_with_cfh);
8223 WaitForCompactionHelper(db_with_cfh);
8224 }
8225 }
8226 }
8227
  // Compacts all files on `from_level` (or, with dynamic leveling, the first
  // populated level beyond L0) into the next populated level via
  // CompactFiles. Returns false only when CompactFiles itself fails so the
  // caller may retry; returns true otherwise, including when there is
  // nothing to do.
  bool CompactLevelHelper(DBWithColumnFamilies& db_with_cfh, int from_level) {
    std::vector<LiveFileMetaData> files;
    db_with_cfh.db->GetLiveFilesMetaData(&files);

    assert(from_level == 0 || from_level == 1);

    int real_from_level = from_level;
    if (real_from_level > 0) {
      // With dynamic leveled compaction the first level with data beyond L0
      // might not be L1.
      real_from_level = std::numeric_limits<int>::max();

      for (auto& f : files) {
        if (f.level > 0 && f.level < real_from_level) real_from_level = f.level;
      }

      if (real_from_level == std::numeric_limits<int>::max()) {
        fprintf(stdout, "compact%d found 0 files to compact\n", from_level);
        return true;
      }
    }

    // The goal is to compact from from_level to the level that follows it,
    // and with dynamic leveled compaction the next level might not be
    // real_from_level+1
    int next_level = std::numeric_limits<int>::max();

    std::vector<std::string> files_to_compact;
    for (auto& f : files) {
      if (f.level == real_from_level)
        files_to_compact.push_back(f.name);
      else if (f.level > real_from_level && f.level < next_level)
        next_level = f.level;
    }

    if (files_to_compact.empty()) {
      fprintf(stdout, "compact%d found 0 files to compact\n", from_level);
      return true;
    } else if (next_level == std::numeric_limits<int>::max()) {
      // There is no data beyond real_from_level. So we are done.
      fprintf(stdout, "compact%d found no data beyond L%d\n", from_level,
              real_from_level);
      return true;
    }

    fprintf(stdout, "compact%d found %d files to compact from L%d to L%d\n",
            from_level, static_cast<int>(files_to_compact.size()),
            real_from_level, next_level);

    ROCKSDB_NAMESPACE::CompactionOptions options;
    // Lets RocksDB use the configured compression for this level
    options.compression = ROCKSDB_NAMESPACE::kDisableCompressionOption;

    ROCKSDB_NAMESPACE::ColumnFamilyDescriptor cfDesc;
    db_with_cfh.db->DefaultColumnFamily()->GetDescriptor(&cfDesc);
    options.output_file_size_limit = cfDesc.options.target_file_size_base;

    Status status =
        db_with_cfh.db->CompactFiles(options, files_to_compact, next_level);
    if (!status.ok()) {
      // This can fail for valid reasons including the operation was aborted
      // or a filename is invalid because background compaction removed it.
      // Having read the current cases for which an error is raised I prefer
      // not to figure out whether an exception should be thrown here.
      fprintf(stderr, "compact%d CompactFiles failed: %s\n", from_level,
              status.ToString().c_str());
      return false;
    }
    return true;
  }
8298
8299 void CompactLevel(int from_level) {
8300 if (db_.db != nullptr) {
8301 while (!CompactLevelHelper(db_, from_level)) WaitForCompaction();
8302 }
8303 for (auto& db_with_cfh : multi_dbs_) {
8304 while (!CompactLevelHelper(db_with_cfh, from_level)) WaitForCompaction();
8305 }
8306 }
8307#endif
8308
8309 void Flush() {
8310 FlushOptions flush_opt;
8311 flush_opt.wait = true;
8312
8313 if (db_.db != nullptr) {
8314 Status s;
8315 if (FLAGS_num_column_families > 1) {
8316 s = db_.db->Flush(flush_opt, db_.cfh);
8317 } else {
8318 s = db_.db->Flush(flush_opt, db_.db->DefaultColumnFamily());
8319 }
8320
8321 if (!s.ok()) {
8322 fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str());
8323 exit(1);
8324 }
8325 } else {
8326 for (const auto& db_with_cfh : multi_dbs_) {
8327 Status s;
8328 if (FLAGS_num_column_families > 1) {
8329 s = db_with_cfh.db->Flush(flush_opt, db_with_cfh.cfh);
8330 } else {
8331 s = db_with_cfh.db->Flush(flush_opt,
8332 db_with_cfh.db->DefaultColumnFamily());
8333 }
8334
8335 if (!s.ok()) {
8336 fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str());
8337 exit(1);
8338 }
8339 }
8340 }
8341 fprintf(stdout, "flush memtable\n");
8342 }
8343
7c673cae
FG
8344 void ResetStats() {
8345 if (db_.db != nullptr) {
8346 db_.db->ResetStats();
8347 }
8348 for (const auto& db_with_cfh : multi_dbs_) {
8349 db_with_cfh.db->ResetStats();
8350 }
8351 }
8352
f67539c2
TL
8353 void PrintStatsHistory() {
8354 if (db_.db != nullptr) {
8355 PrintStatsHistoryImpl(db_.db, false);
8356 }
8357 for (const auto& db_with_cfh : multi_dbs_) {
8358 PrintStatsHistoryImpl(db_with_cfh.db, true);
8359 }
8360 }
8361
8362 void PrintStatsHistoryImpl(DB* db, bool print_header) {
8363 if (print_header) {
8364 fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
8365 }
8366
8367 std::unique_ptr<StatsHistoryIterator> shi;
1e59de90
TL
8368 Status s =
8369 db->GetStatsHistory(0, std::numeric_limits<uint64_t>::max(), &shi);
f67539c2
TL
8370 if (!s.ok()) {
8371 fprintf(stdout, "%s\n", s.ToString().c_str());
8372 return;
8373 }
8374 assert(shi);
8375 while (shi->Valid()) {
8376 uint64_t stats_time = shi->GetStatsTime();
8377 fprintf(stdout, "------ %s ------\n",
8378 TimeToHumanString(static_cast<int>(stats_time)).c_str());
8379 for (auto& entry : shi->GetStatsMap()) {
8380 fprintf(stdout, " %" PRIu64 " %s %" PRIu64 "\n", stats_time,
8381 entry.first.c_str(), entry.second);
8382 }
8383 shi->Next();
8384 }
8385 }
8386
7c673cae
FG
8387 void PrintStats(const char* key) {
8388 if (db_.db != nullptr) {
8389 PrintStats(db_.db, key, false);
8390 }
8391 for (const auto& db_with_cfh : multi_dbs_) {
8392 PrintStats(db_with_cfh.db, key, true);
8393 }
8394 }
8395
8396 void PrintStats(DB* db, const char* key, bool print_header = false) {
8397 if (print_header) {
8398 fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
8399 }
8400 std::string stats;
8401 if (!db->GetProperty(key, &stats)) {
8402 stats = "(failed)";
8403 }
8404 fprintf(stdout, "\n%s\n", stats.c_str());
8405 }
11fdf7f2 8406
1e59de90
TL
8407 void PrintStats(const std::vector<std::string>& keys) {
8408 if (db_.db != nullptr) {
8409 PrintStats(db_.db, keys);
8410 }
8411 for (const auto& db_with_cfh : multi_dbs_) {
8412 PrintStats(db_with_cfh.db, keys, true);
8413 }
8414 }
8415
8416 void PrintStats(DB* db, const std::vector<std::string>& keys,
8417 bool print_header = false) {
8418 if (print_header) {
8419 fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
8420 }
8421
8422 for (const auto& key : keys) {
8423 std::string stats;
8424 if (!db->GetProperty(key, &stats)) {
8425 stats = "(failed)";
8426 }
8427 fprintf(stdout, "%s: %s\n", key.c_str(), stats.c_str());
8428 }
8429 }
8430
8431#ifndef ROCKSDB_LITE
8432
11fdf7f2
TL
8433 void Replay(ThreadState* thread) {
8434 if (db_.db != nullptr) {
8435 Replay(thread, &db_);
8436 }
8437 }
8438
8439 void Replay(ThreadState* /*thread*/, DBWithColumnFamilies* db_with_cfh) {
8440 Status s;
494da23a 8441 std::unique_ptr<TraceReader> trace_reader;
11fdf7f2
TL
8442 s = NewFileTraceReader(FLAGS_env, EnvOptions(), FLAGS_trace_file,
8443 &trace_reader);
8444 if (!s.ok()) {
8445 fprintf(
8446 stderr,
8447 "Encountered an error creating a TraceReader from the trace file. "
8448 "Error: %s\n",
8449 s.ToString().c_str());
8450 exit(1);
8451 }
1e59de90
TL
8452 std::unique_ptr<Replayer> replayer;
8453 s = db_with_cfh->db->NewDefaultReplayer(db_with_cfh->cfh,
8454 std::move(trace_reader), &replayer);
8455 if (!s.ok()) {
8456 fprintf(stderr,
8457 "Encountered an error creating a default Replayer. "
8458 "Error: %s\n",
8459 s.ToString().c_str());
8460 exit(1);
8461 }
8462 s = replayer->Prepare();
8463 if (!s.ok()) {
8464 fprintf(stderr, "Prepare for replay failed. Error: %s\n",
8465 s.ToString().c_str());
8466 }
8467 s = replayer->Replay(
8468 ReplayOptions(static_cast<uint32_t>(FLAGS_trace_replay_threads),
8469 FLAGS_trace_replay_fast_forward),
8470 nullptr);
8471 replayer.reset();
11fdf7f2 8472 if (s.ok()) {
1e59de90 8473 fprintf(stdout, "Replay completed from trace_file: %s\n",
11fdf7f2
TL
8474 FLAGS_trace_file.c_str());
8475 } else {
1e59de90 8476 fprintf(stderr, "Replay failed. Error: %s\n", s.ToString().c_str());
11fdf7f2
TL
8477 }
8478 }
1e59de90
TL
8479
8480 void Backup(ThreadState* thread) {
8481 DB* db = SelectDB(thread);
8482 std::unique_ptr<BackupEngineOptions> engine_options(
8483 new BackupEngineOptions(FLAGS_backup_dir));
8484 Status s;
8485 BackupEngine* backup_engine;
8486 if (FLAGS_backup_rate_limit > 0) {
8487 engine_options->backup_rate_limiter.reset(NewGenericRateLimiter(
8488 FLAGS_backup_rate_limit, 100000 /* refill_period_us */,
8489 10 /* fairness */, RateLimiter::Mode::kAllIo));
8490 }
8491 // Build new backup of the entire DB
8492 engine_options->destroy_old_data = true;
8493 s = BackupEngine::Open(FLAGS_env, *engine_options, &backup_engine);
8494 assert(s.ok());
8495 s = backup_engine->CreateNewBackup(db);
8496 assert(s.ok());
8497 std::vector<BackupInfo> backup_info;
8498 backup_engine->GetBackupInfo(&backup_info);
8499 // Verify that a new backup is created
8500 assert(backup_info.size() == 1);
8501 }
8502
8503 void Restore(ThreadState* /* thread */) {
8504 std::unique_ptr<BackupEngineOptions> engine_options(
8505 new BackupEngineOptions(FLAGS_backup_dir));
8506 if (FLAGS_restore_rate_limit > 0) {
8507 engine_options->restore_rate_limiter.reset(NewGenericRateLimiter(
8508 FLAGS_restore_rate_limit, 100000 /* refill_period_us */,
8509 10 /* fairness */, RateLimiter::Mode::kAllIo));
8510 }
8511 BackupEngineReadOnly* backup_engine;
8512 Status s =
8513 BackupEngineReadOnly::Open(FLAGS_env, *engine_options, &backup_engine);
8514 assert(s.ok());
8515 s = backup_engine->RestoreDBFromLatestBackup(FLAGS_restore_dir,
8516 FLAGS_restore_dir);
8517 assert(s.ok());
8518 delete backup_engine;
8519 }
8520
8521#endif // ROCKSDB_LITE
7c673cae
FG
8522};
8523
8524int db_bench_tool(int argc, char** argv) {
f67539c2 8525 ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
1e59de90 8526 ConfigOptions config_options;
7c673cae
FG
8527 static bool initialized = false;
8528 if (!initialized) {
8529 SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
8530 " [OPTIONS]...");
1e59de90 8531 SetVersionString(GetRocksVersionAsString(true));
7c673cae
FG
8532 initialized = true;
8533 }
8534 ParseCommandLineFlags(&argc, &argv, true);
f67539c2
TL
8535 FLAGS_compaction_style_e =
8536 (ROCKSDB_NAMESPACE::CompactionStyle)FLAGS_compaction_style;
7c673cae
FG
8537#ifndef ROCKSDB_LITE
8538 if (FLAGS_statistics && !FLAGS_statistics_string.empty()) {
8539 fprintf(stderr,
8540 "Cannot provide both --statistics and --statistics_string.\n");
8541 exit(1);
8542 }
8543 if (!FLAGS_statistics_string.empty()) {
1e59de90
TL
8544 Status s = Statistics::CreateFromString(config_options,
8545 FLAGS_statistics_string, &dbstats);
7c673cae 8546 if (dbstats == nullptr) {
f67539c2
TL
8547 fprintf(stderr,
8548 "No Statistics registered matching string: %s status=%s\n",
8549 FLAGS_statistics_string.c_str(), s.ToString().c_str());
7c673cae
FG
8550 exit(1);
8551 }
8552 }
8553#endif // ROCKSDB_LITE
8554 if (FLAGS_statistics) {
f67539c2 8555 dbstats = ROCKSDB_NAMESPACE::CreateDBStatistics();
7c673cae 8556 }
494da23a
TL
8557 if (dbstats) {
8558 dbstats->set_stats_level(static_cast<StatsLevel>(FLAGS_stats_level));
8559 }
f67539c2
TL
8560 FLAGS_compaction_pri_e =
8561 (ROCKSDB_NAMESPACE::CompactionPri)FLAGS_compaction_pri;
7c673cae 8562
f67539c2 8563 std::vector<std::string> fanout = ROCKSDB_NAMESPACE::StringSplit(
7c673cae
FG
8564 FLAGS_max_bytes_for_level_multiplier_additional, ',');
8565 for (size_t j = 0; j < fanout.size(); j++) {
8566 FLAGS_max_bytes_for_level_multiplier_additional_v.push_back(
8567#ifndef CYGWIN
8568 std::stoi(fanout[j]));
8569#else
8570 stoi(fanout[j]));
8571#endif
8572 }
8573
8574 FLAGS_compression_type_e =
1e59de90
TL
8575 StringToCompressionType(FLAGS_compression_type.c_str());
8576
8577 FLAGS_wal_compression_e =
8578 StringToCompressionType(FLAGS_wal_compression.c_str());
8579
8580 FLAGS_compressed_secondary_cache_compression_type_e = StringToCompressionType(
8581 FLAGS_compressed_secondary_cache_compression_type.c_str());
7c673cae
FG
8582
8583#ifndef ROCKSDB_LITE
1e59de90 8584 // Stacked BlobDB
f67539c2 8585 FLAGS_blob_db_compression_type_e =
1e59de90 8586 StringToCompressionType(FLAGS_blob_db_compression_type.c_str());
f67539c2 8587
1e59de90 8588 int env_opts = !FLAGS_env_uri.empty() + !FLAGS_fs_uri.empty();
20effc67 8589 if (env_opts > 1) {
1e59de90 8590 fprintf(stderr, "Error: --env_uri and --fs_uri are mutually exclusive\n");
7c673cae 8591 exit(1);
20effc67
TL
8592 }
8593
1e59de90
TL
8594 if (env_opts == 1) {
8595 Status s = Env::CreateFromUri(config_options, FLAGS_env_uri, FLAGS_fs_uri,
8596 &FLAGS_env, &env_guard);
8597 if (!s.ok()) {
8598 fprintf(stderr, "Failed creating env: %s\n", s.ToString().c_str());
20effc67
TL
8599 exit(1);
8600 }
1e59de90
TL
8601 } else if (FLAGS_simulate_hdd || FLAGS_simulate_hybrid_fs_file != "") {
8602 //**TODO: Make the simulate fs something that can be loaded
8603 // from the ObjectRegistry...
8604 static std::shared_ptr<ROCKSDB_NAMESPACE::Env> composite_env =
8605 NewCompositeEnv(std::make_shared<SimulatedHybridFileSystem>(
8606 FileSystem::Default(), FLAGS_simulate_hybrid_fs_file,
8607 /*throughput_multiplier=*/
8608 int{FLAGS_simulate_hybrid_hdd_multipliers},
8609 /*is_full_fs_warm=*/FLAGS_simulate_hdd));
8610 FLAGS_env = composite_env.get();
7c673cae 8611 }
1e59de90
TL
8612
8613 // Let -readonly imply -use_existing_db
8614 FLAGS_use_existing_db |= FLAGS_readonly;
7c673cae 8615#endif // ROCKSDB_LITE
1e59de90
TL
8616
8617 if (FLAGS_build_info) {
8618 std::string build_info;
8619 std::cout << GetRocksBuildInfoAsString(build_info, true) << std::endl;
8620 // Similar to --version, nothing else will be done when this flag is set
8621 exit(0);
8622 }
8623
8624 if (!FLAGS_seed) {
8625 uint64_t now = FLAGS_env->GetSystemClock()->NowMicros();
8626 seed_base = static_cast<int64_t>(now);
8627 fprintf(stdout, "Set seed to %" PRIu64 " because --seed was 0\n",
8628 seed_base);
8629 } else {
8630 seed_base = FLAGS_seed;
8631 }
8632
494da23a
TL
8633 if (FLAGS_use_existing_keys && !FLAGS_use_existing_db) {
8634 fprintf(stderr,
8635 "`-use_existing_db` must be true for `-use_existing_keys` to be "
8636 "settable\n");
8637 exit(1);
8638 }
8639
7c673cae 8640 if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NONE"))
f67539c2 8641 FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NONE;
7c673cae 8642 else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NORMAL"))
f67539c2 8643 FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NORMAL;
7c673cae 8644 else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "SEQUENTIAL"))
f67539c2 8645 FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::SEQUENTIAL;
7c673cae 8646 else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "WILLNEED"))
f67539c2 8647 FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::WILLNEED;
7c673cae
FG
8648 else {
8649 fprintf(stdout, "Unknown compaction fadvice:%s\n",
8650 FLAGS_compaction_fadvice.c_str());
1e59de90 8651 exit(1);
7c673cae
FG
8652 }
8653
f67539c2 8654 FLAGS_value_size_distribution_type_e =
1e59de90 8655 StringToDistributionType(FLAGS_value_size_distribution_type.c_str());
7c673cae 8656
11fdf7f2
TL
8657 // Note options sanitization may increase thread pool sizes according to
8658 // max_background_flushes/max_background_compactions/max_background_jobs
8659 FLAGS_env->SetBackgroundThreads(FLAGS_num_high_pri_threads,
f67539c2 8660 ROCKSDB_NAMESPACE::Env::Priority::HIGH);
11fdf7f2 8661 FLAGS_env->SetBackgroundThreads(FLAGS_num_bottom_pri_threads,
f67539c2 8662 ROCKSDB_NAMESPACE::Env::Priority::BOTTOM);
11fdf7f2 8663 FLAGS_env->SetBackgroundThreads(FLAGS_num_low_pri_threads,
f67539c2 8664 ROCKSDB_NAMESPACE::Env::Priority::LOW);
7c673cae
FG
8665
8666 // Choose a location for the test database if none given with --db=<path>
8667 if (FLAGS_db.empty()) {
8668 std::string default_db_path;
f67539c2 8669 FLAGS_env->GetTestDirectory(&default_db_path);
7c673cae
FG
8670 default_db_path += "/dbbench";
8671 FLAGS_db = default_db_path;
8672 }
8673
1e59de90
TL
8674 if (FLAGS_backup_dir.empty()) {
8675 FLAGS_backup_dir = FLAGS_db + "/backup";
8676 }
8677
8678 if (FLAGS_restore_dir.empty()) {
8679 FLAGS_restore_dir = FLAGS_db + "/restore";
8680 }
8681
7c673cae
FG
8682 if (FLAGS_stats_interval_seconds > 0) {
8683 // When both are set then FLAGS_stats_interval determines the frequency
8684 // at which the timer is checked for FLAGS_stats_interval_seconds
8685 FLAGS_stats_interval = 1000;
8686 }
8687
f67539c2
TL
8688 if (FLAGS_seek_missing_prefix && FLAGS_prefix_size <= 8) {
8689 fprintf(stderr, "prefix_size > 8 required by --seek_missing_prefix\n");
8690 exit(1);
8691 }
8692
8693 ROCKSDB_NAMESPACE::Benchmark benchmark;
7c673cae 8694 benchmark.Run();
494da23a
TL
8695
8696#ifndef ROCKSDB_LITE
8697 if (FLAGS_print_malloc_stats) {
8698 std::string stats_string;
f67539c2 8699 ROCKSDB_NAMESPACE::DumpMallocStats(&stats_string);
494da23a
TL
8700 fprintf(stdout, "Malloc stats:\n%s\n", stats_string.c_str());
8701 }
8702#endif // ROCKSDB_LITE
8703
7c673cae
FG
8704 return 0;
8705}
f67539c2 8706} // namespace ROCKSDB_NAMESPACE
7c673cae 8707#endif