]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |
7 | // Use of this source code is governed by a BSD-style license that can be | |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
9 | ||
7c673cae FG |
10 | #ifdef GFLAGS |
11 | #ifdef NUMA | |
12 | #include <numa.h> | |
7c673cae FG |
13 | #endif |
14 | #ifndef OS_WIN | |
15 | #include <unistd.h> | |
16 | #endif | |
17 | #include <fcntl.h> | |
7c673cae FG |
18 | #include <stdio.h> |
19 | #include <stdlib.h> | |
20 | #include <sys/types.h> | |
1e59de90 TL |
21 | #ifdef __APPLE__ |
22 | #include <mach/host_info.h> | |
23 | #include <mach/mach_host.h> | |
24 | #include <sys/sysctl.h> | |
25 | #endif | |
26 | #ifdef __FreeBSD__ | |
27 | #include <sys/sysctl.h> | |
28 | #endif | |
7c673cae | 29 | #include <atomic> |
f67539c2 | 30 | #include <cinttypes> |
7c673cae FG |
31 | #include <condition_variable> |
32 | #include <cstddef> | |
1e59de90 | 33 | #include <iostream> |
11fdf7f2 | 34 | #include <memory> |
7c673cae | 35 | #include <mutex> |
1e59de90 | 36 | #include <queue> |
7c673cae FG |
37 | #include <thread> |
38 | #include <unordered_map> | |
39 | ||
f67539c2 | 40 | #include "db/db_impl/db_impl.h" |
494da23a | 41 | #include "db/malloc_stats.h" |
7c673cae | 42 | #include "db/version_set.h" |
7c673cae FG |
43 | #include "monitoring/histogram.h" |
44 | #include "monitoring/statistics.h" | |
494da23a | 45 | #include "options/cf_options.h" |
7c673cae FG |
46 | #include "port/port.h" |
47 | #include "port/stack_trace.h" | |
48 | #include "rocksdb/cache.h" | |
1e59de90 | 49 | #include "rocksdb/convenience.h" |
7c673cae FG |
50 | #include "rocksdb/db.h" |
51 | #include "rocksdb/env.h" | |
52 | #include "rocksdb/filter_policy.h" | |
53 | #include "rocksdb/memtablerep.h" | |
54 | #include "rocksdb/options.h" | |
55 | #include "rocksdb/perf_context.h" | |
56 | #include "rocksdb/persistent_cache.h" | |
57 | #include "rocksdb/rate_limiter.h" | |
1e59de90 | 58 | #include "rocksdb/secondary_cache.h" |
7c673cae FG |
59 | #include "rocksdb/slice.h" |
60 | #include "rocksdb/slice_transform.h" | |
f67539c2 | 61 | #include "rocksdb/stats_history.h" |
1e59de90 TL |
62 | #include "rocksdb/table.h" |
63 | #include "rocksdb/utilities/backup_engine.h" | |
7c673cae FG |
64 | #include "rocksdb/utilities/object_registry.h" |
65 | #include "rocksdb/utilities/optimistic_transaction_db.h" | |
1e59de90 | 66 | #include "rocksdb/utilities/options_type.h" |
7c673cae | 67 | #include "rocksdb/utilities/options_util.h" |
1e59de90 TL |
68 | #ifndef ROCKSDB_LITE |
69 | #include "rocksdb/utilities/replayer.h" | |
70 | #endif // ROCKSDB_LITE | |
7c673cae FG |
71 | #include "rocksdb/utilities/sim_cache.h" |
72 | #include "rocksdb/utilities/transaction.h" | |
73 | #include "rocksdb/utilities/transaction_db.h" | |
74 | #include "rocksdb/write_batch.h" | |
f67539c2 TL |
75 | #include "test_util/testutil.h" |
76 | #include "test_util/transaction_test_util.h" | |
1e59de90 | 77 | #include "tools/simulated_hybrid_file_system.h" |
11fdf7f2 | 78 | #include "util/cast_util.h" |
7c673cae FG |
79 | #include "util/compression.h" |
80 | #include "util/crc32c.h" | |
1e59de90 | 81 | #include "util/file_checksum_helper.h" |
11fdf7f2 | 82 | #include "util/gflags_compat.h" |
7c673cae FG |
83 | #include "util/mutexlock.h" |
84 | #include "util/random.h" | |
85 | #include "util/stderr_logger.h" | |
86 | #include "util/string_util.h" | |
7c673cae FG |
87 | #include "util/xxhash.h" |
88 | #include "utilities/blob_db/blob_db.h" | |
1e59de90 | 89 | #include "utilities/counted_fs.h" |
7c673cae | 90 | #include "utilities/merge_operators.h" |
11fdf7f2 | 91 | #include "utilities/merge_operators/bytesxor.h" |
f67539c2 | 92 | #include "utilities/merge_operators/sortlist.h" |
7c673cae FG |
93 | #include "utilities/persistent_cache/block_cache_tier.h" |
94 | ||
20effc67 TL |
95 | #ifdef MEMKIND |
96 | #include "memory/memkind_kmem_allocator.h" | |
97 | #endif | |
98 | ||
7c673cae FG |
99 | #ifdef OS_WIN |
100 | #include <io.h> // open/close | |
101 | #endif | |
102 | ||
11fdf7f2 TL |
103 | using GFLAGS_NAMESPACE::ParseCommandLineFlags; |
104 | using GFLAGS_NAMESPACE::RegisterFlagValidator; | |
105 | using GFLAGS_NAMESPACE::SetUsageMessage; | |
1e59de90 TL |
106 | using GFLAGS_NAMESPACE::SetVersionString; |
107 | ||
108 | #ifdef ROCKSDB_LITE | |
109 | #define IF_ROCKSDB_LITE(Then, Else) Then | |
110 | #else | |
111 | #define IF_ROCKSDB_LITE(Then, Else) Else | |
112 | #endif | |
7c673cae FG |
113 | |
114 | DEFINE_string( | |
115 | benchmarks, | |
116 | "fillseq," | |
117 | "fillseqdeterministic," | |
118 | "fillsync," | |
119 | "fillrandom," | |
120 | "filluniquerandomdeterministic," | |
121 | "overwrite," | |
122 | "readrandom," | |
123 | "newiterator," | |
124 | "newiteratorwhilewriting," | |
125 | "seekrandom," | |
126 | "seekrandomwhilewriting," | |
127 | "seekrandomwhilemerging," | |
128 | "readseq," | |
129 | "readreverse," | |
130 | "compact," | |
11fdf7f2 | 131 | "compactall," |
1e59de90 TL |
132 | "flush," |
133 | IF_ROCKSDB_LITE("", | |
134 | "compact0," | |
135 | "compact1," | |
136 | "waitforcompaction," | |
137 | ) | |
7c673cae | 138 | "multireadrandom," |
494da23a | 139 | "mixgraph," |
7c673cae | 140 | "readseq," |
f67539c2 | 141 | "readtorowcache," |
7c673cae FG |
142 | "readtocache," |
143 | "readreverse," | |
144 | "readwhilewriting," | |
145 | "readwhilemerging," | |
11fdf7f2 | 146 | "readwhilescanning," |
7c673cae FG |
147 | "readrandomwriterandom," |
148 | "updaterandom," | |
11fdf7f2 | 149 | "xorupdaterandom," |
20effc67 | 150 | "approximatesizerandom," |
7c673cae FG |
151 | "randomwithverify," |
152 | "fill100K," | |
153 | "crc32c," | |
154 | "xxhash," | |
1e59de90 TL |
155 | "xxhash64," |
156 | "xxh3," | |
7c673cae FG |
157 | "compress," |
158 | "uncompress," | |
159 | "acquireload," | |
160 | "fillseekseq," | |
161 | "randomtransaction," | |
162 | "randomreplacekeys," | |
f67539c2 | 163 | "timeseries," |
1e59de90 TL |
164 | "getmergeoperands,", |
165 | "readrandomoperands," | |
166 | "backup," | |
167 | "restore" | |
7c673cae FG |
168 | |
169 | "Comma-separated list of operations to run in the specified" | |
170 | " order. Available benchmarks:\n" | |
171 | "\tfillseq -- write N values in sequential key" | |
172 | " order in async mode\n" | |
173 | "\tfillseqdeterministic -- write N values in the specified" | |
174 | " key order and keep the shape of the LSM tree\n" | |
175 | "\tfillrandom -- write N values in random key order in async" | |
176 | " mode\n" | |
177 | "\tfilluniquerandomdeterministic -- write N values in a random" | |
178 | " key order and keep the shape of the LSM tree\n" | |
1e59de90 TL |
179 | "\toverwrite -- overwrite N values in random key order in " |
180 | "async mode\n" | |
f67539c2 | 181 | "\tfillsync -- write N/1000 values in random key order in " |
7c673cae FG |
182 | "sync mode\n" |
183 | "\tfill100K -- write N/1000 100K values in random order in" | |
184 | " async mode\n" | |
185 | "\tdeleteseq -- delete N keys in sequential order\n" | |
186 | "\tdeleterandom -- delete N keys in random order\n" | |
187 | "\treadseq -- read N times sequentially\n" | |
188 | "\treadtocache -- 1 thread reading database sequentially\n" | |
189 | "\treadreverse -- read N times in reverse order\n" | |
190 | "\treadrandom -- read N times in random order\n" | |
191 | "\treadmissing -- read N missing keys in random order\n" | |
192 | "\treadwhilewriting -- 1 writer, N threads doing random " | |
193 | "reads\n" | |
194 | "\treadwhilemerging -- 1 merger, N threads doing random " | |
195 | "reads\n" | |
11fdf7f2 TL |
196 | "\treadwhilescanning -- 1 thread doing full table scan, " |
197 | "N threads doing random reads\n" | |
7c673cae FG |
198 | "\treadrandomwriterandom -- N threads doing random-read, " |
199 | "random-write\n" | |
7c673cae FG |
200 | "\tupdaterandom -- N threads doing read-modify-write for random " |
201 | "keys\n" | |
11fdf7f2 TL |
202 | "\txorupdaterandom -- N threads doing read-XOR-write for " |
203 | "random keys\n" | |
7c673cae FG |
204 | "\tappendrandom -- N threads doing read-modify-write with " |
205 | "growing values\n" | |
206 | "\tmergerandom -- same as updaterandom/appendrandom using merge" | |
207 | " operator. " | |
208 | "Must be used with merge_operator\n" | |
209 | "\treadrandommergerandom -- perform N random read-or-merge " | |
210 | "operations. Must be used with merge_operator\n" | |
211 | "\tnewiterator -- repeated iterator creation\n" | |
212 | "\tseekrandom -- N random seeks, call Next seek_nexts times " | |
213 | "per seek\n" | |
214 | "\tseekrandomwhilewriting -- seekrandom and 1 thread doing " | |
215 | "overwrite\n" | |
216 | "\tseekrandomwhilemerging -- seekrandom and 1 thread doing " | |
217 | "merge\n" | |
1e59de90 TL |
218 | "\tcrc32c -- repeated crc32c of <block size> data\n" |
219 | "\txxhash -- repeated xxHash of <block size> data\n" | |
220 | "\txxhash64 -- repeated xxHash64 of <block size> data\n" | |
221 | "\txxh3 -- repeated XXH3 of <block size> data\n" | |
7c673cae FG |
222 | "\tacquireload -- load N*1000 times\n" |
223 | "\tfillseekseq -- write N values in sequential key, then read " | |
224 | "them by seeking to each key\n" | |
225 | "\trandomtransaction -- execute N random transactions and " | |
226 | "verify correctness\n" | |
227 | "\trandomreplacekeys -- randomly replaces N keys by deleting " | |
228 | "the old version and putting the new version\n\n" | |
229 | "\ttimeseries -- 1 writer generates time series data " | |
230 | "and multiple readers doing random reads on id\n\n" | |
231 | "Meta operations:\n" | |
11fdf7f2 TL |
232 | "\tcompact -- Compact the entire DB; If multiple, randomly choose one\n" |
233 | "\tcompactall -- Compact the entire DB\n" | |
1e59de90 TL |
234 | IF_ROCKSDB_LITE("", |
235 | "\tcompact0 -- compact L0 into L1\n" | |
236 | "\tcompact1 -- compact L1 into L2\n" | |
237 | "\twaitforcompaction - pause until compaction is (probably) done\n" | |
238 | ) | |
239 | "\tflush - flush the memtable\n" | |
7c673cae FG |
240 | "\tstats -- Print DB stats\n" |
241 | "\tresetstats -- Reset DB stats\n" | |
242 | "\tlevelstats -- Print the number of files and bytes per level\n" | |
1e59de90 | 243 | "\tmemstats -- Print memtable stats\n" |
7c673cae | 244 | "\tsstables -- Print sstable info\n" |
11fdf7f2 | 245 | "\theapprofile -- Dump a heap profile (if supported by this port)\n" |
1e59de90 | 246 | IF_ROCKSDB_LITE("", |
f67539c2 | 247 | "\treplay -- replay the trace file specified with trace_file\n" |
1e59de90 | 248 | ) |
f67539c2 TL |
249 | "\tgetmergeoperands -- Insert lots of merge records which are a list of " |
250 | "sorted ints for a key and then compare performance of lookup for another " | |
1e59de90 TL |
251 | "key by doing a Get followed by binary searching in the large sorted list " |
252 | "vs doing a GetMergeOperands and binary searching in the operands which " | |
253 | "are sorted sub-lists. The MergeOperator used is sortlist.h\n" | |
254 | "\treadrandomoperands -- read random keys using `GetMergeOperands()`. An " | |
255 | "operation includes a rare but possible retry in case it got " | |
256 | "`Status::Incomplete()`. This happens upon encountering more keys than " | |
257 | "have ever been seen by the thread (or eight initially)\n" | |
258 | "\tbackup -- Create a backup of the current DB and verify that a new backup is corrected. " | |
259 | "Rate limit can be specified through --backup_rate_limit\n" | |
260 | "\trestore -- Restore the DB from the latest backup available, rate limit can be specified through --restore_rate_limit\n"); | |
7c673cae FG |
261 | |
262 | DEFINE_int64(num, 1000000, "Number of key/values to place in database"); | |
263 | ||
264 | DEFINE_int64(numdistinct, 1000, | |
265 | "Number of distinct keys to use. Used in RandomWithVerify to " | |
266 | "read/write on fewer keys so that gets are more likely to find the" | |
267 | " key and puts are more likely to update the same key"); | |
268 | ||
269 | DEFINE_int64(merge_keys, -1, | |
270 | "Number of distinct keys to use for MergeRandom and " | |
271 | "ReadRandomMergeRandom. " | |
272 | "If negative, there will be FLAGS_num keys."); | |
273 | DEFINE_int32(num_column_families, 1, "Number of Column Families to use."); | |
274 | ||
275 | DEFINE_int32( | |
276 | num_hot_column_families, 0, | |
277 | "Number of Hot Column Families. If more than 0, only write to this " | |
278 | "number of column families. After finishing all the writes to them, " | |
279 | "create new set of column families and insert to them. Only used " | |
280 | "when num_column_families > 1."); | |
281 | ||
11fdf7f2 TL |
282 | DEFINE_string(column_family_distribution, "", |
283 | "Comma-separated list of percentages, where the ith element " | |
284 | "indicates the probability of an op using the ith column family. " | |
285 | "The number of elements must be `num_hot_column_families` if " | |
286 | "specified; otherwise, it must be `num_column_families`. The " | |
287 | "sum of elements must be 100. E.g., if `num_column_families=4`, " | |
288 | "and `num_hot_column_families=0`, a valid list could be " | |
289 | "\"10,20,30,40\"."); | |
290 | ||
1e59de90 TL |
291 | DEFINE_int64(reads, -1, |
292 | "Number of read operations to do. " | |
7c673cae FG |
293 | "If negative, do FLAGS_num reads."); |
294 | ||
1e59de90 TL |
295 | DEFINE_int64(deletes, -1, |
296 | "Number of delete operations to do. " | |
7c673cae FG |
297 | "If negative, do FLAGS_num deletions."); |
298 | ||
299 | DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality"); | |
300 | ||
1e59de90 TL |
301 | DEFINE_int64(seed, 0, |
302 | "Seed base for random number generators. " | |
303 | "When 0 it is derived from the current time."); | |
304 | static int64_t seed_base; | |
7c673cae FG |
305 | |
306 | DEFINE_int32(threads, 1, "Number of concurrent threads to run."); | |
307 | ||
1e59de90 TL |
308 | DEFINE_int32(duration, 0, |
309 | "Time in seconds for the random-ops tests to run." | |
7c673cae FG |
310 | " When 0 then num & reads determine the test duration"); |
311 | ||
f67539c2 TL |
312 | DEFINE_string(value_size_distribution_type, "fixed", |
313 | "Value size distribution type: fixed, uniform, normal"); | |
314 | ||
315 | DEFINE_int32(value_size, 100, "Size of each value in fixed distribution"); | |
316 | static unsigned int value_size = 100; | |
317 | ||
318 | DEFINE_int32(value_size_min, 100, "Min size of random value"); | |
319 | ||
320 | DEFINE_int32(value_size_max, 102400, "Max size of random value"); | |
7c673cae FG |
321 | |
322 | DEFINE_int32(seek_nexts, 0, | |
323 | "How many times to call Next() after Seek() in " | |
324 | "fillseekseq, seekrandom, seekrandomwhilewriting and " | |
325 | "seekrandomwhilemerging"); | |
326 | ||
327 | DEFINE_bool(reverse_iterator, false, | |
328 | "When true use Prev rather than Next for iterators that do " | |
329 | "Seek and then Next"); | |
330 | ||
1e59de90 TL |
331 | DEFINE_bool(auto_prefix_mode, false, "Set auto_prefix_mode for seek benchmark"); |
332 | ||
494da23a TL |
333 | DEFINE_int64(max_scan_distance, 0, |
334 | "Used to define iterate_upper_bound (or iterate_lower_bound " | |
335 | "if FLAGS_reverse_iterator is set to true) when value is nonzero"); | |
336 | ||
7c673cae FG |
337 | DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator"); |
338 | ||
7c673cae FG |
339 | DEFINE_int64(batch_size, 1, "Batch size"); |
340 | ||
// gflags validator hook for --key_size. Deliberately accepts every value:
// it exists only so the flag has a registered validator, keeping the
// registration pattern uniform with the other validated flags in this file.
static bool ValidateKeySize(const char* /*flagname*/, int32_t /*value*/) {
  return true;
}
344 | ||
// gflags validator: returns true iff `value` fits in a uint32_t.
// Used for flags that are stored as 64-bit but must stay within 32-bit range.
static bool ValidateUint32Range(const char* flagname, uint64_t value) {
  if (value > std::numeric_limits<uint32_t>::max()) {
    // Print with PRIu64 so the full 64-bit value is shown correctly on
    // LLP64 platforms (e.g. Windows), where `unsigned long` is only 32 bits
    // and the old `%lu` + `(unsigned long)` cast silently truncated the
    // reported value — and this branch only fires for values > 32 bits.
    fprintf(stderr, "Invalid value for --%s: %" PRIu64 ", overflow\n",
            flagname, value);
    return false;
  }
  return true;
}
353 | ||
354 | DEFINE_int32(key_size, 16, "size of each key"); | |
355 | ||
20effc67 TL |
356 | DEFINE_int32(user_timestamp_size, 0, |
357 | "number of bytes in a user-defined timestamp"); | |
358 | ||
7c673cae FG |
359 | DEFINE_int32(num_multi_db, 0, |
360 | "Number of DBs used in the benchmark. 0 means single DB."); | |
361 | ||
1e59de90 TL |
362 | DEFINE_double(compression_ratio, 0.5, |
363 | "Arrange to generate values that shrink to this fraction of " | |
364 | "their original size after compression"); | |
365 | ||
366 | DEFINE_double( | |
367 | overwrite_probability, 0.0, | |
368 | "Used in 'filluniquerandom' benchmark: for each write operation, " | |
369 | "we give a probability to perform an overwrite instead. The key used for " | |
370 | "the overwrite is randomly chosen from the last 'overwrite_window_size' " | |
371 | "keys previously inserted into the DB. " | |
372 | "Valid overwrite_probability values: [0.0, 1.0]."); | |
373 | ||
374 | DEFINE_uint32(overwrite_window_size, 1, | |
375 | "Used in 'filluniquerandom' benchmark. For each write operation," | |
376 | " when the overwrite_probability flag is set by the user, the " | |
377 | "key used to perform an overwrite is randomly chosen from the " | |
378 | "last 'overwrite_window_size' keys previously inserted into DB. " | |
379 | "Warning: large values can affect throughput. " | |
380 | "Valid overwrite_window_size values: [1, kMaxUint32]."); | |
381 | ||
382 | DEFINE_uint64( | |
383 | disposable_entries_delete_delay, 0, | |
384 | "Minimum delay in microseconds for the series of Deletes " | |
385 | "to be issued. When 0 the insertion of the last disposable entry is " | |
386 | "immediately followed by the issuance of the Deletes. " | |
387 | "(only compatible with fillanddeleteuniquerandom benchmark)."); | |
388 | ||
389 | DEFINE_uint64(disposable_entries_batch_size, 0, | |
390 | "Number of consecutively inserted disposable KV entries " | |
391 | "that will be deleted after 'delete_delay' microseconds. " | |
392 | "A series of Deletes is always issued once all the " | |
393 | "disposable KV entries it targets have been inserted " | |
394 | "into the DB. When 0 no deletes are issued and a " | |
395 | "regular 'filluniquerandom' benchmark occurs. " | |
396 | "(only compatible with fillanddeleteuniquerandom benchmark)"); | |
397 | ||
398 | DEFINE_int32(disposable_entries_value_size, 64, | |
399 | "Size of the values (in bytes) of the entries targeted by " | |
400 | "selective deletes. " | |
401 | "(only compatible with fillanddeleteuniquerandom benchmark)"); | |
402 | ||
403 | DEFINE_uint64( | |
404 | persistent_entries_batch_size, 0, | |
405 | "Number of KV entries being inserted right before the deletes " | |
406 | "targeting the disposable KV entries are issued. These " | |
407 | "persistent keys are not targeted by the deletes, and will always " | |
408 | "remain valid in the DB. (only compatible with " | |
409 | "--benchmarks='fillanddeleteuniquerandom' " | |
410 | "and used when--disposable_entries_batch_size is > 0)."); | |
411 | ||
412 | DEFINE_int32(persistent_entries_value_size, 64, | |
413 | "Size of the values (in bytes) of the entries not targeted by " | |
414 | "deletes. (only compatible with " | |
415 | "--benchmarks='fillanddeleteuniquerandom' " | |
416 | "and used when--disposable_entries_batch_size is > 0)."); | |
7c673cae FG |
417 | |
418 | DEFINE_double(read_random_exp_range, 0.0, | |
419 | "Read random's key will be generated using distribution of " | |
420 | "num * exp(-r) where r is uniform number from 0 to this value. " | |
421 | "The larger the number is, the more skewed the reads are. " | |
422 | "Only used in readrandom and multireadrandom benchmarks."); | |
423 | ||
424 | DEFINE_bool(histogram, false, "Print histogram of operation timings"); | |
425 | ||
1e59de90 TL |
426 | DEFINE_bool(confidence_interval_only, false, |
427 | "Print 95% confidence interval upper and lower bounds only for " | |
428 | "aggregate stats."); | |
429 | ||
7c673cae FG |
430 | DEFINE_bool(enable_numa, false, |
431 | "Make operations aware of NUMA architecture and bind memory " | |
432 | "and cpus corresponding to nodes together. In NUMA, memory " | |
433 | "in same node as CPUs are closer when compared to memory in " | |
434 | "other nodes. Reads can be faster when the process is bound to " | |
435 | "CPU and memory of same node. Use \"$numactl --hardware\" command " | |
436 | "to see NUMA memory architecture."); | |
437 | ||
f67539c2 TL |
438 | DEFINE_int64(db_write_buffer_size, |
439 | ROCKSDB_NAMESPACE::Options().db_write_buffer_size, | |
7c673cae FG |
440 | "Number of bytes to buffer in all memtables before compacting"); |
441 | ||
11fdf7f2 TL |
442 | DEFINE_bool(cost_write_buffer_to_cache, false, |
443 | "The usage of memtable is costed to the block cache"); | |
444 | ||
1e59de90 TL |
445 | DEFINE_int64(arena_block_size, ROCKSDB_NAMESPACE::Options().arena_block_size, |
446 | "The size, in bytes, of one block in arena memory allocation."); | |
447 | ||
f67539c2 | 448 | DEFINE_int64(write_buffer_size, ROCKSDB_NAMESPACE::Options().write_buffer_size, |
7c673cae FG |
449 | "Number of bytes to buffer in memtable before compacting"); |
450 | ||
451 | DEFINE_int32(max_write_buffer_number, | |
f67539c2 | 452 | ROCKSDB_NAMESPACE::Options().max_write_buffer_number, |
7c673cae | 453 | "The number of in-memory memtables. Each memtable is of size" |
11fdf7f2 | 454 | " write_buffer_size bytes."); |
7c673cae FG |
455 | |
456 | DEFINE_int32(min_write_buffer_number_to_merge, | |
f67539c2 | 457 | ROCKSDB_NAMESPACE::Options().min_write_buffer_number_to_merge, |
7c673cae FG |
458 | "The minimum number of write buffers that will be merged together" |
459 | "before writing to storage. This is cheap because it is an" | |
460 | "in-memory merge. If this feature is not enabled, then all these" | |
461 | "write buffers are flushed to L0 as separate files and this " | |
462 | "increases read amplification because a get request has to check" | |
463 | " in all of these files. Also, an in-memory merge may result in" | |
464 | " writing less data to storage if there are duplicate records " | |
465 | " in each of these individual write buffers."); | |
466 | ||
467 | DEFINE_int32(max_write_buffer_number_to_maintain, | |
f67539c2 | 468 | ROCKSDB_NAMESPACE::Options().max_write_buffer_number_to_maintain, |
7c673cae FG |
469 | "The total maximum number of write buffers to maintain in memory " |
470 | "including copies of buffers that have already been flushed. " | |
471 | "Unlike max_write_buffer_number, this parameter does not affect " | |
472 | "flushing. This controls the minimum amount of write history " | |
473 | "that will be available in memory for conflict checking when " | |
474 | "Transactions are used. If this value is too low, some " | |
475 | "transactions may fail at commit time due to not being able to " | |
476 | "determine whether there were any write conflicts. Setting this " | |
477 | "value to 0 will cause write buffers to be freed immediately " | |
478 | "after they are flushed. If this value is set to -1, " | |
479 | "'max_write_buffer_number' will be used."); | |
480 | ||
f67539c2 TL |
481 | DEFINE_int64(max_write_buffer_size_to_maintain, |
482 | ROCKSDB_NAMESPACE::Options().max_write_buffer_size_to_maintain, | |
483 | "The total maximum size of write buffers to maintain in memory " | |
484 | "including copies of buffers that have already been flushed. " | |
485 | "Unlike max_write_buffer_number, this parameter does not affect " | |
486 | "flushing. This controls the minimum amount of write history " | |
487 | "that will be available in memory for conflict checking when " | |
488 | "Transactions are used. If this value is too low, some " | |
489 | "transactions may fail at commit time due to not being able to " | |
490 | "determine whether there were any write conflicts. Setting this " | |
491 | "value to 0 will cause write buffers to be freed immediately " | |
492 | "after they are flushed. If this value is set to -1, " | |
493 | "'max_write_buffer_number' will be used."); | |
494 | ||
11fdf7f2 | 495 | DEFINE_int32(max_background_jobs, |
f67539c2 | 496 | ROCKSDB_NAMESPACE::Options().max_background_jobs, |
11fdf7f2 TL |
497 | "The maximum number of concurrent background jobs that can occur " |
498 | "in parallel."); | |
499 | ||
500 | DEFINE_int32(num_bottom_pri_threads, 0, | |
501 | "The number of threads in the bottom-priority thread pool (used " | |
502 | "by universal compaction only)."); | |
503 | ||
504 | DEFINE_int32(num_high_pri_threads, 0, | |
505 | "The maximum number of concurrent background compactions" | |
506 | " that can occur in parallel."); | |
507 | ||
508 | DEFINE_int32(num_low_pri_threads, 0, | |
509 | "The maximum number of concurrent background compactions" | |
510 | " that can occur in parallel."); | |
511 | ||
7c673cae | 512 | DEFINE_int32(max_background_compactions, |
f67539c2 | 513 | ROCKSDB_NAMESPACE::Options().max_background_compactions, |
7c673cae FG |
514 | "The maximum number of concurrent background compactions" |
515 | " that can occur in parallel."); | |
516 | ||
7c673cae FG |
517 | DEFINE_uint64(subcompactions, 1, |
518 | "Maximum number of subcompactions to divide L0-L1 compactions " | |
519 | "into."); | |
1e59de90 TL |
520 | static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) = |
521 | RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range); | |
7c673cae FG |
522 | |
523 | DEFINE_int32(max_background_flushes, | |
f67539c2 | 524 | ROCKSDB_NAMESPACE::Options().max_background_flushes, |
7c673cae FG |
525 | "The maximum number of concurrent background flushes" |
526 | " that can occur in parallel."); | |
527 | ||
f67539c2 TL |
528 | static ROCKSDB_NAMESPACE::CompactionStyle FLAGS_compaction_style_e; |
529 | DEFINE_int32(compaction_style, | |
530 | (int32_t)ROCKSDB_NAMESPACE::Options().compaction_style, | |
7c673cae FG |
531 | "style of compaction: level-based, universal and fifo"); |
532 | ||
f67539c2 TL |
533 | static ROCKSDB_NAMESPACE::CompactionPri FLAGS_compaction_pri_e; |
534 | DEFINE_int32(compaction_pri, | |
535 | (int32_t)ROCKSDB_NAMESPACE::Options().compaction_pri, | |
7c673cae FG |
536 | "priority of files to compaction: by size or by data age"); |
537 | ||
538 | DEFINE_int32(universal_size_ratio, 0, | |
1e59de90 TL |
539 | "Percentage flexibility while comparing file size " |
540 | "(for universal compaction only)."); | |
7c673cae | 541 | |
1e59de90 TL |
542 | DEFINE_int32(universal_min_merge_width, 0, |
543 | "The minimum number of files in a single compaction run " | |
544 | "(for universal compaction only)."); | |
7c673cae | 545 | |
1e59de90 TL |
546 | DEFINE_int32(universal_max_merge_width, 0, |
547 | "The max number of files to compact in universal style " | |
548 | "compaction"); | |
7c673cae FG |
549 | |
550 | DEFINE_int32(universal_max_size_amplification_percent, 0, | |
551 | "The max size amplification for universal style compaction"); | |
552 | ||
553 | DEFINE_int32(universal_compression_size_percent, -1, | |
554 | "The percentage of the database to compress for universal " | |
555 | "compaction. -1 means compress everything."); | |
556 | ||
557 | DEFINE_bool(universal_allow_trivial_move, false, | |
558 | "Allow trivial move in universal compaction."); | |
559 | ||
1e59de90 TL |
560 | DEFINE_bool(universal_incremental, false, |
561 | "Enable incremental compactions in universal compaction."); | |
562 | ||
7c673cae FG |
563 | DEFINE_int64(cache_size, 8 << 20, // 8MB |
564 | "Number of bytes to use as a cache of uncompressed data"); | |
565 | ||
1e59de90 | 566 | DEFINE_int32(cache_numshardbits, -1, |
7c673cae FG |
567 | "Number of shards for the block cache" |
568 | " is 2 ** cache_numshardbits. Negative means use default settings." | |
569 | " This is applied only if FLAGS_cache_size is non-negative."); | |
570 | ||
571 | DEFINE_double(cache_high_pri_pool_ratio, 0.0, | |
572 | "Ratio of block cache reserve for high pri blocks. " | |
573 | "If > 0.0, we also enable " | |
574 | "cache_index_and_filter_blocks_with_high_priority."); | |
575 | ||
1e59de90 TL |
576 | DEFINE_double(cache_low_pri_pool_ratio, 0.0, |
577 | "Ratio of block cache reserve for low pri blocks."); | |
578 | ||
579 | DEFINE_string(cache_type, "lru_cache", "Type of block cache."); | |
580 | ||
581 | DEFINE_bool(use_compressed_secondary_cache, false, | |
582 | "Use the CompressedSecondaryCache as the secondary cache."); | |
583 | ||
584 | DEFINE_int64(compressed_secondary_cache_size, 8 << 20, // 8MB | |
585 | "Number of bytes to use as a cache of data"); | |
586 | ||
587 | DEFINE_int32(compressed_secondary_cache_numshardbits, 6, | |
588 | "Number of shards for the block cache" | |
589 | " is 2 ** compressed_secondary_cache_numshardbits." | |
590 | " Negative means use default settings." | |
591 | " This is applied only if FLAGS_cache_size is non-negative."); | |
592 | ||
593 | DEFINE_double(compressed_secondary_cache_high_pri_pool_ratio, 0.0, | |
594 | "Ratio of block cache reserve for high pri blocks. " | |
595 | "If > 0.0, we also enable " | |
596 | "cache_index_and_filter_blocks_with_high_priority."); | |
597 | ||
598 | DEFINE_double(compressed_secondary_cache_low_pri_pool_ratio, 0.0, | |
599 | "Ratio of block cache reserve for low pri blocks."); | |
600 | ||
601 | DEFINE_string(compressed_secondary_cache_compression_type, "lz4", | |
602 | "The compression algorithm to use for large " | |
603 | "values stored in CompressedSecondaryCache."); | |
604 | static enum ROCKSDB_NAMESPACE::CompressionType | |
605 | FLAGS_compressed_secondary_cache_compression_type_e = | |
606 | ROCKSDB_NAMESPACE::kLZ4Compression; | |
607 | ||
608 | DEFINE_uint32( | |
609 | compressed_secondary_cache_compress_format_version, 2, | |
610 | "compress_format_version can have two values: " | |
611 | "compress_format_version == 1 -- decompressed size is not included" | |
612 | " in the block header." | |
613 | "compress_format_version == 2 -- decompressed size is included" | |
614 | " in the block header in varint32 format."); | |
7c673cae FG |
615 | |
616 | DEFINE_int64(simcache_size, -1, | |
617 | "Number of bytes to use as a simcache of " | |
618 | "uncompressed data. Nagative value disables simcache."); | |
619 | ||
620 | DEFINE_bool(cache_index_and_filter_blocks, false, | |
621 | "Cache index/filter blocks in block cache."); | |
622 | ||
1e59de90 TL |
623 | DEFINE_bool(use_cache_jemalloc_no_dump_allocator, false, |
624 | "Use JemallocNodumpAllocator for block/blob cache."); | |
625 | ||
20effc67 | 626 | DEFINE_bool(use_cache_memkind_kmem_allocator, false, |
1e59de90 | 627 | "Use memkind kmem allocator for block/blob cache."); |
20effc67 | 628 | |
11fdf7f2 TL |
629 | DEFINE_bool(partition_index_and_filters, false, |
630 | "Partition index and filter blocks."); | |
631 | ||
632 | DEFINE_bool(partition_index, false, "Partition index blocks"); | |
633 | ||
20effc67 TL |
634 | DEFINE_bool(index_with_first_key, false, "Include first key in the index"); |
635 | ||
636 | DEFINE_bool( | |
637 | optimize_filters_for_memory, | |
638 | ROCKSDB_NAMESPACE::BlockBasedTableOptions().optimize_filters_for_memory, | |
639 | "Minimize memory footprint of filters"); | |
640 | ||
641 | DEFINE_int64( | |
642 | index_shortening_mode, 2, | |
643 | "mode to shorten index: 0 for no shortening; 1 for only shortening " | |
644 | "separaters; 2 for shortening shortening and successor"); | |
645 | ||
11fdf7f2 | 646 | DEFINE_int64(metadata_block_size, |
f67539c2 | 647 | ROCKSDB_NAMESPACE::BlockBasedTableOptions().metadata_block_size, |
11fdf7f2 TL |
648 | "Max partition size when partitioning index/filters"); |
649 | ||
650 | // The default reduces the overhead of reading time with flash. With HDD, which | |
651 | // offers much less throughput, however, this number better to be set to 1. | |
652 | DEFINE_int32(ops_between_duration_checks, 1000, | |
653 | "Check duration limit every x ops"); | |
654 | ||
7c673cae FG |
655 | DEFINE_bool(pin_l0_filter_and_index_blocks_in_cache, false, |
656 | "Pin index/filter blocks of L0 files in block cache."); | |
657 | ||
11fdf7f2 TL |
658 | DEFINE_bool( |
659 | pin_top_level_index_and_filter, false, | |
660 | "Pin top-level index of partitioned index/filter blocks in block cache."); | |
661 | ||
7c673cae | 662 | DEFINE_int32(block_size, |
f67539c2 TL |
663 | static_cast<int32_t>( |
664 | ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_size), | |
7c673cae FG |
665 | "Number of bytes in a block."); |
666 | ||
f67539c2 TL |
667 | DEFINE_int32(format_version, |
668 | static_cast<int32_t>( | |
669 | ROCKSDB_NAMESPACE::BlockBasedTableOptions().format_version), | |
670 | "Format version of SST files."); | |
11fdf7f2 | 671 | |
7c673cae | 672 | DEFINE_int32(block_restart_interval, |
f67539c2 | 673 | ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_restart_interval, |
7c673cae FG |
674 | "Number of keys between restart points " |
675 | "for delta encoding of keys in data block."); | |
676 | ||
f67539c2 TL |
677 | DEFINE_int32( |
678 | index_block_restart_interval, | |
679 | ROCKSDB_NAMESPACE::BlockBasedTableOptions().index_block_restart_interval, | |
680 | "Number of keys between restart points " | |
681 | "for delta encoding of keys in index block."); | |
7c673cae FG |
682 | |
683 | DEFINE_int32(read_amp_bytes_per_bit, | |
f67539c2 | 684 | ROCKSDB_NAMESPACE::BlockBasedTableOptions().read_amp_bytes_per_bit, |
7c673cae FG |
685 | "Number of bytes per bit to be used in block read-amp bitmap"); |
686 | ||
f67539c2 TL |
687 | DEFINE_bool( |
688 | enable_index_compression, | |
689 | ROCKSDB_NAMESPACE::BlockBasedTableOptions().enable_index_compression, | |
690 | "Compress the index block"); | |
11fdf7f2 | 691 | |
f67539c2 TL |
692 | DEFINE_bool(block_align, |
693 | ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_align, | |
11fdf7f2 TL |
694 | "Align data blocks on page size"); |
695 | ||
1e59de90 TL |
696 | DEFINE_int64(prepopulate_block_cache, 0, |
697 | "Pre-populate hot/warm blocks in block cache. 0 to disable and 1 " | |
698 | "to insert during flush"); | |
699 | ||
11fdf7f2 TL |
700 | DEFINE_bool(use_data_block_hash_index, false, |
701 | "if use kDataBlockBinaryAndHash " | |
702 | "instead of kDataBlockBinarySearch. " | |
703 | "This is valid if only we use BlockTable"); | |
704 | ||
705 | DEFINE_double(data_block_hash_table_util_ratio, 0.75, | |
706 | "util ratio for data block hash index table. " | |
707 | "This is only valid if use_data_block_hash_index is " | |
708 | "set to true"); | |
709 | ||
7c673cae FG |
710 | DEFINE_int64(compressed_cache_size, -1, |
711 | "Number of bytes to use as a cache of compressed data."); | |
712 | ||
713 | DEFINE_int64(row_cache_size, 0, | |
714 | "Number of bytes to use as a cache of individual rows" | |
715 | " (0 = disabled)."); | |
716 | ||
f67539c2 | 717 | DEFINE_int32(open_files, ROCKSDB_NAMESPACE::Options().max_open_files, |
7c673cae FG |
718 | "Maximum number of files to keep open at the same time" |
719 | " (use default if == 0)"); | |
720 | ||
f67539c2 TL |
721 | DEFINE_int32(file_opening_threads, |
722 | ROCKSDB_NAMESPACE::Options().max_file_opening_threads, | |
7c673cae FG |
723 | "If open_files is set to -1, this option set the number of " |
724 | "threads that will be used to open files during DB::Open()"); | |
725 | ||
7c673cae FG |
726 | DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size"); |
727 | ||
f67539c2 TL |
728 | DEFINE_int32(log_readahead_size, 0, "WAL and manifest readahead size"); |
729 | ||
7c673cae FG |
730 | DEFINE_int32(random_access_max_buffer_size, 1024 * 1024, |
731 | "Maximum windows randomaccess buffer size"); | |
732 | ||
733 | DEFINE_int32(writable_file_max_buffer_size, 1024 * 1024, | |
734 | "Maximum write buffer for Writable File"); | |
735 | ||
1e59de90 TL |
736 | DEFINE_int32(bloom_bits, -1, |
737 | "Bloom filter bits per key. Negative means use default." | |
738 | "Zero disables."); | |
739 | ||
740 | DEFINE_bool(use_ribbon_filter, false, "Use Ribbon instead of Bloom filter"); | |
741 | ||
7c673cae FG |
742 | DEFINE_double(memtable_bloom_size_ratio, 0, |
743 | "Ratio of memtable size used for bloom filter. 0 means no bloom " | |
744 | "filter."); | |
494da23a TL |
745 | DEFINE_bool(memtable_whole_key_filtering, false, |
746 | "Try to use whole key bloom filter in memtables."); | |
7c673cae FG |
747 | DEFINE_bool(memtable_use_huge_page, false, |
748 | "Try to use huge page in memtables."); | |
749 | ||
1e59de90 TL |
750 | DEFINE_bool(whole_key_filtering, |
751 | ROCKSDB_NAMESPACE::BlockBasedTableOptions().whole_key_filtering, | |
752 | "Use whole keys (in addition to prefixes) in SST bloom filter."); | |
753 | ||
754 | DEFINE_bool(use_existing_db, false, | |
755 | "If true, do not destroy the existing database. If you set this " | |
756 | "flag and also specify a benchmark that wants a fresh database, " | |
757 | "that benchmark will fail."); | |
7c673cae | 758 | |
494da23a TL |
759 | DEFINE_bool(use_existing_keys, false, |
760 | "If true, uses existing keys in the DB, " | |
761 | "rather than generating new ones. This involves some startup " | |
762 | "latency to load all keys into memory. It is supported for the " | |
763 | "same read/overwrite benchmarks as `-use_existing_db=true`, which " | |
764 | "must also be set for this flag to be enabled. When this flag is " | |
765 | "set, the value for `-num` will be ignored."); | |
766 | ||
7c673cae FG |
767 | DEFINE_bool(show_table_properties, false, |
768 | "If true, then per-level table" | |
769 | " properties will be printed on every stats-interval when" | |
770 | " stats_interval is set and stats_per_interval is on."); | |
771 | ||
772 | DEFINE_string(db, "", "Use the db with the following name."); | |
773 | ||
1e59de90 TL |
774 | DEFINE_bool(progress_reports, true, |
775 | "If true, db_bench will report number of finished operations."); | |
776 | ||
7c673cae FG |
777 | // Read cache flags |
778 | ||
779 | DEFINE_string(read_cache_path, "", | |
780 | "If not empty string, a read cache will be used in this path"); | |
781 | ||
782 | DEFINE_int64(read_cache_size, 4LL * 1024 * 1024 * 1024, | |
783 | "Maximum size of the read cache"); | |
784 | ||
785 | DEFINE_bool(read_cache_direct_write, true, | |
786 | "Whether to use Direct IO for writing to the read cache"); | |
787 | ||
788 | DEFINE_bool(read_cache_direct_read, true, | |
789 | "Whether to use Direct IO for reading from read cache"); | |
790 | ||
11fdf7f2 TL |
791 | DEFINE_bool(use_keep_filter, false, "Whether to use a noop compaction filter"); |
792 | ||
7c673cae FG |
// gflags validator for cache shard-bit flags. Values of 20 or more are
// rejected (2^20 shards is already unreasonable); negative values pass
// through, since downstream code treats them as "use default settings".
static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) {
  const bool within_limit = value < 20;
  if (!within_limit) {
    fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n", flagname,
            value);
  }
  return within_limit;
}
801 | ||
11fdf7f2 | 802 | DEFINE_bool(verify_checksum, true, |
1e59de90 TL |
803 | "Verify checksum for every block read from storage"); |
804 | ||
805 | DEFINE_int32(checksum_type, | |
806 | ROCKSDB_NAMESPACE::BlockBasedTableOptions().checksum, | |
807 | "ChecksumType as an int"); | |
7c673cae FG |
808 | |
809 | DEFINE_bool(statistics, false, "Database statistics"); | |
f67539c2 | 810 | DEFINE_int32(stats_level, ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers, |
494da23a | 811 | "stats level for statistics"); |
7c673cae | 812 | DEFINE_string(statistics_string, "", "Serialized statistics string"); |
f67539c2 | 813 | static class std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats; |
7c673cae | 814 | |
1e59de90 TL |
815 | DEFINE_int64(writes, -1, |
816 | "Number of write operations to do. If negative, do --num reads."); | |
7c673cae | 817 | |
1e59de90 TL |
818 | DEFINE_bool(finish_after_writes, false, |
819 | "Write thread terminates after all writes are finished"); | |
7c673cae FG |
820 | |
821 | DEFINE_bool(sync, false, "Sync all writes to disk"); | |
822 | ||
823 | DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync"); | |
824 | ||
825 | DEFINE_bool(disable_wal, false, "If true, do not write WAL for write."); | |
826 | ||
1e59de90 TL |
827 | DEFINE_bool(manual_wal_flush, false, |
828 | "If true, buffer WAL until buffer is full or a manual FlushWAL()."); | |
829 | ||
830 | DEFINE_string(wal_compression, "none", | |
831 | "Algorithm to use for WAL compression. none to disable."); | |
832 | static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_wal_compression_e = | |
833 | ROCKSDB_NAMESPACE::kNoCompression; | |
834 | ||
7c673cae FG |
835 | DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL"); |
836 | ||
837 | DEFINE_string(truth_db, "/dev/shm/truth_db/dbbench", | |
838 | "Truth key/values used when using verify"); | |
839 | ||
840 | DEFINE_int32(num_levels, 7, "The total number of levels"); | |
841 | ||
f67539c2 TL |
842 | DEFINE_int64(target_file_size_base, |
843 | ROCKSDB_NAMESPACE::Options().target_file_size_base, | |
7c673cae FG |
844 | "Target file size at level-1"); |
845 | ||
846 | DEFINE_int32(target_file_size_multiplier, | |
f67539c2 | 847 | ROCKSDB_NAMESPACE::Options().target_file_size_multiplier, |
7c673cae FG |
848 | "A multiplier to compute target level-N file size (N >= 2)"); |
849 | ||
850 | DEFINE_uint64(max_bytes_for_level_base, | |
f67539c2 | 851 | ROCKSDB_NAMESPACE::Options().max_bytes_for_level_base, |
7c673cae FG |
852 | "Max bytes for level-1"); |
853 | ||
854 | DEFINE_bool(level_compaction_dynamic_level_bytes, false, | |
855 | "Whether level size base is dynamic"); | |
856 | ||
857 | DEFINE_double(max_bytes_for_level_multiplier, 10, | |
858 | "A multiplier to compute max bytes for level-N (N >= 2)"); | |
859 | ||
860 | static std::vector<int> FLAGS_max_bytes_for_level_multiplier_additional_v; | |
861 | DEFINE_string(max_bytes_for_level_multiplier_additional, "", | |
862 | "A vector that specifies additional fanout per level"); | |
863 | ||
864 | DEFINE_int32(level0_stop_writes_trigger, | |
f67539c2 | 865 | ROCKSDB_NAMESPACE::Options().level0_stop_writes_trigger, |
1e59de90 | 866 | "Number of files in level-0 that will trigger put stop."); |
7c673cae FG |
867 | |
868 | DEFINE_int32(level0_slowdown_writes_trigger, | |
f67539c2 | 869 | ROCKSDB_NAMESPACE::Options().level0_slowdown_writes_trigger, |
1e59de90 | 870 | "Number of files in level-0 that will slow down writes."); |
7c673cae FG |
871 | |
872 | DEFINE_int32(level0_file_num_compaction_trigger, | |
f67539c2 | 873 | ROCKSDB_NAMESPACE::Options().level0_file_num_compaction_trigger, |
1e59de90 | 874 | "Number of files in level-0 when compactions start."); |
7c673cae | 875 | |
20effc67 TL |
876 | DEFINE_uint64(periodic_compaction_seconds, |
877 | ROCKSDB_NAMESPACE::Options().periodic_compaction_seconds, | |
878 | "Files older than this will be picked up for compaction and" | |
879 | " rewritten to the same level"); | |
880 | ||
1e59de90 TL |
881 | DEFINE_uint64(ttl_seconds, ROCKSDB_NAMESPACE::Options().ttl, "Set options.ttl"); |
882 | ||
// gflags validator for percentage-style flags: accepts only values strictly
// between 0 and 100, printing a diagnostic to stderr otherwise.
static bool ValidateInt32Percent(const char* flagname, int32_t value) {
  if (value > 0 && value < 100) {
    return true;
  }
  fprintf(stderr, "Invalid value for --%s: %d, 0< pct <100 \n", flagname,
          value);
  return false;
}
1e59de90 TL |
891 | DEFINE_int32(readwritepercent, 90, |
892 | "Ratio of reads to reads/writes (expressed as percentage) for " | |
893 | "the ReadRandomWriteRandom workload. The default value 90 means " | |
894 | "90% operations out of all reads and writes operations are " | |
895 | "reads. In other words, 9 gets for every 1 put."); | |
896 | ||
897 | DEFINE_int32(mergereadpercent, 70, | |
898 | "Ratio of merges to merges&reads (expressed as percentage) for " | |
899 | "the ReadRandomMergeRandom workload. The default value 70 means " | |
900 | "70% out of all read and merge operations are merges. In other " | |
901 | "words, 7 merges for every 3 gets."); | |
902 | ||
903 | DEFINE_int32(deletepercent, 2, | |
904 | "Percentage of deletes out of reads/writes/deletes (used in " | |
905 | "RandomWithVerify only). RandomWithVerify " | |
7c673cae FG |
906 | "calculates writepercent as (100 - FLAGS_readwritepercent - " |
907 | "deletepercent), so deletepercent must be smaller than (100 - " | |
908 | "FLAGS_readwritepercent)"); | |
909 | ||
1e59de90 TL |
910 | DEFINE_bool(optimize_filters_for_hits, |
911 | ROCKSDB_NAMESPACE::Options().optimize_filters_for_hits, | |
7c673cae FG |
912 | "Optimizes bloom filters for workloads for most lookups return " |
913 | "a value. For now this doesn't create bloom filters for the max " | |
914 | "level of the LSM to reduce metadata that should fit in RAM. "); | |
915 | ||
1e59de90 TL |
916 | DEFINE_bool(paranoid_checks, ROCKSDB_NAMESPACE::Options().paranoid_checks, |
917 | "RocksDB will aggressively check consistency of the data."); | |
918 | ||
919 | DEFINE_bool(force_consistency_checks, | |
920 | ROCKSDB_NAMESPACE::Options().force_consistency_checks, | |
921 | "Runs consistency checks on the LSM every time a change is " | |
922 | "applied."); | |
923 | ||
924 | DEFINE_bool(check_flush_compaction_key_order, | |
925 | ROCKSDB_NAMESPACE::Options().check_flush_compaction_key_order, | |
926 | "During flush or compaction, check whether keys inserted to " | |
927 | "output files are in order."); | |
928 | ||
7c673cae FG |
929 | DEFINE_uint64(delete_obsolete_files_period_micros, 0, |
930 | "Ignored. Left here for backward compatibility"); | |
931 | ||
494da23a TL |
932 | DEFINE_int64(writes_before_delete_range, 0, |
933 | "Number of writes before DeleteRange is called regularly."); | |
934 | ||
7c673cae | 935 | DEFINE_int64(writes_per_range_tombstone, 0, |
494da23a | 936 | "Number of writes between range tombstones"); |
7c673cae FG |
937 | |
938 | DEFINE_int64(range_tombstone_width, 100, "Number of keys in tombstone's range"); | |
939 | ||
940 | DEFINE_int64(max_num_range_tombstones, 0, | |
1e59de90 | 941 | "Maximum number of range tombstones to insert."); |
7c673cae FG |
942 | |
943 | DEFINE_bool(expand_range_tombstones, false, | |
944 | "Expand range tombstone into sequential regular tombstones."); | |
945 | ||
946 | #ifndef ROCKSDB_LITE | |
11fdf7f2 | 947 | // Transactions Options |
7c673cae FG |
948 | DEFINE_bool(optimistic_transaction_db, false, |
949 | "Open a OptimisticTransactionDB instance. " | |
950 | "Required for randomtransaction benchmark."); | |
951 | ||
952 | DEFINE_bool(transaction_db, false, | |
953 | "Open a TransactionDB instance. " | |
954 | "Required for randomtransaction benchmark."); | |
955 | ||
956 | DEFINE_uint64(transaction_sets, 2, | |
957 | "Number of keys each transaction will " | |
958 | "modify (use in RandomTransaction only). Max: 9999"); | |
959 | ||
960 | DEFINE_bool(transaction_set_snapshot, false, | |
961 | "Setting to true will have each transaction call SetSnapshot()" | |
962 | " upon creation."); | |
963 | ||
964 | DEFINE_int32(transaction_sleep, 0, | |
965 | "Max microseconds to sleep in between " | |
966 | "reading and writing a value (used in RandomTransaction only). "); | |
967 | ||
968 | DEFINE_uint64(transaction_lock_timeout, 100, | |
969 | "If using a transaction_db, specifies the lock wait timeout in" | |
970 | " milliseconds before failing a transaction waiting on a lock"); | |
971 | DEFINE_string( | |
972 | options_file, "", | |
973 | "The path to a RocksDB options file. If specified, then db_bench will " | |
974 | "run with the RocksDB options in the default column family of the " | |
975 | "specified options file. " | |
976 | "Note that with this setting, db_bench will ONLY accept the following " | |
977 | "RocksDB options related command-line arguments, all other arguments " | |
978 | "that are related to RocksDB options will be ignored:\n" | |
979 | "\t--use_existing_db\n" | |
494da23a | 980 | "\t--use_existing_keys\n" |
7c673cae FG |
981 | "\t--statistics\n" |
982 | "\t--row_cache_size\n" | |
983 | "\t--row_cache_numshardbits\n" | |
984 | "\t--enable_io_prio\n" | |
985 | "\t--dump_malloc_stats\n" | |
986 | "\t--num_multi_db\n"); | |
987 | ||
11fdf7f2 | 988 | // FIFO Compaction Options |
7c673cae FG |
989 | DEFINE_uint64(fifo_compaction_max_table_files_size_mb, 0, |
990 | "The limit of total table file sizes to trigger FIFO compaction"); | |
11fdf7f2 TL |
991 | |
992 | DEFINE_bool(fifo_compaction_allow_compaction, true, | |
993 | "Allow compaction in FIFO compaction."); | |
994 | ||
995 | DEFINE_uint64(fifo_compaction_ttl, 0, "TTL for the SST Files in seconds."); | |
996 | ||
1e59de90 TL |
997 | DEFINE_uint64(fifo_age_for_warm, 0, "age_for_warm for FIFO compaction."); |
998 | ||
999 | // Stacked BlobDB Options | |
1000 | DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Open a BlobDB instance."); | |
11fdf7f2 | 1001 | |
f67539c2 TL |
1002 | DEFINE_bool( |
1003 | blob_db_enable_gc, | |
1004 | ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection, | |
1e59de90 | 1005 | "[Stacked BlobDB] Enable BlobDB garbage collection."); |
11fdf7f2 | 1006 | |
f67539c2 TL |
1007 | DEFINE_double( |
1008 | blob_db_gc_cutoff, | |
1009 | ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff, | |
1e59de90 | 1010 | "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection."); |
11fdf7f2 | 1011 | |
f67539c2 TL |
1012 | DEFINE_bool(blob_db_is_fifo, |
1013 | ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().is_fifo, | |
1e59de90 | 1014 | "[Stacked BlobDB] Enable FIFO eviction strategy in BlobDB."); |
f67539c2 TL |
1015 | |
1016 | DEFINE_uint64(blob_db_max_db_size, | |
1017 | ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().max_db_size, | |
1e59de90 TL |
1018 | "[Stacked BlobDB] Max size limit of the directory where blob " |
1019 | "files are stored."); | |
11fdf7f2 | 1020 | |
1e59de90 TL |
1021 | DEFINE_uint64(blob_db_max_ttl_range, 0, |
1022 | "[Stacked BlobDB] TTL range to generate BlobDB data (in " | |
1023 | "seconds). 0 means no TTL."); | |
11fdf7f2 | 1024 | |
1e59de90 TL |
1025 | DEFINE_uint64( |
1026 | blob_db_ttl_range_secs, | |
1027 | ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().ttl_range_secs, | |
1028 | "[Stacked BlobDB] TTL bucket size to use when creating blob files."); | |
11fdf7f2 | 1029 | |
1e59de90 TL |
1030 | DEFINE_uint64( |
1031 | blob_db_min_blob_size, | |
1032 | ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size, | |
1033 | "[Stacked BlobDB] Smallest blob to store in a file. Blobs " | |
1034 | "smaller than this will be inlined with the key in the LSM tree."); | |
11fdf7f2 | 1035 | |
f67539c2 TL |
1036 | DEFINE_uint64(blob_db_bytes_per_sync, |
1037 | ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync, | |
1e59de90 | 1038 | "[Stacked BlobDB] Bytes to sync blob file at."); |
11fdf7f2 | 1039 | |
f67539c2 TL |
1040 | DEFINE_uint64(blob_db_file_size, |
1041 | ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size, | |
1e59de90 | 1042 | "[Stacked BlobDB] Target size of each blob file."); |
11fdf7f2 | 1043 | |
1e59de90 TL |
1044 | DEFINE_string( |
1045 | blob_db_compression_type, "snappy", | |
1046 | "[Stacked BlobDB] Algorithm to use to compress blobs in blob files."); | |
f67539c2 TL |
1047 | static enum ROCKSDB_NAMESPACE::CompressionType |
1048 | FLAGS_blob_db_compression_type_e = ROCKSDB_NAMESPACE::kSnappyCompression; | |
1049 | ||
1e59de90 TL |
1050 | #endif // ROCKSDB_LITE |
1051 | ||
1052 | // Integrated BlobDB options | |
1053 | DEFINE_bool( | |
1054 | enable_blob_files, | |
1055 | ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().enable_blob_files, | |
1056 | "[Integrated BlobDB] Enable writing large values to separate blob files."); | |
1057 | ||
1058 | DEFINE_uint64(min_blob_size, | |
1059 | ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().min_blob_size, | |
1060 | "[Integrated BlobDB] The size of the smallest value to be stored " | |
1061 | "separately in a blob file."); | |
1062 | ||
1063 | DEFINE_uint64(blob_file_size, | |
1064 | ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_size, | |
1065 | "[Integrated BlobDB] The size limit for blob files."); | |
1066 | ||
1067 | DEFINE_string(blob_compression_type, "none", | |
1068 | "[Integrated BlobDB] The compression algorithm to use for large " | |
1069 | "values stored in blob files."); | |
1070 | ||
1071 | DEFINE_bool(enable_blob_garbage_collection, | |
1072 | ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() | |
1073 | .enable_blob_garbage_collection, | |
1074 | "[Integrated BlobDB] Enable blob garbage collection."); | |
1075 | ||
1076 | DEFINE_double(blob_garbage_collection_age_cutoff, | |
1077 | ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() | |
1078 | .blob_garbage_collection_age_cutoff, | |
1079 | "[Integrated BlobDB] The cutoff in terms of blob file age for " | |
1080 | "garbage collection."); | |
1081 | ||
1082 | DEFINE_double(blob_garbage_collection_force_threshold, | |
1083 | ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() | |
1084 | .blob_garbage_collection_force_threshold, | |
1085 | "[Integrated BlobDB] The threshold for the ratio of garbage in " | |
1086 | "the oldest blob files for forcing garbage collection."); | |
1087 | ||
1088 | DEFINE_uint64(blob_compaction_readahead_size, | |
1089 | ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() | |
1090 | .blob_compaction_readahead_size, | |
1091 | "[Integrated BlobDB] Compaction readahead for blob files."); | |
1092 | ||
1093 | DEFINE_int32( | |
1094 | blob_file_starting_level, | |
1095 | ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_starting_level, | |
1096 | "[Integrated BlobDB] The starting level for blob files."); | |
1097 | ||
1098 | DEFINE_bool(use_blob_cache, false, "[Integrated BlobDB] Enable blob cache."); | |
1099 | ||
1100 | DEFINE_bool( | |
1101 | use_shared_block_and_blob_cache, true, | |
1102 | "[Integrated BlobDB] Use a shared backing cache for both block " | |
1103 | "cache and blob cache. It only takes effect if use_blob_cache is enabled."); | |
1104 | ||
1105 | DEFINE_uint64( | |
1106 | blob_cache_size, 8 << 20, | |
1107 | "[Integrated BlobDB] Number of bytes to use as a cache of blobs. It only " | |
1108 | "takes effect if the block and blob caches are different " | |
1109 | "(use_shared_block_and_blob_cache = false)."); | |
1110 | ||
1111 | DEFINE_int32(blob_cache_numshardbits, 6, | |
1112 | "[Integrated BlobDB] Number of shards for the blob cache is 2 ** " | |
1113 | "blob_cache_numshardbits. Negative means use default settings. " | |
1114 | "It only takes effect if blob_cache_size is greater than 0, and " | |
1115 | "the block and blob caches are different " | |
1116 | "(use_shared_block_and_blob_cache = false)."); | |
1117 | ||
1118 | DEFINE_int32(prepopulate_blob_cache, 0, | |
1119 | "[Integrated BlobDB] Pre-populate hot/warm blobs in blob cache. 0 " | |
1120 | "to disable and 1 to insert during flush."); | |
1121 | ||
1122 | #ifndef ROCKSDB_LITE | |
1123 | ||
f67539c2 TL |
1124 | // Secondary DB instance Options |
1125 | DEFINE_bool(use_secondary_db, false, | |
1126 | "Open a RocksDB secondary instance. A primary instance can be " | |
1127 | "running in another db_bench process."); | |
1128 | ||
1129 | DEFINE_string(secondary_path, "", | |
1130 | "Path to a directory used by the secondary instance to store " | |
1131 | "private files, e.g. info log."); | |
1132 | ||
1133 | DEFINE_int32(secondary_update_interval, 5, | |
1134 | "Secondary instance attempts to catch up with the primary every " | |
1135 | "secondary_update_interval seconds."); | |
1136 | ||
7c673cae FG |
1137 | #endif // ROCKSDB_LITE |
1138 | ||
1139 | DEFINE_bool(report_bg_io_stats, false, | |
1140 | "Measure times spents on I/Os while in compactions. "); | |
1141 | ||
1142 | DEFINE_bool(use_stderr_info_logger, false, | |
1143 | "Write info logs to stderr instead of to LOG file. "); | |
1144 | ||
1e59de90 TL |
1145 | #ifndef ROCKSDB_LITE |
1146 | ||
11fdf7f2 | 1147 | DEFINE_string(trace_file, "", "Trace workload to a file. "); |
7c673cae | 1148 | |
1e59de90 TL |
1149 | DEFINE_double(trace_replay_fast_forward, 1.0, |
1150 | "Fast forward trace replay, must > 0.0."); | |
f67539c2 TL |
1151 | DEFINE_int32(block_cache_trace_sampling_frequency, 1, |
1152 | "Block cache trace sampling frequency, termed s. It uses spatial " | |
1153 | "downsampling and samples accesses to one out of s blocks."); | |
1154 | DEFINE_int64( | |
1155 | block_cache_trace_max_trace_file_size_in_bytes, | |
1156 | uint64_t{64} * 1024 * 1024 * 1024, | |
1157 | "The maximum block cache trace file size in bytes. Block cache accesses " | |
1158 | "will not be logged if the trace file size exceeds this threshold. Default " | |
1159 | "is 64 GB."); | |
1160 | DEFINE_string(block_cache_trace_file, "", "Block cache trace file path."); | |
1161 | DEFINE_int32(trace_replay_threads, 1, | |
1162 | "The number of threads to replay, must >=1."); | |
1163 | ||
1e59de90 TL |
1164 | DEFINE_bool(io_uring_enabled, true, |
1165 | "If true, enable the use of IO uring if the platform supports it"); | |
1166 | extern "C" bool RocksDbIOUringEnable() { return FLAGS_io_uring_enabled; } | |
1167 | #endif // ROCKSDB_LITE | |
1168 | ||
1169 | DEFINE_bool(adaptive_readahead, false, | |
1170 | "carry forward internal auto readahead size from one file to next " | |
1171 | "file at each level during iteration"); | |
1172 | ||
1173 | DEFINE_bool(rate_limit_user_ops, false, | |
1174 | "When true use Env::IO_USER priority level to charge internal rate " | |
1175 | "limiter for reads associated with user operations."); | |
1176 | ||
1177 | DEFINE_bool(file_checksum, false, | |
1178 | "When true use FileChecksumGenCrc32cFactory for " | |
1179 | "file_checksum_gen_factory."); | |
1180 | ||
1181 | DEFINE_bool(rate_limit_auto_wal_flush, false, | |
1182 | "When true use Env::IO_USER priority level to charge internal rate " | |
1183 | "limiter for automatic WAL flush (`Options::manual_wal_flush` == " | |
1184 | "false) after the user write operation."); | |
1185 | ||
1186 | DEFINE_bool(async_io, false, | |
1187 | "When set true, RocksDB does asynchronous reads for internal auto " | |
1188 | "readahead prefetching."); | |
1189 | ||
1190 | DEFINE_bool(optimize_multiget_for_io, true, | |
1191 | "When set true, RocksDB does asynchronous reads for SST files in " | |
1192 | "multiple levels for MultiGet."); | |
1193 | ||
1194 | DEFINE_bool(charge_compression_dictionary_building_buffer, false, | |
1195 | "Setting for " | |
1196 | "CacheEntryRoleOptions::charged of " | |
1197 | "CacheEntryRole::kCompressionDictionaryBuildingBuffer"); | |
1198 | ||
1199 | DEFINE_bool(charge_filter_construction, false, | |
1200 | "Setting for " | |
1201 | "CacheEntryRoleOptions::charged of " | |
1202 | "CacheEntryRole::kFilterConstruction"); | |
1203 | ||
1204 | DEFINE_bool(charge_table_reader, false, | |
1205 | "Setting for " | |
1206 | "CacheEntryRoleOptions::charged of " | |
1207 | "CacheEntryRole::kBlockBasedTableReader"); | |
1208 | ||
1209 | DEFINE_bool(charge_file_metadata, false, | |
1210 | "Setting for " | |
1211 | "CacheEntryRoleOptions::charged of " | |
1212 | "CacheEntryRole::kFileMetadata"); | |
1213 | ||
1214 | DEFINE_bool(charge_blob_cache, false, | |
1215 | "Setting for " | |
1216 | "CacheEntryRoleOptions::charged of " | |
1217 | "CacheEntryRole::kBlobCache"); | |
1218 | ||
1219 | DEFINE_uint64(backup_rate_limit, 0ull, | |
1220 | "If non-zero, db_bench will rate limit reads and writes for DB " | |
1221 | "backup. This " | |
1222 | "is the global rate in ops/second."); | |
1223 | ||
1224 | DEFINE_uint64(restore_rate_limit, 0ull, | |
1225 | "If non-zero, db_bench will rate limit reads and writes for DB " | |
1226 | "restore. This " | |
1227 | "is the global rate in ops/second."); | |
1228 | ||
1229 | DEFINE_string(backup_dir, "", | |
1230 | "If not empty string, use the given dir for backup."); | |
1231 | ||
1232 | DEFINE_string(restore_dir, "", | |
1233 | "If not empty string, use the given dir for restore."); | |
1234 | ||
1235 | DEFINE_uint64( | |
1236 | initial_auto_readahead_size, | |
1237 | ROCKSDB_NAMESPACE::BlockBasedTableOptions().initial_auto_readahead_size, | |
1238 | "RocksDB does auto-readahead for iterators on noticing more than two reads " | |
1239 | "for a table file if user doesn't provide readahead_size. The readahead " | |
1240 | "size starts at initial_auto_readahead_size"); | |
1241 | ||
1242 | DEFINE_uint64( | |
1243 | max_auto_readahead_size, | |
1244 | ROCKSDB_NAMESPACE::BlockBasedTableOptions().max_auto_readahead_size, | |
1245 | "Rocksdb implicit readahead starts at " | |
1246 | "BlockBasedTableOptions.initial_auto_readahead_size and doubles on every " | |
1247 | "additional read upto max_auto_readahead_size"); | |
1248 | ||
1249 | DEFINE_uint64( | |
1250 | num_file_reads_for_auto_readahead, | |
1251 | ROCKSDB_NAMESPACE::BlockBasedTableOptions() | |
1252 | .num_file_reads_for_auto_readahead, | |
1253 | "Rocksdb implicit readahead is enabled if reads are sequential and " | |
1254 | "num_file_reads_for_auto_readahead indicates after how many sequential " | |
1255 | "reads into that file internal auto prefetching should be start."); | |
1256 | ||
f67539c2 TL |
1257 | static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType( |
1258 | const char* ctype) { | |
7c673cae FG |
1259 | assert(ctype); |
1260 | ||
1261 | if (!strcasecmp(ctype, "none")) | |
f67539c2 | 1262 | return ROCKSDB_NAMESPACE::kNoCompression; |
7c673cae | 1263 | else if (!strcasecmp(ctype, "snappy")) |
f67539c2 | 1264 | return ROCKSDB_NAMESPACE::kSnappyCompression; |
7c673cae | 1265 | else if (!strcasecmp(ctype, "zlib")) |
f67539c2 | 1266 | return ROCKSDB_NAMESPACE::kZlibCompression; |
7c673cae | 1267 | else if (!strcasecmp(ctype, "bzip2")) |
f67539c2 | 1268 | return ROCKSDB_NAMESPACE::kBZip2Compression; |
7c673cae | 1269 | else if (!strcasecmp(ctype, "lz4")) |
f67539c2 | 1270 | return ROCKSDB_NAMESPACE::kLZ4Compression; |
7c673cae | 1271 | else if (!strcasecmp(ctype, "lz4hc")) |
f67539c2 | 1272 | return ROCKSDB_NAMESPACE::kLZ4HCCompression; |
7c673cae | 1273 | else if (!strcasecmp(ctype, "xpress")) |
f67539c2 | 1274 | return ROCKSDB_NAMESPACE::kXpressCompression; |
7c673cae | 1275 | else if (!strcasecmp(ctype, "zstd")) |
f67539c2 | 1276 | return ROCKSDB_NAMESPACE::kZSTD; |
1e59de90 TL |
1277 | else { |
1278 | fprintf(stderr, "Cannot parse compression type '%s'\n", ctype); | |
1279 | exit(1); | |
1280 | } | |
7c673cae FG |
1281 | } |
1282 | ||
1283 | static std::string ColumnFamilyName(size_t i) { | |
1284 | if (i == 0) { | |
f67539c2 | 1285 | return ROCKSDB_NAMESPACE::kDefaultColumnFamilyName; |
7c673cae FG |
1286 | } else { |
1287 | char name[100]; | |
1288 | snprintf(name, sizeof(name), "column_family_name_%06zu", i); | |
1289 | return std::string(name); | |
1290 | } | |
1291 | } | |
1292 | ||
1293 | DEFINE_string(compression_type, "snappy", | |
1294 | "Algorithm to use to compress the database"); | |
f67539c2 TL |
1295 | static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_compression_type_e = |
1296 | ROCKSDB_NAMESPACE::kSnappyCompression; | |
7c673cae | 1297 | |
494da23a TL |
1298 | DEFINE_int64(sample_for_compression, 0, "Sample every N block for compression"); |
1299 | ||
f67539c2 | 1300 | DEFINE_int32(compression_level, ROCKSDB_NAMESPACE::CompressionOptions().level, |
11fdf7f2 TL |
1301 | "Compression level. The meaning of this value is library-" |
1302 | "dependent. If unset, we try to use the default for the library " | |
1303 | "specified in `--compression_type`"); | |
7c673cae | 1304 | |
11fdf7f2 | 1305 | DEFINE_int32(compression_max_dict_bytes, |
f67539c2 | 1306 | ROCKSDB_NAMESPACE::CompressionOptions().max_dict_bytes, |
7c673cae FG |
1307 | "Maximum size of dictionary used to prime the compression " |
1308 | "library."); | |
1309 | ||
11fdf7f2 | 1310 | DEFINE_int32(compression_zstd_max_train_bytes, |
f67539c2 | 1311 | ROCKSDB_NAMESPACE::CompressionOptions().zstd_max_train_bytes, |
11fdf7f2 TL |
1312 | "Maximum size of training data passed to zstd's dictionary " |
1313 | "trainer."); | |
7c673cae | 1314 | |
1e59de90 TL |
1315 | DEFINE_int32(min_level_to_compress, -1, |
1316 | "If non-negative, compression starts" | |
7c673cae FG |
1317 | " from this level. Levels with number < min_level_to_compress are" |
1318 | " not compressed. Otherwise, apply compression_type to " | |
1319 | "all levels."); | |
1320 | ||
20effc67 TL |
1321 | DEFINE_int32(compression_parallel_threads, 1, |
1322 | "Number of threads for parallel compression."); | |
1323 | ||
1e59de90 TL |
1324 | DEFINE_uint64(compression_max_dict_buffer_bytes, |
1325 | ROCKSDB_NAMESPACE::CompressionOptions().max_dict_buffer_bytes, | |
1326 | "Maximum bytes to buffer to collect samples for dictionary."); | |
1327 | ||
1328 | DEFINE_bool(compression_use_zstd_dict_trainer, | |
1329 | ROCKSDB_NAMESPACE::CompressionOptions().use_zstd_dict_trainer, | |
1330 | "If true, use ZSTD_TrainDictionary() to create dictionary, else" | |
1331 | "use ZSTD_FinalizeDictionary() to create dictionary"); | |
1332 | ||
7c673cae FG |
// gflags validator for --table_cache_numshardbits: the value must lie
// strictly between 0 and 20. Prints a diagnostic and rejects anything else.
static bool ValidateTableCacheNumshardbits(const char* flagname,
                                           int32_t value) {
  const bool in_range = (value > 0 && value < 20);
  if (!in_range) {
    fprintf(stderr, "Invalid value for --%s: %d, must be 0 < val < 20\n",
            flagname, value);
  }
  return in_range;
}
1342 | DEFINE_int32(table_cache_numshardbits, 4, ""); | |
1343 | ||
1344 | #ifndef ROCKSDB_LITE | |
20effc67 | 1345 | DEFINE_string(env_uri, "", |
1e59de90 | 1346 | "URI for registry Env lookup. Mutually exclusive with --fs_uri"); |
20effc67 TL |
1347 | DEFINE_string(fs_uri, "", |
1348 | "URI for registry Filesystem lookup. Mutually exclusive" | |
1e59de90 | 1349 | " with --env_uri." |
20effc67 | 1350 | " Creates a default environment with the specified filesystem."); |
7c673cae | 1351 | #endif // ROCKSDB_LITE |
1e59de90 TL |
1352 | DEFINE_string(simulate_hybrid_fs_file, "", |
1353 | "File for Store Metadata for Simulate hybrid FS. Empty means " | |
1354 | "disable the feature. Now, if it is set, last_level_temperature " | |
1355 | "is set to kWarm."); | |
1356 | DEFINE_int32(simulate_hybrid_hdd_multipliers, 1, | |
1357 | "In simulate_hybrid_fs_file or simulate_hdd mode, how many HDDs " | |
1358 | "are simulated."); | |
1359 | DEFINE_bool(simulate_hdd, false, "Simulate read/write latency on HDD."); | |
1360 | ||
1361 | DEFINE_int64( | |
1362 | preclude_last_level_data_seconds, 0, | |
1363 | "Preclude the latest data from the last level. (Used for tiered storage)"); | |
1364 | ||
1365 | DEFINE_int64(preserve_internal_time_seconds, 0, | |
1366 | "Preserve the internal time information which stores with SST."); | |
f67539c2 TL |
1367 | |
1368 | static std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard; | |
1369 | ||
1370 | static ROCKSDB_NAMESPACE::Env* FLAGS_env = ROCKSDB_NAMESPACE::Env::Default(); | |
7c673cae | 1371 | |
1e59de90 TL |
1372 | DEFINE_int64(stats_interval, 0, |
1373 | "Stats are reported every N operations when this is greater than " | |
1374 | "zero. When 0 the interval grows over time."); | |
1375 | ||
1376 | DEFINE_int64(stats_interval_seconds, 0, | |
1377 | "Report stats every N seconds. This overrides stats_interval when" | |
1378 | " both are > 0."); | |
7c673cae | 1379 | |
1e59de90 TL |
1380 | DEFINE_int32(stats_per_interval, 0, |
1381 | "Reports additional stats per interval when this is greater than " | |
1382 | "0."); | |
7c673cae | 1383 | |
1e59de90 TL |
1384 | DEFINE_uint64(slow_usecs, 1000000, |
1385 | "A message is printed for operations that take at least this " | |
1386 | "many microseconds."); | |
7c673cae FG |
1387 | |
1388 | DEFINE_int64(report_interval_seconds, 0, | |
1e59de90 | 1389 | "If greater than zero, it will write simple stats in CSV format " |
7c673cae FG |
1390 | "to --report_file every N seconds"); |
1391 | ||
1392 | DEFINE_string(report_file, "report.csv", | |
1393 | "Filename where some simple stats are reported to (if " | |
1394 | "--report_interval_seconds is bigger than 0)"); | |
1395 | ||
1396 | DEFINE_int32(thread_status_per_interval, 0, | |
1397 | "Takes and report a snapshot of the current status of each thread" | |
1398 | " when this is greater than 0."); | |
1399 | ||
f67539c2 TL |
1400 | DEFINE_int32(perf_level, ROCKSDB_NAMESPACE::PerfLevel::kDisable, |
1401 | "Level of perf collection"); | |
7c673cae | 1402 | |
7c673cae FG |
1403 | DEFINE_uint64(soft_pending_compaction_bytes_limit, 64ull * 1024 * 1024 * 1024, |
1404 | "Slowdown writes if pending compaction bytes exceed this number"); | |
1405 | ||
1406 | DEFINE_uint64(hard_pending_compaction_bytes_limit, 128ull * 1024 * 1024 * 1024, | |
1407 | "Stop writes if pending compaction bytes exceed this number"); | |
1408 | ||
1409 | DEFINE_uint64(delayed_write_rate, 8388608u, | |
1410 | "Limited bytes allowed to DB when soft_rate_limit or " | |
1411 | "level0_slowdown_writes_trigger triggers"); | |
1412 | ||
11fdf7f2 TL |
1413 | DEFINE_bool(enable_pipelined_write, true, |
1414 | "Allow WAL and memtable writes to be pipelined"); | |
1415 | ||
20effc67 TL |
1416 | DEFINE_bool( |
1417 | unordered_write, false, | |
1418 | "Enable the unordered write feature, which provides higher throughput but " | |
1419 | "relaxes the guarantees around atomic reads and immutable snapshots"); | |
f67539c2 | 1420 | |
11fdf7f2 | 1421 | DEFINE_bool(allow_concurrent_memtable_write, true, |
7c673cae FG |
1422 | "Allow multi-writers to update mem tables in parallel."); |
1423 | ||
1e59de90 TL |
1424 | DEFINE_double(experimental_mempurge_threshold, 0.0, |
1425 | "Maximum useful payload ratio estimate that triggers a mempurge " | |
1426 | "(memtable garbage collection)."); | |
1427 | ||
f67539c2 TL |
1428 | DEFINE_bool(inplace_update_support, |
1429 | ROCKSDB_NAMESPACE::Options().inplace_update_support, | |
11fdf7f2 TL |
1430 | "Support in-place memtable update for smaller or same-size values"); |
1431 | ||
1432 | DEFINE_uint64(inplace_update_num_locks, | |
f67539c2 | 1433 | ROCKSDB_NAMESPACE::Options().inplace_update_num_locks, |
11fdf7f2 TL |
1434 | "Number of RW locks to protect in-place memtable updates"); |
1435 | ||
1436 | DEFINE_bool(enable_write_thread_adaptive_yield, true, | |
7c673cae FG |
1437 | "Use a yielding spin loop for brief writer thread waits."); |
1438 | ||
1439 | DEFINE_uint64( | |
1440 | write_thread_max_yield_usec, 100, | |
1441 | "Maximum microseconds for enable_write_thread_adaptive_yield operation."); | |
1442 | ||
1443 | DEFINE_uint64(write_thread_slow_yield_usec, 3, | |
1444 | "The threshold at which a slow yield is considered a signal that " | |
1445 | "other processes or threads want the core."); | |
1446 | ||
7c673cae FG |
1447 | DEFINE_uint64(rate_limiter_bytes_per_sec, 0, "Set options.rate_limiter value."); |
1448 | ||
1e59de90 TL |
1449 | DEFINE_int64(rate_limiter_refill_period_us, 100 * 1000, |
1450 | "Set refill period on rate limiter."); | |
1451 | ||
11fdf7f2 TL |
1452 | DEFINE_bool(rate_limiter_auto_tuned, false, |
1453 | "Enable dynamic adjustment of rate limit according to demand for " | |
1454 | "background I/O"); | |
1455 | ||
1e59de90 | 1456 | DEFINE_bool(sine_write_rate, false, "Use a sine wave write_rate_limit"); |
11fdf7f2 | 1457 | |
1e59de90 TL |
1458 | DEFINE_uint64( |
1459 | sine_write_rate_interval_milliseconds, 10000, | |
1460 | "Interval of which the sine wave write_rate_limit is recalculated"); | |
11fdf7f2 | 1461 | |
1e59de90 | 1462 | DEFINE_double(sine_a, 1, "A in f(x) = A sin(bx + c) + d"); |
11fdf7f2 | 1463 | |
1e59de90 | 1464 | DEFINE_double(sine_b, 1, "B in f(x) = A sin(bx + c) + d"); |
11fdf7f2 | 1465 | |
1e59de90 | 1466 | DEFINE_double(sine_c, 0, "C in f(x) = A sin(bx + c) + d"); |
11fdf7f2 | 1467 | |
1e59de90 | 1468 | DEFINE_double(sine_d, 1, "D in f(x) = A sin(bx + c) + d"); |
11fdf7f2 TL |
1469 | |
1470 | DEFINE_bool(rate_limit_bg_reads, false, | |
1471 | "Use options.rate_limiter on compaction reads"); | |
1472 | ||
7c673cae FG |
1473 | DEFINE_uint64( |
1474 | benchmark_write_rate_limit, 0, | |
1475 | "If non-zero, db_bench will rate-limit the writes going into RocksDB. This " | |
1476 | "is the global rate in bytes/second."); | |
1477 | ||
494da23a | 1478 | // the parameters of mix_graph |
f67539c2 TL |
1479 | DEFINE_double(keyrange_dist_a, 0.0, |
1480 | "The parameter 'a' of prefix average access distribution " | |
1481 | "f(x)=a*exp(b*x)+c*exp(d*x)"); | |
1482 | DEFINE_double(keyrange_dist_b, 0.0, | |
1483 | "The parameter 'b' of prefix average access distribution " | |
1484 | "f(x)=a*exp(b*x)+c*exp(d*x)"); | |
1485 | DEFINE_double(keyrange_dist_c, 0.0, | |
1486 | "The parameter 'c' of prefix average access distribution" | |
1487 | "f(x)=a*exp(b*x)+c*exp(d*x)"); | |
1488 | DEFINE_double(keyrange_dist_d, 0.0, | |
1489 | "The parameter 'd' of prefix average access distribution" | |
1490 | "f(x)=a*exp(b*x)+c*exp(d*x)"); | |
1491 | DEFINE_int64(keyrange_num, 1, | |
1492 | "The number of key ranges that are in the same prefix " | |
1e59de90 | 1493 | "group, each prefix range will have its key access distribution"); |
494da23a | 1494 | DEFINE_double(key_dist_a, 0.0, |
1e59de90 | 1495 | "The parameter 'a' of key access distribution model f(x)=a*x^b"); |
494da23a | 1496 | DEFINE_double(key_dist_b, 0.0, |
1e59de90 | 1497 | "The parameter 'b' of key access distribution model f(x)=a*x^b"); |
494da23a TL |
1498 | DEFINE_double(value_theta, 0.0, |
1499 | "The parameter 'theta' of Generized Pareto Distribution " | |
1500 | "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)"); | |
1e59de90 TL |
1501 | // Use reasonable defaults based on the mixgraph paper |
1502 | DEFINE_double(value_k, 0.2615, | |
494da23a TL |
1503 | "The parameter 'k' of Generized Pareto Distribution " |
1504 | "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)"); | |
1e59de90 TL |
1505 | // Use reasonable defaults based on the mixgraph paper |
1506 | DEFINE_double(value_sigma, 25.45, | |
494da23a TL |
1507 | "The parameter 'theta' of Generized Pareto Distribution " |
1508 | "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)"); | |
1509 | DEFINE_double(iter_theta, 0.0, | |
1510 | "The parameter 'theta' of Generized Pareto Distribution " | |
1511 | "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)"); | |
1e59de90 TL |
1512 | // Use reasonable defaults based on the mixgraph paper |
1513 | DEFINE_double(iter_k, 2.517, | |
494da23a TL |
1514 | "The parameter 'k' of Generized Pareto Distribution " |
1515 | "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)"); | |
1e59de90 TL |
1516 | // Use reasonable defaults based on the mixgraph paper |
1517 | DEFINE_double(iter_sigma, 14.236, | |
494da23a TL |
1518 | "The parameter 'sigma' of Generized Pareto Distribution " |
1519 | "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)"); | |
1520 | DEFINE_double(mix_get_ratio, 1.0, | |
1521 | "The ratio of Get queries of mix_graph workload"); | |
1522 | DEFINE_double(mix_put_ratio, 0.0, | |
1523 | "The ratio of Put queries of mix_graph workload"); | |
1524 | DEFINE_double(mix_seek_ratio, 0.0, | |
1525 | "The ratio of Seek queries of mix_graph workload"); | |
1526 | DEFINE_int64(mix_max_scan_len, 10000, "The max scan length of Iterator"); | |
494da23a TL |
1527 | DEFINE_int64(mix_max_value_size, 1024, "The max value size of this workload"); |
1528 | DEFINE_double( | |
1529 | sine_mix_rate_noise, 0.0, | |
1530 | "Add the noise ratio to the sine rate, it is between 0.0 and 1.0"); | |
1531 | DEFINE_bool(sine_mix_rate, false, | |
1532 | "Enable the sine QPS control on the mix workload"); | |
1533 | DEFINE_uint64( | |
1534 | sine_mix_rate_interval_milliseconds, 10000, | |
1535 | "Interval of which the sine wave read_rate_limit is recalculated"); | |
1536 | DEFINE_int64(mix_accesses, -1, | |
1537 | "The total query accesses of mix_graph workload"); | |
1538 | ||
7c673cae FG |
1539 | DEFINE_uint64( |
1540 | benchmark_read_rate_limit, 0, | |
1541 | "If non-zero, db_bench will rate-limit the reads from RocksDB. This " | |
1542 | "is the global rate in ops/second."); | |
1543 | ||
f67539c2 TL |
1544 | DEFINE_uint64(max_compaction_bytes, |
1545 | ROCKSDB_NAMESPACE::Options().max_compaction_bytes, | |
7c673cae FG |
1546 | "Max bytes allowed in one compaction"); |
1547 | ||
1548 | #ifndef ROCKSDB_LITE | |
1549 | DEFINE_bool(readonly, false, "Run read only benchmarks."); | |
494da23a TL |
1550 | |
1551 | DEFINE_bool(print_malloc_stats, false, | |
1552 | "Print malloc stats to stdout after benchmarks finish."); | |
7c673cae FG |
1553 | #endif // ROCKSDB_LITE |
1554 | ||
1555 | DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions"); | |
1556 | ||
1557 | DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds."); | |
1e59de90 TL |
1558 | DEFINE_uint64(wal_size_limit_MB, 0, |
1559 | "Set the size limit for the WAL Files in MB."); | |
7c673cae FG |
1560 | DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size"); |
1561 | ||
f67539c2 | 1562 | DEFINE_bool(mmap_read, ROCKSDB_NAMESPACE::Options().allow_mmap_reads, |
7c673cae FG |
1563 | "Allow reads to occur via mmap-ing files"); |
1564 | ||
f67539c2 | 1565 | DEFINE_bool(mmap_write, ROCKSDB_NAMESPACE::Options().allow_mmap_writes, |
7c673cae FG |
1566 | "Allow writes to occur via mmap-ing files"); |
1567 | ||
f67539c2 | 1568 | DEFINE_bool(use_direct_reads, ROCKSDB_NAMESPACE::Options().use_direct_reads, |
7c673cae FG |
1569 | "Use O_DIRECT for reading data"); |
1570 | ||
1571 | DEFINE_bool(use_direct_io_for_flush_and_compaction, | |
f67539c2 | 1572 | ROCKSDB_NAMESPACE::Options().use_direct_io_for_flush_and_compaction, |
11fdf7f2 | 1573 | "Use O_DIRECT for background flush and compaction writes"); |
7c673cae | 1574 | |
f67539c2 TL |
1575 | DEFINE_bool(advise_random_on_open, |
1576 | ROCKSDB_NAMESPACE::Options().advise_random_on_open, | |
7c673cae FG |
1577 | "Advise random access on table file open"); |
1578 | ||
1579 | DEFINE_string(compaction_fadvice, "NORMAL", | |
1580 | "Access pattern advice when a file is compacted"); | |
1581 | static auto FLAGS_compaction_fadvice_e = | |
f67539c2 | 1582 | ROCKSDB_NAMESPACE::Options().access_hint_on_compaction_start; |
7c673cae FG |
1583 | |
1584 | DEFINE_bool(use_tailing_iterator, false, | |
1585 | "Use tailing iterator to access a series of keys instead of get"); | |
1586 | ||
f67539c2 | 1587 | DEFINE_bool(use_adaptive_mutex, ROCKSDB_NAMESPACE::Options().use_adaptive_mutex, |
7c673cae FG |
1588 | "Use adaptive mutex"); |
1589 | ||
f67539c2 | 1590 | DEFINE_uint64(bytes_per_sync, ROCKSDB_NAMESPACE::Options().bytes_per_sync, |
7c673cae FG |
1591 | "Allows OS to incrementally sync SST files to disk while they are" |
1592 | " being written, in the background. Issue one request for every" | |
1593 | " bytes_per_sync written. 0 turns it off."); | |
1594 | ||
f67539c2 TL |
1595 | DEFINE_uint64(wal_bytes_per_sync, |
1596 | ROCKSDB_NAMESPACE::Options().wal_bytes_per_sync, | |
7c673cae FG |
1597 | "Allows OS to incrementally sync WAL files to disk while they are" |
1598 | " being written, in the background. Issue one request for every" | |
1599 | " wal_bytes_per_sync written. 0 turns it off."); | |
1600 | ||
1601 | DEFINE_bool(use_single_deletes, true, | |
1602 | "Use single deletes (used in RandomReplaceKeys only)."); | |
1603 | ||
1604 | DEFINE_double(stddev, 2000.0, | |
1605 | "Standard deviation of normal distribution used for picking keys" | |
1606 | " (used in RandomReplaceKeys only)."); | |
1607 | ||
1608 | DEFINE_int32(key_id_range, 100000, | |
1609 | "Range of possible value of key id (used in TimeSeries only)."); | |
1610 | ||
1611 | DEFINE_string(expire_style, "none", | |
1612 | "Style to remove expired time entries. Can be one of the options " | |
1613 | "below: none (do not expired data), compaction_filter (use a " | |
1614 | "compaction filter to remove expired data), delete (seek IDs and " | |
1615 | "remove expired data) (used in TimeSeries only)."); | |
1616 | ||
1617 | DEFINE_uint64( | |
1618 | time_range, 100000, | |
1619 | "Range of timestamp that store in the database (used in TimeSeries" | |
1620 | " only)."); | |
1621 | ||
1622 | DEFINE_int32(num_deletion_threads, 1, | |
1623 | "Number of threads to do deletion (used in TimeSeries and delete " | |
1624 | "expire_style only)."); | |
1625 | ||
1e59de90 TL |
1626 | DEFINE_int32(max_successive_merges, 0, |
1627 | "Maximum number of successive merge operations on a key in the " | |
1628 | "memtable"); | |
7c673cae FG |
1629 | |
// gflags validator for --prefix_size: accepts values in [0, 2000000000).
// On rejection, reports the offending value to stderr.
static bool ValidatePrefixSize(const char* flagname, int32_t value) {
  if (value >= 0 && value < 2000000000) {
    return true;
  }
  fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n",
          flagname, value);
  return false;
}
f67539c2 | 1638 | |
1e59de90 TL |
1639 | DEFINE_int32(prefix_size, 0, |
1640 | "control the prefix size for HashSkipList and plain table"); | |
1641 | DEFINE_int64(keys_per_prefix, 0, | |
1642 | "control average number of keys generated per prefix, 0 means no " | |
1643 | "special handling of the prefix, i.e. use the prefix comes with " | |
1644 | "the generated random number."); | |
f67539c2 TL |
1645 | DEFINE_bool(total_order_seek, false, |
1646 | "Enable total order seek regardless of index format."); | |
1647 | DEFINE_bool(prefix_same_as_start, false, | |
1648 | "Enforce iterator to return keys with prefix same as seek key."); | |
1649 | DEFINE_bool( | |
1650 | seek_missing_prefix, false, | |
1651 | "Iterator seek to keys with non-exist prefixes. Require prefix_size > 8"); | |
1652 | ||
7c673cae FG |
1653 | DEFINE_int32(memtable_insert_with_hint_prefix_size, 0, |
1654 | "If non-zero, enable " | |
1655 | "memtable insert with hint with the given prefix size."); | |
1e59de90 TL |
1656 | DEFINE_bool(enable_io_prio, false, |
1657 | "Lower the background flush/compaction threads' IO priority"); | |
1658 | DEFINE_bool(enable_cpu_prio, false, | |
1659 | "Lower the background flush/compaction threads' CPU priority"); | |
1660 | DEFINE_bool(identity_as_first_hash, false, | |
1661 | "the first hash function of cuckoo table becomes an identity " | |
1662 | "function. This is only valid when key is 8 bytes"); | |
7c673cae | 1663 | DEFINE_bool(dump_malloc_stats, true, "Dump malloc stats in LOG "); |
f67539c2 TL |
1664 | DEFINE_uint64(stats_dump_period_sec, |
1665 | ROCKSDB_NAMESPACE::Options().stats_dump_period_sec, | |
11fdf7f2 | 1666 | "Gap between printing stats to log in seconds"); |
494da23a | 1667 | DEFINE_uint64(stats_persist_period_sec, |
f67539c2 | 1668 | ROCKSDB_NAMESPACE::Options().stats_persist_period_sec, |
494da23a | 1669 | "Gap between persisting stats in seconds"); |
f67539c2 TL |
1670 | DEFINE_bool(persist_stats_to_disk, |
1671 | ROCKSDB_NAMESPACE::Options().persist_stats_to_disk, | |
1672 | "whether to persist stats to disk"); | |
494da23a | 1673 | DEFINE_uint64(stats_history_buffer_size, |
f67539c2 | 1674 | ROCKSDB_NAMESPACE::Options().stats_history_buffer_size, |
494da23a | 1675 | "Max number of stats snapshots to keep in memory"); |
1e59de90 TL |
1676 | DEFINE_bool(avoid_flush_during_recovery, |
1677 | ROCKSDB_NAMESPACE::Options().avoid_flush_during_recovery, | |
1678 | "If true, avoids flushing the recovered WAL data where possible."); | |
f67539c2 TL |
1679 | DEFINE_int64(multiread_stride, 0, |
1680 | "Stride length for the keys in a MultiGet batch"); | |
1681 | DEFINE_bool(multiread_batched, false, "Use the new MultiGet API"); | |
7c673cae | 1682 | |
7c673cae FG |
1683 | DEFINE_string(memtablerep, "skip_list", ""); |
1684 | DEFINE_int64(hash_bucket_count, 1024 * 1024, "hash bucket count"); | |
1e59de90 TL |
1685 | DEFINE_bool(use_plain_table, false, |
1686 | "if use plain table instead of block-based table format"); | |
7c673cae FG |
1687 | DEFINE_bool(use_cuckoo_table, false, "if use cuckoo table format"); |
1688 | DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table."); | |
1e59de90 TL |
1689 | DEFINE_bool(use_hash_search, false, |
1690 | "if use kHashSearch instead of kBinarySearch. " | |
7c673cae | 1691 | "This is valid if only we use BlockTable"); |
1e59de90 TL |
1692 | DEFINE_string(merge_operator, "", |
1693 | "The merge operator to use with the database." | |
7c673cae FG |
1694 | "If a new merge operator is specified, be sure to use fresh" |
1695 | " database The possible merge operators are defined in" | |
1696 | " utilities/merge_operators.h"); | |
1e59de90 TL |
1697 | DEFINE_int32(skip_list_lookahead, 0, |
1698 | "Used with skip_list memtablerep; try linear search first for " | |
1699 | "this many steps from the previous position"); | |
1700 | DEFINE_bool(report_file_operations, false, | |
1701 | "if report number of file operations"); | |
1702 | DEFINE_bool(report_open_timing, false, "if report open timing"); | |
f67539c2 | 1703 | DEFINE_int32(readahead_size, 0, "Iterator readahead size"); |
7c673cae | 1704 | |
20effc67 TL |
1705 | DEFINE_bool(read_with_latest_user_timestamp, true, |
1706 | "If true, always use the current latest timestamp for read. If " | |
1707 | "false, choose a random timestamp from the past."); | |
1708 | ||
1e59de90 TL |
1709 | #ifndef ROCKSDB_LITE |
1710 | DEFINE_string(secondary_cache_uri, "", | |
1711 | "Full URI for creating a custom secondary cache object"); | |
1712 | static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache; | |
1713 | #endif // ROCKSDB_LITE | |
7c673cae | 1714 | |
11fdf7f2 | 1715 | static const bool FLAGS_prefix_size_dummy __attribute__((__unused__)) = |
7c673cae FG |
1716 | RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize); |
1717 | ||
11fdf7f2 | 1718 | static const bool FLAGS_key_size_dummy __attribute__((__unused__)) = |
7c673cae FG |
1719 | RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize); |
1720 | ||
11fdf7f2 | 1721 | static const bool FLAGS_cache_numshardbits_dummy __attribute__((__unused__)) = |
7c673cae FG |
1722 | RegisterFlagValidator(&FLAGS_cache_numshardbits, |
1723 | &ValidateCacheNumshardbits); | |
1724 | ||
11fdf7f2 | 1725 | static const bool FLAGS_readwritepercent_dummy __attribute__((__unused__)) = |
7c673cae FG |
1726 | RegisterFlagValidator(&FLAGS_readwritepercent, &ValidateInt32Percent); |
1727 | ||
1728 | DEFINE_int32(disable_seek_compaction, false, | |
1729 | "Not used, left here for backwards compatibility"); | |
1730 | ||
1e59de90 TL |
1731 | DEFINE_bool(allow_data_in_errors, |
1732 | ROCKSDB_NAMESPACE::Options().allow_data_in_errors, | |
1733 | "If true, allow logging data, e.g. key, value in LOG files."); | |
1734 | ||
11fdf7f2 | 1735 | static const bool FLAGS_deletepercent_dummy __attribute__((__unused__)) = |
7c673cae | 1736 | RegisterFlagValidator(&FLAGS_deletepercent, &ValidateInt32Percent); |
1e59de90 TL |
1737 | static const bool FLAGS_table_cache_numshardbits_dummy |
1738 | __attribute__((__unused__)) = RegisterFlagValidator( | |
1739 | &FLAGS_table_cache_numshardbits, &ValidateTableCacheNumshardbits); | |
7c673cae | 1740 | |
1e59de90 TL |
1741 | DEFINE_uint32(write_batch_protection_bytes_per_key, 0, |
1742 | "Size of per-key-value checksum in each write batch. Currently " | |
1743 | "only value 0 and 8 are supported."); | |
7c673cae | 1744 | |
1e59de90 TL |
1745 | DEFINE_uint32( |
1746 | memtable_protection_bytes_per_key, 0, | |
1747 | "Enable memtable per key-value checksum protection. " | |
1748 | "Each entry in memtable will be suffixed by a per key-value checksum. " | |
1749 | "This options determines the size of such checksums. " | |
1750 | "Supported values: 0, 1, 2, 4, 8."); | |
7c673cae | 1751 | |
1e59de90 TL |
1752 | DEFINE_bool(build_info, false, |
1753 | "Print the build info via GetRocksBuildInfoAsString"); | |
7c673cae | 1754 | |
1e59de90 TL |
1755 | DEFINE_bool(track_and_verify_wals_in_manifest, false, |
1756 | "If true, enable WAL tracking in the MANIFEST"); | |
7c673cae | 1757 | |
1e59de90 TL |
1758 | namespace ROCKSDB_NAMESPACE { |
1759 | namespace { | |
// Builds a MemTableRepFactory from the --memtablerep flag.
//
// The built-in nicknames are matched case-insensitively first: the skip-list
// factory (configured with --skip_list_lookahead), and — outside
// ROCKSDB_LITE — "prefix_hash" and "hash_linkedlist" (both sized by
// --hash_bucket_count) and the vector factory. Any other string is handed to
// MemTableRepFactory::CreateFromString() so factories registered with the
// object registry can be selected by name/URI.
//
// On success `*factory` is reset to own the new factory; when
// CreateFromString() fails, the non-OK Status is returned and `*factory`
// is left untouched.
static Status CreateMemTableRepFactory(
    const ConfigOptions& config_options,
    std::shared_ptr<MemTableRepFactory>* factory) {
  Status s;
  if (!strcasecmp(FLAGS_memtablerep.c_str(), SkipListFactory::kNickName())) {
    factory->reset(new SkipListFactory(FLAGS_skip_list_lookahead));
#ifndef ROCKSDB_LITE
  } else if (!strcasecmp(FLAGS_memtablerep.c_str(), "prefix_hash")) {
    factory->reset(NewHashSkipListRepFactory(FLAGS_hash_bucket_count));
  } else if (!strcasecmp(FLAGS_memtablerep.c_str(),
                         VectorRepFactory::kNickName())) {
    factory->reset(new VectorRepFactory());
  } else if (!strcasecmp(FLAGS_memtablerep.c_str(), "hash_linkedlist")) {
    factory->reset(NewHashLinkListRepFactory(FLAGS_hash_bucket_count));
#endif  // ROCKSDB_LITE
  } else {
    // Unrecognized nickname: fall back to the pluggable-factory parser.
    std::unique_ptr<MemTableRepFactory> unique;
    s = MemTableRepFactory::CreateFromString(config_options, FLAGS_memtablerep,
                                             &unique);
    if (s.ok()) {
      // Transfer ownership from the parser's unique_ptr to the shared_ptr out-param.
      factory->reset(unique.release());
    }
  }
  return s;
}
7c673cae FG |
1785 | |
1786 | } // namespace | |
1787 | ||
1e59de90 | 1788 | enum DistributionType : unsigned char { kFixed = 0, kUniform, kNormal }; |
f67539c2 TL |
1789 | |
1790 | static enum DistributionType FLAGS_value_size_distribution_type_e = kFixed; | |
1791 | ||
1792 | static enum DistributionType StringToDistributionType(const char* ctype) { | |
1793 | assert(ctype); | |
1794 | ||
1795 | if (!strcasecmp(ctype, "fixed")) | |
1796 | return kFixed; | |
1797 | else if (!strcasecmp(ctype, "uniform")) | |
1798 | return kUniform; | |
1799 | else if (!strcasecmp(ctype, "normal")) | |
1800 | return kNormal; | |
1801 | ||
1802 | fprintf(stdout, "Cannot parse distribution type '%s'\n", ctype); | |
1e59de90 | 1803 | exit(1); |
f67539c2 TL |
1804 | } |
1805 | ||
// Abstract generator of value sizes. Concrete subclasses supply the raw
// sample via Get(); Generate() optionally clamps it into [min, max].
class BaseDistribution {
 public:
  BaseDistribution(unsigned int _min, unsigned int _max)
      : min_value_size_(_min), max_value_size_(_max) {}
  virtual ~BaseDistribution() {}

  // Draws one value size, clamped into [min_value_size_, max_value_size_]
  // unless the concrete distribution opts out of truncation.
  unsigned int Generate() {
    unsigned int sample = Get();
    if (!NeedTruncate()) {
      return sample;
    }
    if (sample < min_value_size_) {
      sample = min_value_size_;
    }
    if (sample > max_value_size_) {
      sample = max_value_size_;
    }
    return sample;
  }

 private:
  // Raw, unclamped sample from the concrete distribution.
  virtual unsigned int Get() = 0;
  // Subclasses whose samples are already in range return false to skip clamping.
  virtual bool NeedTruncate() { return true; }
  unsigned int min_value_size_;
  unsigned int max_value_size_;
};
1827 | ||
1e59de90 | 1828 | class FixedDistribution : public BaseDistribution { |
f67539c2 | 1829 | public: |
1e59de90 TL |
1830 | FixedDistribution(unsigned int size) |
1831 | : BaseDistribution(size, size), size_(size) {} | |
1832 | ||
f67539c2 | 1833 | private: |
1e59de90 TL |
1834 | virtual unsigned int Get() override { return size_; } |
1835 | virtual bool NeedTruncate() override { return false; } | |
f67539c2 TL |
1836 | unsigned int size_; |
1837 | }; | |
1838 | ||
1e59de90 TL |
1839 | class NormalDistribution : public BaseDistribution, |
1840 | public std::normal_distribution<double> { | |
f67539c2 | 1841 | public: |
20effc67 TL |
1842 | NormalDistribution(unsigned int _min, unsigned int _max) |
1843 | : BaseDistribution(_min, _max), | |
1844 | // 99.7% values within the range [min, max]. | |
1845 | std::normal_distribution<double>( | |
1846 | (double)(_min + _max) / 2.0 /*mean*/, | |
1847 | (double)(_max - _min) / 6.0 /*stddev*/), | |
1848 | gen_(rd_()) {} | |
1849 | ||
f67539c2 TL |
1850 | private: |
1851 | virtual unsigned int Get() override { | |
1852 | return static_cast<unsigned int>((*this)(gen_)); | |
1853 | } | |
1854 | std::random_device rd_; | |
1855 | std::mt19937 gen_; | |
1856 | }; | |
1857 | ||
1e59de90 TL |
1858 | class UniformDistribution : public BaseDistribution, |
1859 | public std::uniform_int_distribution<unsigned int> { | |
f67539c2 | 1860 | public: |
20effc67 TL |
1861 | UniformDistribution(unsigned int _min, unsigned int _max) |
1862 | : BaseDistribution(_min, _max), | |
1863 | std::uniform_int_distribution<unsigned int>(_min, _max), | |
1864 | gen_(rd_()) {} | |
1865 | ||
f67539c2 | 1866 | private: |
1e59de90 TL |
1867 | virtual unsigned int Get() override { return (*this)(gen_); } |
1868 | virtual bool NeedTruncate() override { return false; } | |
f67539c2 TL |
1869 | std::random_device rd_; |
1870 | std::mt19937 gen_; | |
1871 | }; | |
1872 | ||
7c673cae FG |
// Helper for quickly generating random data.
//
// At construction it picks a value-size distribution from
// FLAGS_value_size_distribution_type_e and fills an internal buffer with
// compressible pseudo-random text; Generate() then hands out slices of that
// buffer, cycling through it, instead of generating fresh bytes per call.
class RandomGenerator {
 private:
  std::string data_;   // pre-generated pool of compressible bytes
  unsigned int pos_;   // next read offset into data_
  std::unique_ptr<BaseDistribution> dist_;  // value-size sampler

 public:
  RandomGenerator() {
    // max_value_size bounds how big the data pool must be so that any
    // single Generate(len) can be served from it.
    auto max_value_size = FLAGS_value_size_max;
    switch (FLAGS_value_size_distribution_type_e) {
      case kUniform:
        dist_.reset(new UniformDistribution(FLAGS_value_size_min,
                                            FLAGS_value_size_max));
        break;
      case kNormal:
        dist_.reset(
            new NormalDistribution(FLAGS_value_size_min, FLAGS_value_size_max));
        break;
      case kFixed:
      default:
        // `value_size` is the (file-level) fixed value size; with a fixed
        // distribution the pool only needs to cover that one size.
        dist_.reset(new FixedDistribution(value_size));
        max_value_size = value_size;
    }
    // We use a limited amount of data over and over again and ensure
    // that it is larger than the compression window (32KB), and also
    // large enough to serve all typical value sizes we want to write.
    Random rnd(301);
    std::string piece;
    while (data_.size() < (unsigned)std::max(1048576, max_value_size)) {
      // Add a short fragment that is as compressible as specified
      // by FLAGS_compression_ratio.
      test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
      data_.append(piece);
    }
    pos_ = 0;
  }

  // Returns a slice of `len` bytes from the pool, wrapping back to the
  // start when fewer than `len` bytes remain. `len` must not exceed the
  // pool size. The slice aliases data_, so it is only valid while this
  // RandomGenerator is alive.
  Slice Generate(unsigned int len) {
    assert(len <= data_.size());
    if (pos_ + len > data_.size()) {
      pos_ = 0;
    }
    pos_ += len;
    return Slice(data_.data() + pos_ - len, len);
  }

  // Returns a slice whose length is drawn from the configured distribution.
  Slice Generate() {
    auto len = dist_->Generate();
    return Generate(len);
  }
};
1925 | ||
1926 | static void AppendWithSpace(std::string* str, Slice msg) { | |
1927 | if (msg.empty()) return; | |
1928 | if (!str->empty()) { | |
1929 | str->push_back(' '); | |
1930 | } | |
1931 | str->append(msg.data(), msg.size()); | |
1932 | } | |
1933 | ||
// Bundles a DB (or a stacked OptimisticTransactionDB) with its column family
// handles, for benchmarks that create and rotate column families on the fly.
struct DBWithColumnFamilies {
  std::vector<ColumnFamilyHandle*> cfh;
  DB* db;
#ifndef ROCKSDB_LITE
  OptimisticTransactionDB* opt_txn_db;
#endif  // ROCKSDB_LITE
  std::atomic<size_t> num_created;  // Need to be updated after all the
                                    // new entries in cfh are set.
  size_t num_hot;  // Number of column families to be queried at each moment.
                   // After each CreateNewCf(), another num_hot number of new
                   // Column families will be created and used to be queried.
  port::Mutex create_cf_mutex;  // Only one thread can execute CreateNewCf()
  std::vector<int> cfh_idx_to_prob;  // ith index holds probability of operating
                                     // on cfh[i].

  DBWithColumnFamilies()
      : db(nullptr)
#ifndef ROCKSDB_LITE
        ,
        opt_txn_db(nullptr)
#endif  // ROCKSDB_LITE
  {
    cfh.clear();
    num_created = 0;
    num_hot = 0;
  }

  // Shallow copy: the raw DB/handle pointers are shared, not duplicated.
  // (create_cf_mutex is intentionally not copied; the copy gets a fresh one.)
  DBWithColumnFamilies(const DBWithColumnFamilies& other)
      : cfh(other.cfh),
        db(other.db),
#ifndef ROCKSDB_LITE
        opt_txn_db(other.opt_txn_db),
#endif  // ROCKSDB_LITE
        num_created(other.num_created.load()),
        num_hot(other.num_hot),
        cfh_idx_to_prob(other.cfh_idx_to_prob) {
  }

  // Deletes every column family handle, then the DB itself. When an
  // OptimisticTransactionDB wrapper is in use, deleting it also releases
  // the underlying DB, so `db` must not be deleted separately.
  void DeleteDBs() {
    std::for_each(cfh.begin(), cfh.end(),
                  [](ColumnFamilyHandle* cfhi) { delete cfhi; });
    cfh.clear();
#ifndef ROCKSDB_LITE
    if (opt_txn_db) {
      delete opt_txn_db;
      opt_txn_db = nullptr;
    } else {
      delete db;
      db = nullptr;
    }
#else
    delete db;
    db = nullptr;
#endif  // ROCKSDB_LITE
  }

  // Picks one of the `num_hot` most recently created column families.
  // When cfh_idx_to_prob is populated, it acts as percentage weights over
  // rand_num % 100; otherwise the choice is uniform over the hot set.
  ColumnFamilyHandle* GetCfh(int64_t rand_num) {
    assert(num_hot > 0);
    size_t rand_offset = 0;
    if (!cfh_idx_to_prob.empty()) {
      assert(cfh_idx_to_prob.size() == num_hot);
      int sum = 0;
      while (sum + cfh_idx_to_prob[rand_offset] < rand_num % 100) {
        sum += cfh_idx_to_prob[rand_offset];
        ++rand_offset;
      }
      assert(rand_offset < cfh_idx_to_prob.size());
    } else {
      rand_offset = rand_num % num_hot;
    }
    // Acquire pairs with the release store in CreateNewCf() so the cfh
    // entries below num_created are fully visible.
    return cfh[num_created.load(std::memory_order_acquire) - num_hot +
               rand_offset];
  }

  // stage: assume CF from 0 to stage * num_hot has be created. Need to create
  // stage * num_hot + 1 to stage * (num_hot + 1).
  void CreateNewCf(ColumnFamilyOptions options, int64_t stage) {
    MutexLock l(&create_cf_mutex);
    if ((stage + 1) * num_hot <= num_created) {
      // Already created.
      return;
    }
    auto new_num_created = num_created + num_hot;
    assert(new_num_created <= cfh.size());
    for (size_t i = num_created; i < new_num_created; i++) {
      Status s =
          db->CreateColumnFamily(options, ColumnFamilyName(i), &(cfh[i]));
      if (!s.ok()) {
        fprintf(stderr, "create column family error: %s\n",
                s.ToString().c_str());
        abort();
      }
    }
    // Release so readers in GetCfh() observe the handles set above.
    num_created.store(new_num_created, std::memory_order_release);
  }
};
2030 | ||
// A class that reports stats to CSV file. A background thread wakes up every
// report_interval_secs and appends one "secs_elapsed,interval_qps" row.
class ReporterAgent {
 public:
  // Opens `fname`, writes the CSV header, and starts the reporting thread.
  // Aborts the process if the file cannot be opened or written.
  ReporterAgent(Env* env, const std::string& fname,
                uint64_t report_interval_secs)
      : env_(env),
        total_ops_done_(0),
        last_report_(0),
        report_interval_secs_(report_interval_secs),
        stop_(false) {
    auto s = env_->NewWritableFile(fname, &report_file_, EnvOptions());
    if (s.ok()) {
      s = report_file_->Append(Header() + "\n");
    }
    if (s.ok()) {
      s = report_file_->Flush();
    }
    if (!s.ok()) {
      fprintf(stderr, "Can't open %s: %s\n", fname.c_str(),
              s.ToString().c_str());
      abort();
    }

    reporting_thread_ = port::Thread([&]() { SleepAndReport(); });
  }

  // Signals the reporting thread (under the mutex, so the notify cannot be
  // missed between the predicate check and the wait) and joins it.
  ~ReporterAgent() {
    {
      std::unique_lock<std::mutex> lk(mutex_);
      stop_ = true;
      stop_cv_.notify_all();
    }
    reporting_thread_.join();
  }

  // thread safe
  void ReportFinishedOps(int64_t num_ops) {
    total_ops_done_.fetch_add(num_ops);
  }

 private:
  std::string Header() const { return "secs_elapsed,interval_qps"; }

  // Background-thread body: loop until stopped, emitting one CSV row per
  // interval. A failed write stops reporting permanently.
  void SleepAndReport() {
    auto* clock = env_->GetSystemClock().get();
    auto time_started = clock->NowMicros();
    while (true) {
      {
        std::unique_lock<std::mutex> lk(mutex_);
        if (stop_ ||
            stop_cv_.wait_for(lk, std::chrono::seconds(report_interval_secs_),
                              [&]() { return stop_; })) {
          // stopping
          break;
        }
        // else -> timeout, which means time for a report!
      }
      auto total_ops_done_snapshot = total_ops_done_.load();
      // round the seconds elapsed
      auto secs_elapsed =
          (clock->NowMicros() - time_started + kMicrosInSecond / 2) /
          kMicrosInSecond;
      std::string report =
          std::to_string(secs_elapsed) + "," +
          std::to_string(total_ops_done_snapshot - last_report_) + "\n";
      auto s = report_file_->Append(report);
      if (s.ok()) {
        s = report_file_->Flush();
      }
      if (!s.ok()) {
        fprintf(stderr,
                "Can't write to report file (%s), stopping the reporting\n",
                s.ToString().c_str());
        break;
      }
      last_report_ = total_ops_done_snapshot;
    }
  }

  Env* env_;
  std::unique_ptr<WritableFile> report_file_;
  std::atomic<int64_t> total_ops_done_;
  int64_t last_report_;  // snapshot of total_ops_done_ at the previous row
  const uint64_t report_interval_secs_;
  ROCKSDB_NAMESPACE::port::Thread reporting_thread_;
  std::mutex mutex_;
  // will notify on stop
  std::condition_variable stop_cv_;
  bool stop_;
};
2120 | ||
// Tags the kind of operation a latency sample belongs to; used as the key of
// per-operation histograms and to label stats output.
enum OperationType : unsigned char {
  kRead = 0,
  kWrite,
  kDelete,
  kSeek,
  kMerge,
  kUpdate,
  kCompress,
  kUncompress,
  kCrc,
  kHash,
  kOthers
};
2134 | ||
2135 | static std::unordered_map<OperationType, std::string, std::hash<unsigned char>> | |
1e59de90 TL |
2136 | OperationTypeString = {{kRead, "read"}, {kWrite, "write"}, |
2137 | {kDelete, "delete"}, {kSeek, "seek"}, | |
2138 | {kMerge, "merge"}, {kUpdate, "update"}, | |
2139 | {kCompress, "compress"}, {kCompress, "uncompress"}, | |
2140 | {kCrc, "crc"}, {kHash, "hash"}, | |
2141 | {kOthers, "op"}}; | |
7c673cae FG |
2142 | |
2143 | class CombinedStats; | |
// Per-thread benchmark statistics: op counts, bytes, per-operation latency
// histograms, and periodic progress reporting to stderr. Thread 0 may also
// dump DB properties and thread status, driven by FLAGS_* options.
class Stats {
 private:
  SystemClock* clock_;
  int id_;                      // thread id, or -1 before Start()
  uint64_t start_ = 0;          // micros at Start()
  uint64_t sine_interval_;      // interval anchor for sine-rate write modes
  uint64_t finish_;             // micros at Stop()
  double seconds_;              // elapsed seconds between Start() and Stop()
  uint64_t done_;               // total ops finished
  uint64_t last_report_done_;   // done_ at the previous progress report
  uint64_t next_report_;        // done_ threshold that triggers next report
  uint64_t bytes_;              // total bytes processed
  uint64_t last_op_finish_;     // micros when the previous op finished
  uint64_t last_report_finish_; // micros at the previous progress report
  std::unordered_map<OperationType, std::shared_ptr<HistogramImpl>,
                     std::hash<unsigned char>>
      hist_;                    // per-operation latency histograms
  std::string message_;
  bool exclude_from_merge_;
  ReporterAgent* reporter_agent_;  // does not own
  friend class CombinedStats;

 public:
  Stats() : clock_(FLAGS_env->GetSystemClock().get()) { Start(-1); }

  void SetReporterAgent(ReporterAgent* reporter_agent) {
    reporter_agent_ = reporter_agent;
  }

  // Resets all counters and timestamps for a fresh run owned by thread `id`.
  void Start(int id) {
    id_ = id;
    next_report_ = FLAGS_stats_interval ? FLAGS_stats_interval : 100;
    last_op_finish_ = start_;
    hist_.clear();
    done_ = 0;
    last_report_done_ = 0;
    bytes_ = 0;
    seconds_ = 0;
    start_ = clock_->NowMicros();
    sine_interval_ = clock_->NowMicros();
    finish_ = start_;
    last_report_finish_ = start_;
    message_.clear();
    // When set, stats from this thread won't be merged with others.
    exclude_from_merge_ = false;
  }

  // Folds another thread's stats into this one: sums counters, merges
  // histograms, and widens the [start_, finish_] window.
  void Merge(const Stats& other) {
    if (other.exclude_from_merge_) return;

    for (auto it = other.hist_.begin(); it != other.hist_.end(); ++it) {
      auto this_it = hist_.find(it->first);
      if (this_it != hist_.end()) {
        this_it->second->Merge(*(other.hist_.at(it->first)));
      } else {
        hist_.insert({it->first, it->second});
      }
    }

    done_ += other.done_;
    bytes_ += other.bytes_;
    seconds_ += other.seconds_;
    if (other.start_ < start_) start_ = other.start_;
    if (other.finish_ > finish_) finish_ = other.finish_;

    // Just keep the messages from one thread.
    if (message_.empty()) message_ = other.message_;
  }

  void Stop() {
    finish_ = clock_->NowMicros();
    seconds_ = (finish_ - start_) * 1e-6;
  }

  void AddMessage(Slice msg) { AppendWithSpace(&message_, msg); }

  void SetId(int id) { id_ = id; }
  void SetExcludeFromMerge() { exclude_from_merge_ = true; }

  // Dumps one line per RocksDB background/foreground thread to stderr.
  void PrintThreadStatus() {
    std::vector<ThreadStatus> thread_list;
    FLAGS_env->GetThreadList(&thread_list);

    fprintf(stderr, "\n%18s %10s %12s %20s %13s %45s %12s %s\n", "ThreadID",
            "ThreadType", "cfName", "Operation", "ElapsedTime", "Stage",
            "State", "OperationProperties");

    int64_t current_time = 0;
    clock_->GetCurrentTime(&current_time).PermitUncheckedError();
    for (auto ts : thread_list) {
      fprintf(stderr, "%18" PRIu64 " %10s %12s %20s %13s %45s %12s",
              ts.thread_id,
              ThreadStatus::GetThreadTypeName(ts.thread_type).c_str(),
              ts.cf_name.c_str(),
              ThreadStatus::GetOperationName(ts.operation_type).c_str(),
              ThreadStatus::MicrosToString(ts.op_elapsed_micros).c_str(),
              ThreadStatus::GetOperationStageName(ts.operation_stage).c_str(),
              ThreadStatus::GetStateName(ts.state_type).c_str());

      auto op_properties = ThreadStatus::InterpretOperationProperties(
          ts.operation_type, ts.op_properties);
      for (const auto& op_prop : op_properties) {
        fprintf(stderr, " %s %" PRIu64 " |", op_prop.first.c_str(),
                op_prop.second);
      }
      fprintf(stderr, "\n");
    }
  }

  void ResetSineInterval() { sine_interval_ = clock_->NowMicros(); }

  uint64_t GetSineInterval() { return sine_interval_; }

  uint64_t GetStart() { return start_; }

  void ResetLastOpTime() {
    // Set to now to avoid latency from calls to SleepForMicroseconds.
    last_op_finish_ = clock_->NowMicros();
  }

  // Records `num_ops` finished operations of `op_type`: forwards to the CSV
  // reporter, updates the latency histogram, and emits periodic progress
  // output (per-N-ops or per-N-seconds depending on FLAGS_stats_interval*).
  void FinishedOps(DBWithColumnFamilies* db_with_cfh, DB* db, int64_t num_ops,
                   enum OperationType op_type = kOthers) {
    if (reporter_agent_) {
      reporter_agent_->ReportFinishedOps(num_ops);
    }
    if (FLAGS_histogram) {
      uint64_t now = clock_->NowMicros();
      uint64_t micros = now - last_op_finish_;

      if (hist_.find(op_type) == hist_.end()) {
        auto hist_temp = std::make_shared<HistogramImpl>();
        hist_.insert({op_type, std::move(hist_temp)});
      }
      hist_[op_type]->Add(micros);

      if (micros >= FLAGS_slow_usecs && !FLAGS_stats_interval) {
        fprintf(stderr, "long op: %" PRIu64 " micros%30s\r", micros, "");
        fflush(stderr);
      }
      last_op_finish_ = now;
    }

    done_ += num_ops;
    if (done_ >= next_report_ && FLAGS_progress_reports) {
      if (!FLAGS_stats_interval) {
        // Exponentially back off the report frequency as the run grows.
        if (next_report_ < 1000)
          next_report_ += 100;
        else if (next_report_ < 5000)
          next_report_ += 500;
        else if (next_report_ < 10000)
          next_report_ += 1000;
        else if (next_report_ < 50000)
          next_report_ += 5000;
        else if (next_report_ < 100000)
          next_report_ += 10000;
        else if (next_report_ < 500000)
          next_report_ += 50000;
        else
          next_report_ += 100000;
        fprintf(stderr, "... finished %" PRIu64 " ops%30s\r", done_, "");
      } else {
        uint64_t now = clock_->NowMicros();
        int64_t usecs_since_last = now - last_report_finish_;

        // Determine whether to print status where interval is either
        // each N operations or each N seconds.

        if (FLAGS_stats_interval_seconds &&
            usecs_since_last < (FLAGS_stats_interval_seconds * 1000000)) {
          // Don't check again for this many operations.
          next_report_ += FLAGS_stats_interval;

        } else {
          fprintf(stderr,
                  "%s ... thread %d: (%" PRIu64 ",%" PRIu64
                  ") ops and "
                  "(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n",
                  clock_->TimeToString(now / 1000000).c_str(), id_,
                  done_ - last_report_done_, done_,
                  (done_ - last_report_done_) / (usecs_since_last / 1000000.0),
                  done_ / ((now - start_) / 1000000.0),
                  (now - last_report_finish_) / 1000000.0,
                  (now - start_) / 1000000.0);

          // Only thread 0 dumps DB-wide properties, to avoid duplication.
          if (id_ == 0 && FLAGS_stats_per_interval) {
            std::string stats;

            if (db_with_cfh && db_with_cfh->num_created.load()) {
              for (size_t i = 0; i < db_with_cfh->num_created.load(); ++i) {
                if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats",
                                    &stats))
                  fprintf(stderr, "%s\n", stats.c_str());
                if (FLAGS_show_table_properties) {
                  for (int level = 0; level < FLAGS_num_levels; ++level) {
                    if (db->GetProperty(
                            db_with_cfh->cfh[i],
                            "rocksdb.aggregated-table-properties-at-level" +
                                std::to_string(level),
                            &stats)) {
                      if (stats.find("# entries=0") == std::string::npos) {
                        fprintf(stderr, "Level[%d]: %s\n", level,
                                stats.c_str());
                      }
                    }
                  }
                }
              }
            } else if (db) {
              if (db->GetProperty("rocksdb.stats", &stats)) {
                fprintf(stderr, "%s", stats.c_str());
              }
              if (db->GetProperty("rocksdb.num-running-compactions", &stats)) {
                fprintf(stderr, "num-running-compactions: %s\n", stats.c_str());
              }
              if (db->GetProperty("rocksdb.num-running-flushes", &stats)) {
                fprintf(stderr, "num-running-flushes: %s\n\n", stats.c_str());
              }
              if (FLAGS_show_table_properties) {
                for (int level = 0; level < FLAGS_num_levels; ++level) {
                  if (db->GetProperty(
                          "rocksdb.aggregated-table-properties-at-level" +
                              std::to_string(level),
                          &stats)) {
                    if (stats.find("# entries=0") == std::string::npos) {
                      fprintf(stderr, "Level[%d]: %s\n", level, stats.c_str());
                    }
                  }
                }
              }
            }
          }

          next_report_ += FLAGS_stats_interval;
          last_report_finish_ = now;
          last_report_done_ = done_;
        }
      }
      if (id_ == 0 && FLAGS_thread_status_per_interval) {
        PrintThreadStatus();
      }
      fflush(stderr);
    }
  }

  void AddBytes(int64_t n) { bytes_ += n; }

  // Prints the final one-line summary for this benchmark (micros/op, ops/sec,
  // elapsed seconds), plus histograms and file-operation counters on demand.
  void Report(const Slice& name) {
    // Pretend at least one op was done in case we are running a benchmark
    // that does not call FinishedOps().
    if (done_ < 1) done_ = 1;

    std::string extra;
    double elapsed = (finish_ - start_) * 1e-6;
    if (bytes_ > 0) {
      // Rate is computed on actual elapsed time, not the sum of per-thread
      // elapsed times.
      char rate[100];
      snprintf(rate, sizeof(rate), "%6.1f MB/s",
               (bytes_ / 1048576.0) / elapsed);
      extra = rate;
    }
    AppendWithSpace(&extra, message_);
    double throughput = (double)done_ / elapsed;

    fprintf(stdout,
            "%-12s : %11.3f micros/op %ld ops/sec %.3f seconds %" PRIu64
            " operations;%s%s\n",
            name.ToString().c_str(), seconds_ * 1e6 / done_, (long)throughput,
            elapsed, done_, (extra.empty() ? "" : " "), extra.c_str());
    if (FLAGS_histogram) {
      for (auto it = hist_.begin(); it != hist_.end(); ++it) {
        fprintf(stdout, "Microseconds per %s:\n%s\n",
                OperationTypeString[it->first].c_str(),
                it->second->ToString().c_str());
      }
    }
    if (FLAGS_report_file_operations) {
      auto* counted_fs =
          FLAGS_env->GetFileSystem()->CheckedCast<CountedFileSystem>();
      assert(counted_fs);
      fprintf(stdout, "%s", counted_fs->PrintCounters().c_str());
      counted_fs->ResetCounters();
    }
    fflush(stdout);
  }
};
2430 | ||
2431 | class CombinedStats { | |
2432 | public: | |
2433 | void AddStats(const Stats& stat) { | |
2434 | uint64_t total_ops = stat.done_; | |
2435 | uint64_t total_bytes_ = stat.bytes_; | |
2436 | double elapsed; | |
2437 | ||
2438 | if (total_ops < 1) { | |
2439 | total_ops = 1; | |
2440 | } | |
2441 | ||
2442 | elapsed = (stat.finish_ - stat.start_) * 1e-6; | |
2443 | throughput_ops_.emplace_back(total_ops / elapsed); | |
2444 | ||
2445 | if (total_bytes_ > 0) { | |
2446 | double mbs = (total_bytes_ / 1048576.0); | |
2447 | throughput_mbs_.emplace_back(mbs / elapsed); | |
2448 | } | |
2449 | } | |
2450 | ||
2451 | void Report(const std::string& bench_name) { | |
1e59de90 TL |
2452 | if (throughput_ops_.size() < 2) { |
2453 | // skip if there are not enough samples | |
2454 | return; | |
2455 | } | |
2456 | ||
2457 | const char* name = bench_name.c_str(); | |
2458 | int num_runs = static_cast<int>(throughput_ops_.size()); | |
2459 | ||
2460 | if (throughput_mbs_.size() == throughput_ops_.size()) { | |
2461 | fprintf(stdout, | |
2462 | "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec; %6.1f (\xC2\xB1 " | |
2463 | "%.1f) MB/sec\n", | |
2464 | name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)), | |
2465 | static_cast<int>(CalcConfidence95(throughput_ops_)), | |
2466 | CalcAvg(throughput_mbs_), CalcConfidence95(throughput_mbs_)); | |
2467 | } else { | |
2468 | fprintf(stdout, "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec\n", name, | |
2469 | num_runs, static_cast<int>(CalcAvg(throughput_ops_)), | |
2470 | static_cast<int>(CalcConfidence95(throughput_ops_))); | |
2471 | } | |
2472 | } | |
2473 | ||
2474 | void ReportWithConfidenceIntervals(const std::string& bench_name) { | |
2475 | if (throughput_ops_.size() < 2) { | |
2476 | // skip if there are not enough samples | |
2477 | return; | |
2478 | } | |
2479 | ||
2480 | const char* name = bench_name.c_str(); | |
2481 | int num_runs = static_cast<int>(throughput_ops_.size()); | |
2482 | ||
2483 | int ops_avg = static_cast<int>(CalcAvg(throughput_ops_)); | |
2484 | int ops_confidence_95 = static_cast<int>(CalcConfidence95(throughput_ops_)); | |
2485 | ||
2486 | if (throughput_mbs_.size() == throughput_ops_.size()) { | |
2487 | double mbs_avg = CalcAvg(throughput_mbs_); | |
2488 | double mbs_confidence_95 = CalcConfidence95(throughput_mbs_); | |
2489 | fprintf(stdout, | |
2490 | "%s [CI95 %d runs] : (%d, %d) ops/sec; (%.1f, %.1f) MB/sec\n", | |
2491 | name, num_runs, ops_avg - ops_confidence_95, | |
2492 | ops_avg + ops_confidence_95, mbs_avg - mbs_confidence_95, | |
2493 | mbs_avg + mbs_confidence_95); | |
2494 | } else { | |
2495 | fprintf(stdout, "%s [CI95 %d runs] : (%d, %d) ops/sec\n", name, num_runs, | |
2496 | ops_avg - ops_confidence_95, ops_avg + ops_confidence_95); | |
2497 | } | |
2498 | } | |
2499 | ||
2500 | void ReportFinal(const std::string& bench_name) { | |
2501 | if (throughput_ops_.size() < 2) { | |
2502 | // skip if there are not enough samples | |
2503 | return; | |
2504 | } | |
2505 | ||
7c673cae FG |
2506 | const char* name = bench_name.c_str(); |
2507 | int num_runs = static_cast<int>(throughput_ops_.size()); | |
2508 | ||
2509 | if (throughput_mbs_.size() == throughput_ops_.size()) { | |
1e59de90 | 2510 | // \xC2\xB1 is +/- character in UTF-8 |
7c673cae | 2511 | fprintf(stdout, |
1e59de90 TL |
2512 | "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec; %6.1f (\xC2\xB1 " |
2513 | "%.1f) MB/sec\n" | |
7c673cae FG |
2514 | "%s [MEDIAN %d runs] : %d ops/sec; %6.1f MB/sec\n", |
2515 | name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)), | |
1e59de90 TL |
2516 | static_cast<int>(CalcConfidence95(throughput_ops_)), |
2517 | CalcAvg(throughput_mbs_), CalcConfidence95(throughput_mbs_), name, | |
2518 | num_runs, static_cast<int>(CalcMedian(throughput_ops_)), | |
7c673cae FG |
2519 | CalcMedian(throughput_mbs_)); |
2520 | } else { | |
2521 | fprintf(stdout, | |
1e59de90 | 2522 | "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec\n" |
7c673cae | 2523 | "%s [MEDIAN %d runs] : %d ops/sec\n", |
1e59de90 TL |
2524 | name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)), |
2525 | static_cast<int>(CalcConfidence95(throughput_ops_)), name, | |
7c673cae FG |
2526 | num_runs, static_cast<int>(CalcMedian(throughput_ops_))); |
2527 | } | |
2528 | } | |
2529 | ||
2530 | private: | |
1e59de90 | 2531 | double CalcAvg(std::vector<double>& data) { |
7c673cae FG |
2532 | double avg = 0; |
2533 | for (double x : data) { | |
2534 | avg += x; | |
2535 | } | |
2536 | avg = avg / data.size(); | |
2537 | return avg; | |
2538 | } | |
2539 | ||
1e59de90 TL |
2540 | // Calculates 95% CI assuming a normal distribution of samples. |
2541 | // Samples are not from a normal distribution, but it still | |
2542 | // provides useful approximation. | |
2543 | double CalcConfidence95(std::vector<double>& data) { | |
2544 | assert(data.size() > 1); | |
2545 | double avg = CalcAvg(data); | |
2546 | double std_error = CalcStdDev(data, avg) / std::sqrt(data.size()); | |
2547 | ||
2548 | // Z score for the 97.5 percentile | |
2549 | // see https://en.wikipedia.org/wiki/1.96 | |
2550 | return 1.959964 * std_error; | |
2551 | } | |
2552 | ||
2553 | double CalcMedian(std::vector<double>& data) { | |
7c673cae FG |
2554 | assert(data.size() > 0); |
2555 | std::sort(data.begin(), data.end()); | |
2556 | ||
2557 | size_t mid = data.size() / 2; | |
2558 | if (data.size() % 2 == 1) { | |
2559 | // Odd number of entries | |
2560 | return data[mid]; | |
2561 | } else { | |
2562 | // Even number of entries | |
2563 | return (data[mid] + data[mid - 1]) / 2; | |
2564 | } | |
2565 | } | |
2566 | ||
1e59de90 TL |
2567 | double CalcStdDev(std::vector<double>& data, double average) { |
2568 | assert(data.size() > 1); | |
2569 | double squared_sum = 0.0; | |
2570 | for (double x : data) { | |
2571 | squared_sum += std::pow(x - average, 2); | |
2572 | } | |
2573 | ||
2574 | // using samples count - 1 following Bessel's correction | |
2575 | // see https://en.wikipedia.org/wiki/Bessel%27s_correction | |
2576 | return std::sqrt(squared_sum / (data.size() - 1)); | |
2577 | } | |
2578 | ||
7c673cae FG |
2579 | std::vector<double> throughput_ops_; |
2580 | std::vector<double> throughput_mbs_; | |
2581 | }; | |
2582 | ||
2583 | class TimestampEmulator { | |
2584 | private: | |
2585 | std::atomic<uint64_t> timestamp_; | |
2586 | ||
2587 | public: | |
2588 | TimestampEmulator() : timestamp_(0) {} | |
2589 | uint64_t Get() const { return timestamp_.load(); } | |
2590 | void Inc() { timestamp_++; } | |
20effc67 TL |
2591 | Slice Allocate(char* scratch) { |
2592 | // TODO: support larger timestamp sizes | |
2593 | assert(FLAGS_user_timestamp_size == 8); | |
2594 | assert(scratch); | |
2595 | uint64_t ts = timestamp_.fetch_add(1); | |
2596 | EncodeFixed64(scratch, ts); | |
2597 | return Slice(scratch, FLAGS_user_timestamp_size); | |
2598 | } | |
2599 | Slice GetTimestampForRead(Random64& rand, char* scratch) { | |
2600 | assert(FLAGS_user_timestamp_size == 8); | |
2601 | assert(scratch); | |
2602 | if (FLAGS_read_with_latest_user_timestamp) { | |
2603 | return Allocate(scratch); | |
2604 | } | |
2605 | // Choose a random timestamp from the past. | |
2606 | uint64_t ts = rand.Next() % Get(); | |
2607 | EncodeFixed64(scratch, ts); | |
2608 | return Slice(scratch, FLAGS_user_timestamp_size); | |
2609 | } | |
7c673cae FG |
2610 | }; |
2611 | ||
2612 | // State shared by all concurrent executions of the same benchmark. | |
2613 | struct SharedState { | |
2614 | port::Mutex mu; | |
2615 | port::CondVar cv; | |
2616 | int total; | |
2617 | int perf_level; | |
2618 | std::shared_ptr<RateLimiter> write_rate_limiter; | |
2619 | std::shared_ptr<RateLimiter> read_rate_limiter; | |
2620 | ||
2621 | // Each thread goes through the following states: | |
2622 | // (1) initializing | |
2623 | // (2) waiting for others to be initialized | |
2624 | // (3) running | |
2625 | // (4) done | |
2626 | ||
2627 | long num_initialized; | |
2628 | long num_done; | |
2629 | bool start; | |
2630 | ||
1e59de90 | 2631 | SharedState() : cv(&mu), perf_level(FLAGS_perf_level) {} |
7c673cae FG |
2632 | }; |
2633 | ||
// Per-thread state for concurrent executions of the same benchmark.
struct ThreadState {
  int tid;        // 0..n-1 when running in n threads
  Random64 rand;  // Has different seeds for different threads
  Stats stats;
  SharedState* shared;

  // `index` becomes the thread id; `my_seed` is offset by the file-global
  // seed_base so each thread gets a distinct deterministic RNG stream.
  explicit ThreadState(int index, int my_seed)
      : tid(index), rand(seed_base + my_seed) {}
};
2644 | ||
2645 | class Duration { | |
2646 | public: | |
2647 | Duration(uint64_t max_seconds, int64_t max_ops, int64_t ops_per_stage = 0) { | |
2648 | max_seconds_ = max_seconds; | |
1e59de90 | 2649 | max_ops_ = max_ops; |
7c673cae FG |
2650 | ops_per_stage_ = (ops_per_stage > 0) ? ops_per_stage : max_ops; |
2651 | ops_ = 0; | |
2652 | start_at_ = FLAGS_env->NowMicros(); | |
2653 | } | |
2654 | ||
2655 | int64_t GetStage() { return std::min(ops_, max_ops_ - 1) / ops_per_stage_; } | |
2656 | ||
2657 | bool Done(int64_t increment) { | |
1e59de90 | 2658 | if (increment <= 0) increment = 1; // avoid Done(0) and infinite loops |
7c673cae FG |
2659 | ops_ += increment; |
2660 | ||
2661 | if (max_seconds_) { | |
2662 | // Recheck every appx 1000 ops (exact iff increment is factor of 1000) | |
11fdf7f2 TL |
2663 | auto granularity = FLAGS_ops_between_duration_checks; |
2664 | if ((ops_ / granularity) != ((ops_ - increment) / granularity)) { | |
7c673cae FG |
2665 | uint64_t now = FLAGS_env->NowMicros(); |
2666 | return ((now - start_at_) / 1000000) >= max_seconds_; | |
2667 | } else { | |
2668 | return false; | |
2669 | } | |
2670 | } else { | |
2671 | return ops_ > max_ops_; | |
2672 | } | |
2673 | } | |
2674 | ||
2675 | private: | |
2676 | uint64_t max_seconds_; | |
2677 | int64_t max_ops_; | |
2678 | int64_t ops_per_stage_; | |
2679 | int64_t ops_; | |
2680 | uint64_t start_at_; | |
2681 | }; | |
2682 | ||
2683 | class Benchmark { | |
2684 | private: | |
2685 | std::shared_ptr<Cache> cache_; | |
2686 | std::shared_ptr<Cache> compressed_cache_; | |
1e59de90 | 2687 | std::shared_ptr<const SliceTransform> prefix_extractor_; |
7c673cae FG |
2688 | DBWithColumnFamilies db_; |
2689 | std::vector<DBWithColumnFamilies> multi_dbs_; | |
2690 | int64_t num_; | |
7c673cae | 2691 | int key_size_; |
20effc67 | 2692 | int user_timestamp_size_; |
7c673cae | 2693 | int prefix_size_; |
1e59de90 | 2694 | int total_thread_count_; |
7c673cae FG |
2695 | int64_t keys_per_prefix_; |
2696 | int64_t entries_per_batch_; | |
494da23a | 2697 | int64_t writes_before_delete_range_; |
7c673cae FG |
2698 | int64_t writes_per_range_tombstone_; |
2699 | int64_t range_tombstone_width_; | |
2700 | int64_t max_num_range_tombstones_; | |
1e59de90 | 2701 | ReadOptions read_options_; |
7c673cae FG |
2702 | WriteOptions write_options_; |
2703 | Options open_options_; // keep options around to properly destroy db later | |
494da23a | 2704 | #ifndef ROCKSDB_LITE |
11fdf7f2 | 2705 | TraceOptions trace_options_; |
f67539c2 | 2706 | TraceOptions block_cache_trace_options_; |
494da23a | 2707 | #endif |
7c673cae FG |
2708 | int64_t reads_; |
2709 | int64_t deletes_; | |
2710 | double read_random_exp_range_; | |
2711 | int64_t writes_; | |
2712 | int64_t readwrites_; | |
2713 | int64_t merge_keys_; | |
2714 | bool report_file_operations_; | |
1e59de90 TL |
2715 | bool use_blob_db_; // Stacked BlobDB |
2716 | bool read_operands_; // read via GetMergeOperands() | |
494da23a | 2717 | std::vector<std::string> keys_; |
11fdf7f2 TL |
2718 | |
2719 | class ErrorHandlerListener : public EventListener { | |
2720 | public: | |
494da23a | 2721 | #ifndef ROCKSDB_LITE |
11fdf7f2 TL |
2722 | ErrorHandlerListener() |
2723 | : mutex_(), | |
2724 | cv_(&mutex_), | |
2725 | no_auto_recovery_(false), | |
2726 | recovery_complete_(false) {} | |
2727 | ||
494da23a | 2728 | ~ErrorHandlerListener() override {} |
11fdf7f2 | 2729 | |
1e59de90 TL |
2730 | const char* Name() const override { return kClassName(); } |
2731 | static const char* kClassName() { return "ErrorHandlerListener"; } | |
2732 | ||
11fdf7f2 | 2733 | void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/, |
494da23a TL |
2734 | Status /*bg_error*/, |
2735 | bool* auto_recovery) override { | |
11fdf7f2 TL |
2736 | if (*auto_recovery && no_auto_recovery_) { |
2737 | *auto_recovery = false; | |
2738 | } | |
2739 | } | |
2740 | ||
494da23a | 2741 | void OnErrorRecoveryCompleted(Status /*old_bg_error*/) override { |
11fdf7f2 TL |
2742 | InstrumentedMutexLock l(&mutex_); |
2743 | recovery_complete_ = true; | |
2744 | cv_.SignalAll(); | |
2745 | } | |
2746 | ||
f67539c2 | 2747 | bool WaitForRecovery(uint64_t abs_time_us) { |
11fdf7f2 TL |
2748 | InstrumentedMutexLock l(&mutex_); |
2749 | if (!recovery_complete_) { | |
f67539c2 | 2750 | cv_.TimedWait(abs_time_us); |
11fdf7f2 TL |
2751 | } |
2752 | if (recovery_complete_) { | |
2753 | recovery_complete_ = false; | |
2754 | return true; | |
2755 | } | |
2756 | return false; | |
2757 | } | |
2758 | ||
2759 | void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; } | |
2760 | ||
2761 | private: | |
2762 | InstrumentedMutex mutex_; | |
2763 | InstrumentedCondVar cv_; | |
2764 | bool no_auto_recovery_; | |
2765 | bool recovery_complete_; | |
494da23a TL |
2766 | #else // ROCKSDB_LITE |
2767 | bool WaitForRecovery(uint64_t /*abs_time_us*/) { return true; } | |
2768 | void EnableAutoRecovery(bool /*enable*/) {} | |
2769 | #endif // ROCKSDB_LITE | |
11fdf7f2 TL |
2770 | }; |
2771 | ||
  // Listener registered with opened DBs; see ErrorHandlerListener above.
  std::shared_ptr<ErrorHandlerListener> listener_;

  // Emulated application clock; only allocated by the constructor when
  // user_timestamp_size_ > 0.
  std::unique_ptr<TimestampEmulator> mock_app_clock_;

7c673cae FG |
2776 | bool SanityCheck() { |
2777 | if (FLAGS_compression_ratio > 1) { | |
2778 | fprintf(stderr, "compression_ratio should be between 0 and 1\n"); | |
2779 | return false; | |
2780 | } | |
2781 | return true; | |
2782 | } | |
2783 | ||
494da23a | 2784 | inline bool CompressSlice(const CompressionInfo& compression_info, |
11fdf7f2 | 2785 | const Slice& input, std::string* compressed) { |
20effc67 TL |
2786 | constexpr uint32_t compress_format_version = 2; |
2787 | ||
2788 | return CompressData(input, compression_info, compress_format_version, | |
2789 | compressed); | |
7c673cae FG |
2790 | } |
2791 | ||
  // Print the benchmark configuration banner (key/value sizes, entry counts,
  // estimated data sizes, rate limits, compression settings) to stdout, then
  // delegate to PrintWarnings(). Exits the process if --enable_numa is set
  // but NUMA is unavailable.
  void PrintHeader(const Options& options) {
    PrintEnvironment();
    fprintf(stdout,
            "Keys: %d bytes each (+ %d bytes user-defined timestamp)\n",
            FLAGS_key_size, FLAGS_user_timestamp_size);
    auto avg_value_size = FLAGS_value_size;
    if (FLAGS_value_size_distribution_type_e == kFixed) {
      fprintf(stdout,
              "Values: %d bytes each (%d bytes after compression)\n",
              avg_value_size,
              static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
    } else {
      // Non-fixed distribution: report the midpoint of [min, max] as the
      // average value size.
      avg_value_size = (FLAGS_value_size_min + FLAGS_value_size_max) / 2;
      fprintf(stdout,
              "Values: %d avg bytes each (%d bytes after compression)\n",
              avg_value_size,
              static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
      fprintf(stdout, "Values Distribution: %s (min: %d, max: %d)\n",
              FLAGS_value_size_distribution_type.c_str(), FLAGS_value_size_min,
              FLAGS_value_size_max);
    }
    fprintf(stdout, "Entries: %" PRIu64 "\n", num_);
    fprintf(stdout, "Prefix: %d bytes\n", FLAGS_prefix_size);
    fprintf(stdout, "Keys per prefix: %" PRIu64 "\n", keys_per_prefix_);
    // Estimated sizes: raw = (key + value) * entries; file size scales the
    // value portion by the configured compression ratio.
    fprintf(stdout, "RawSize: %.1f MB (estimated)\n",
            ((static_cast<int64_t>(FLAGS_key_size + avg_value_size) * num_) /
             1048576.0));
    fprintf(
        stdout, "FileSize: %.1f MB (estimated)\n",
        (((FLAGS_key_size + avg_value_size * FLAGS_compression_ratio) * num_) /
         1048576.0));
    fprintf(stdout, "Write rate: %" PRIu64 " bytes/second\n",
            FLAGS_benchmark_write_rate_limit);
    fprintf(stdout, "Read rate: %" PRIu64 " ops/second\n",
            FLAGS_benchmark_read_rate_limit);
    if (FLAGS_enable_numa) {
      fprintf(stderr, "Running in NUMA enabled mode.\n");
#ifndef NUMA
      // Binary was built without NUMA support; the flag cannot be honored.
      fprintf(stderr, "NUMA is not defined in the system.\n");
      exit(1);
#else
      if (numa_available() == -1) {
        fprintf(stderr, "NUMA is not supported by the system.\n");
        exit(1);
      }
#endif
    }

    auto compression = CompressionTypeToString(FLAGS_compression_type_e);
    fprintf(stdout, "Compression: %s\n", compression.c_str());
    fprintf(stdout, "Compression sampling rate: %" PRId64 "\n",
            FLAGS_sample_for_compression);
    if (options.memtable_factory != nullptr) {
      fprintf(stdout, "Memtablerep: %s\n",
              options.memtable_factory->GetId().c_str());
    }
    fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);

    PrintWarnings(compression.c_str());
    fprintf(stdout, "------------------------------------------------\n");
  }
2853 | ||
2854 | void PrintWarnings(const char* compression) { | |
2855 | #if defined(__GNUC__) && !defined(__OPTIMIZE__) | |
1e59de90 TL |
2856 | fprintf( |
2857 | stdout, | |
2858 | "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"); | |
7c673cae FG |
2859 | #endif |
2860 | #ifndef NDEBUG | |
2861 | fprintf(stdout, | |
2862 | "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); | |
2863 | #endif | |
f67539c2 | 2864 | if (FLAGS_compression_type_e != ROCKSDB_NAMESPACE::kNoCompression) { |
7c673cae FG |
2865 | // The test string should not be too small. |
2866 | const int len = FLAGS_block_size; | |
2867 | std::string input_str(len, 'y'); | |
2868 | std::string compressed; | |
494da23a TL |
2869 | CompressionOptions opts; |
2870 | CompressionContext context(FLAGS_compression_type_e); | |
2871 | CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), | |
2872 | FLAGS_compression_type_e, | |
2873 | FLAGS_sample_for_compression); | |
2874 | bool result = CompressSlice(info, Slice(input_str), &compressed); | |
7c673cae FG |
2875 | |
2876 | if (!result) { | |
2877 | fprintf(stdout, "WARNING: %s compression is not enabled\n", | |
2878 | compression); | |
2879 | } else if (compressed.size() >= input_str.size()) { | |
2880 | fprintf(stdout, "WARNING: %s compression is not effective\n", | |
2881 | compression); | |
2882 | } | |
2883 | } | |
2884 | } | |
2885 | ||
2886 | // Current the following isn't equivalent to OS_LINUX. | |
2887 | #if defined(__linux) | |
2888 | static Slice TrimSpace(Slice s) { | |
2889 | unsigned int start = 0; | |
2890 | while (start < s.size() && isspace(s[start])) { | |
2891 | start++; | |
2892 | } | |
2893 | unsigned int limit = static_cast<unsigned int>(s.size()); | |
1e59de90 | 2894 | while (limit > start && isspace(s[limit - 1])) { |
7c673cae FG |
2895 | limit--; |
2896 | } | |
2897 | return Slice(s.data() + start, limit - start); | |
2898 | } | |
2899 | #endif | |
2900 | ||
  // Print host environment details (RocksDB version, date, CPU model/count,
  // CPU cache info) to stderr. CPU discovery is platform-specific: /proc on
  // Linux, Mach host_info/sysctl on macOS, sysctl on FreeBSD; on other
  // platforms only the version line is printed.
  void PrintEnvironment() {
    fprintf(stderr, "RocksDB: version %s\n",
            GetRocksVersionAsString(true).c_str());

#if defined(__linux) || defined(__APPLE__) || defined(__FreeBSD__)
    time_t now = time(nullptr);
    char buf[52];
    // Lint complains about ctime() usage, so replace it with ctime_r(). The
    // requirement is to provide a buffer which is at least 26 bytes.
    fprintf(stderr, "Date: %s",
            ctime_r(&now, buf));  // ctime_r() adds newline

#if defined(__linux)
    // Parse /proc/cpuinfo: count "model name" lines for the CPU count and
    // remember the last-seen model and cache size strings.
    FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
    if (cpuinfo != nullptr) {
      char line[1000];
      int num_cpus = 0;
      std::string cpu_type;
      std::string cache_size;
      while (fgets(line, sizeof(line), cpuinfo) != nullptr) {
        const char* sep = strchr(line, ':');
        if (sep == nullptr) {
          continue;
        }
        Slice key = TrimSpace(Slice(line, sep - 1 - line));
        Slice val = TrimSpace(Slice(sep + 1));
        if (key == "model name") {
          ++num_cpus;
          cpu_type = val.ToString();
        } else if (key == "cache size") {
          cache_size = val.ToString();
        }
      }
      fclose(cpuinfo);
      fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str());
      fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
    }
#elif defined(__APPLE__)
    // macOS: query Mach for CPU count/type and sysctl for the cache line
    // size. Note hlen is reused: first as the host_info count, then as the
    // sysctl output-buffer size.
    struct host_basic_info h;
    size_t hlen = HOST_BASIC_INFO_COUNT;
    if (host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&h,
                  (uint32_t*)&hlen) == KERN_SUCCESS) {
      std::string cpu_type;
      std::string cache_size;
      size_t hcache_size;
      hlen = sizeof(hcache_size);
      if (sysctlbyname("hw.cachelinesize", &hcache_size, &hlen, NULL, 0) ==
          0) {
        cache_size = std::to_string(hcache_size);
      }
      // Only x86_64 and arm64 are recognized; other architectures print an
      // empty CPU type.
      switch (h.cpu_type) {
        case CPU_TYPE_X86_64:
          cpu_type = "x86_64";
          break;
        case CPU_TYPE_ARM64:
          cpu_type = "arm64";
          break;
        default:
          break;
      }
      fprintf(stderr, "CPU: %d * %s\n", h.max_cpus, cpu_type.c_str());
      fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
    }
#elif defined(__FreeBSD__)
    // FreeBSD: sysctl for CPU count (HW_NCPU) and machine type (HW_MACHINE).
    int ncpus;
    size_t len = sizeof(ncpus);
    int mib[2] = {CTL_HW, HW_NCPU};
    if (sysctl(mib, 2, &ncpus, &len, nullptr, 0) == 0) {
      char cpu_type[16];
      len = sizeof(cpu_type) - 1;
      mib[1] = HW_MACHINE;
      if (sysctl(mib, 2, cpu_type, &len, nullptr, 0) == 0) cpu_type[len] = 0;

      fprintf(stderr, "CPU: %d * %s\n", ncpus, cpu_type);
      // no programmatic way to get the cache line size except on PPC
    }
#endif
#endif
  }
2979 | ||
2980 | static bool KeyExpired(const TimestampEmulator* timestamp_emulator, | |
2981 | const Slice& key) { | |
2982 | const char* pos = key.data(); | |
2983 | pos += 8; | |
2984 | uint64_t timestamp = 0; | |
2985 | if (port::kLittleEndian) { | |
2986 | int bytes_to_fill = 8; | |
7c673cae FG |
2987 | for (int i = 0; i < bytes_to_fill; ++i) { |
2988 | timestamp |= (static_cast<uint64_t>(static_cast<unsigned char>(pos[i])) | |
2989 | << ((bytes_to_fill - i - 1) << 3)); | |
2990 | } | |
2991 | } else { | |
2992 | memcpy(×tamp, pos, sizeof(timestamp)); | |
2993 | } | |
2994 | return timestamp_emulator->Get() - timestamp > FLAGS_time_range; | |
2995 | } | |
2996 | ||
  // Compaction filter that drops entries whose embedded key timestamp has
  // aged past FLAGS_time_range (see KeyExpired above).
  class ExpiredTimeFilter : public CompactionFilter {
   public:
    explicit ExpiredTimeFilter(
        const std::shared_ptr<TimestampEmulator>& timestamp_emulator)
        : timestamp_emulator_(timestamp_emulator) {}
    // Returns true (remove the entry) when the key has expired.
    bool Filter(int /*level*/, const Slice& key,
                const Slice& /*existing_value*/, std::string* /*new_value*/,
                bool* /*value_changed*/) const override {
      return KeyExpired(timestamp_emulator_.get(), key);
    }
    const char* Name() const override { return "ExpiredTimeFilter"; }

   private:
    std::shared_ptr<TimestampEmulator> timestamp_emulator_;
  };
3012 | ||
  // Compaction filter that never drops anything; useful for measuring the
  // pure overhead of running a filter during compaction.
  class KeepFilter : public CompactionFilter {
   public:
    // Always returns false, i.e. keep every entry unchanged.
    bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
                std::string* /*new_value*/,
                bool* /*value_changed*/) const override {
      return false;
    }

    const char* Name() const override { return "KeepFilter"; }
  };
3023 | ||
1e59de90 TL |
3024 | static std::shared_ptr<MemoryAllocator> GetCacheAllocator() { |
3025 | std::shared_ptr<MemoryAllocator> allocator; | |
3026 | ||
3027 | if (FLAGS_use_cache_jemalloc_no_dump_allocator) { | |
3028 | JemallocAllocatorOptions jemalloc_options; | |
3029 | if (!NewJemallocNodumpAllocator(jemalloc_options, &allocator).ok()) { | |
3030 | fprintf(stderr, "JemallocNodumpAllocator not supported.\n"); | |
7c673cae FG |
3031 | exit(1); |
3032 | } | |
1e59de90 | 3033 | } else if (FLAGS_use_cache_memkind_kmem_allocator) { |
20effc67 | 3034 | #ifdef MEMKIND |
1e59de90 | 3035 | allocator = std::make_shared<MemkindKmemAllocator>(); |
20effc67 | 3036 | #else |
1e59de90 TL |
3037 | fprintf(stderr, "Memkind library is not linked with the binary.\n"); |
3038 | exit(1); | |
20effc67 | 3039 | #endif |
1e59de90 TL |
3040 | } |
3041 | ||
3042 | return allocator; | |
3043 | } | |
3044 | ||
  // Construct the block cache selected by --cache_type with `capacity`
  // bytes. Returns nullptr when capacity <= 0 (cache disabled). Exits the
  // process on an unsupported cache type or a bad secondary-cache config.
  static std::shared_ptr<Cache> NewCache(int64_t capacity) {
    if (capacity <= 0) {
      return nullptr;
    }
    if (FLAGS_cache_type == "clock_cache") {
      fprintf(stderr, "Old clock cache implementation has been removed.\n");
      exit(1);
    } else if (FLAGS_cache_type == "hyper_clock_cache") {
      return HyperClockCacheOptions(static_cast<size_t>(capacity),
                                    FLAGS_block_size /*estimated_entry_charge*/,
                                    FLAGS_cache_numshardbits)
          .MakeSharedCache();
    } else if (FLAGS_cache_type == "lru_cache") {
      LRUCacheOptions opts(
          static_cast<size_t>(capacity), FLAGS_cache_numshardbits,
          false /*strict_capacity_limit*/, FLAGS_cache_high_pri_pool_ratio,
          GetCacheAllocator(), kDefaultToAdaptiveMutex,
          kDefaultCacheMetadataChargePolicy, FLAGS_cache_low_pri_pool_ratio);

#ifndef ROCKSDB_LITE
      if (!FLAGS_secondary_cache_uri.empty()) {
        // NOTE(review): `secondary_cache` is a file-scope variable declared
        // outside this chunk.
        Status s = SecondaryCache::CreateFromString(
            ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache);
        if (secondary_cache == nullptr) {
          fprintf(
              stderr,
              "No secondary cache registered matching string: %s status=%s\n",
              FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str());
          exit(1);
        }
        opts.secondary_cache = secondary_cache;
      }
#endif  // ROCKSDB_LITE

      // NOTE(review): when both --secondary_cache_uri and
      // --use_compressed_secondary_cache are given, the compressed secondary
      // cache assigned here replaces the URI-created one — confirm intended.
      if (FLAGS_use_compressed_secondary_cache) {
        CompressedSecondaryCacheOptions secondary_cache_opts;
        secondary_cache_opts.capacity = FLAGS_compressed_secondary_cache_size;
        secondary_cache_opts.num_shard_bits =
            FLAGS_compressed_secondary_cache_numshardbits;
        secondary_cache_opts.high_pri_pool_ratio =
            FLAGS_compressed_secondary_cache_high_pri_pool_ratio;
        secondary_cache_opts.low_pri_pool_ratio =
            FLAGS_compressed_secondary_cache_low_pri_pool_ratio;
        secondary_cache_opts.compression_type =
            FLAGS_compressed_secondary_cache_compression_type_e;
        secondary_cache_opts.compress_format_version =
            FLAGS_compressed_secondary_cache_compress_format_version;
        opts.secondary_cache =
            NewCompressedSecondaryCache(secondary_cache_opts);
      }

      return NewLRUCache(opts);
    } else {
      fprintf(stderr, "Cache type not supported.");
      exit(1);
    }
  }
3102 | ||
 public:
  // Initialize all benchmark state from command-line flags: build caches,
  // the prefix extractor, and the per-run counters; optionally wrap the env
  // for file-op reporting; and (unless --use_existing_db) destroy any
  // existing database at --db so each run starts clean.
  Benchmark()
      : cache_(NewCache(FLAGS_cache_size)),
        compressed_cache_(NewCache(FLAGS_compressed_cache_size)),
        prefix_extractor_(FLAGS_prefix_size != 0
                              ? NewFixedPrefixTransform(FLAGS_prefix_size)
                              : nullptr),
        num_(FLAGS_num),
        key_size_(FLAGS_key_size),
        user_timestamp_size_(FLAGS_user_timestamp_size),
        prefix_size_(FLAGS_prefix_size),
        total_thread_count_(0),
        keys_per_prefix_(FLAGS_keys_per_prefix),
        entries_per_batch_(1),
        // Negative --reads/--writes/--merge_keys mean "default to --num".
        reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
        read_random_exp_range_(0.0),
        writes_(FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes),
        readwrites_(
            (FLAGS_writes < 0 && FLAGS_reads < 0)
                ? FLAGS_num
                : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)),
        merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys),
        report_file_operations_(FLAGS_report_file_operations),
#ifndef ROCKSDB_LITE
        use_blob_db_(FLAGS_use_blob_db),  // Stacked BlobDB
#else
        use_blob_db_(false),  // Stacked BlobDB
#endif  // !ROCKSDB_LITE
        read_operands_(false) {
    // use simcache instead of cache
    if (FLAGS_simcache_size >= 0) {
      if (FLAGS_cache_numshardbits >= 1) {
        cache_ =
            NewSimCache(cache_, FLAGS_simcache_size, FLAGS_cache_numshardbits);
      } else {
        cache_ = NewSimCache(cache_, FLAGS_simcache_size, 0);
      }
    }

    if (report_file_operations_) {
      // Wrap the env's file system with a counting wrapper so file
      // operations can be reported at the end of the run.
      FLAGS_env = new CompositeEnvWrapper(
          FLAGS_env,
          std::make_shared<CountedFileSystem>(FLAGS_env->GetFileSystem()));
    }

    if (FLAGS_prefix_size > FLAGS_key_size) {
      fprintf(stderr, "prefix size is larger than key size");
      exit(1);
    }

    // Remove stale heap profiler output files from a previous run.
    std::vector<std::string> files;
    FLAGS_env->GetChildren(FLAGS_db, &files);
    for (size_t i = 0; i < files.size(); i++) {
      if (Slice(files[i]).starts_with("heap-")) {
        FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
      }
    }
    if (!FLAGS_use_existing_db) {
      // Start from a clean slate: destroy the DB (and BlobDB state) plus any
      // separate WAL directory.
      Options options;
      options.env = FLAGS_env;
      if (!FLAGS_wal_dir.empty()) {
        options.wal_dir = FLAGS_wal_dir;
      }
#ifndef ROCKSDB_LITE
      if (use_blob_db_) {
        // Stacked BlobDB
        blob_db::DestroyBlobDB(FLAGS_db, options, blob_db::BlobDBOptions());
      }
#endif  // !ROCKSDB_LITE
      DestroyDB(FLAGS_db, options);
      if (!FLAGS_wal_dir.empty()) {
        FLAGS_env->DeleteDir(FLAGS_wal_dir);
      }

      if (FLAGS_num_multi_db > 1) {
        // Multi-DB mode uses --db as a parent directory for the per-DB
        // subdirectories.
        FLAGS_env->CreateDir(FLAGS_db);
        if (!FLAGS_wal_dir.empty()) {
          FLAGS_env->CreateDir(FLAGS_wal_dir);
        }
      }
    }

    listener_.reset(new ErrorHandlerListener());
    if (user_timestamp_size_ > 0) {
      mock_app_clock_.reset(new TimestampEmulator());
    }
  }
3190 | ||
1e59de90 | 3191 | void DeleteDBs() { |
7c673cae | 3192 | db_.DeleteDBs(); |
1e59de90 TL |
3193 | for (const DBWithColumnFamilies& dbwcf : multi_dbs_) { |
3194 | delete dbwcf.db; | |
3195 | } | |
3196 | } | |
3197 | ||
  // Tear down all DBs, then deliberately leak the cache contents: dropping
  // the write-buffer-manager reference first is required before DisownData().
  ~Benchmark() {
    DeleteDBs();
    if (cache_.get() != nullptr) {
      // Clear cache reference first
      open_options_.write_buffer_manager.reset();
      // this will leak, but we're shutting down so nobody cares
      cache_->DisownData();
    }
  }
3207 | ||
3208 | Slice AllocateKey(std::unique_ptr<const char[]>* key_guard) { | |
3209 | char* data = new char[key_size_]; | |
3210 | const char* const_data = data; | |
3211 | key_guard->reset(const_data); | |
3212 | return Slice(key_guard->get(), key_size_); | |
3213 | } | |
3214 | ||
  // Generate key according to the given specification and random number.
  // The resulting key will have the following format:
  //  - If keys_per_prefix_ is positive, extra trailing bytes are either cut
  //    off or padded with '0'.
  //    The prefix value is derived from key value.
  //    ----------------------------
  //    | prefix 00000 | key 00000 |
  //    ----------------------------
  //
  //  - If keys_per_prefix_ is 0, the key is simply a binary representation
  //    of random number followed by trailing '0's
  //    ----------------------------
  //    |       key 00000          |
  //    ----------------------------
  void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) {
    if (!keys_.empty()) {
      // --use_existing_keys mode: serve the v-th preloaded key instead of
      // synthesizing one.
      assert(FLAGS_use_existing_keys);
      assert(keys_.size() == static_cast<size_t>(num_keys));
      assert(v < static_cast<uint64_t>(num_keys));
      *key = keys_[v];
      return;
    }
    char* start = const_cast<char*>(key->data());
    char* pos = start;
    if (keys_per_prefix_ > 0) {
      int64_t num_prefix = num_keys / keys_per_prefix_;
      int64_t prefix = v % num_prefix;
      // At most 8 bytes of the prefix are numeric; the rest is '0'-padded.
      int bytes_to_fill = std::min(prefix_size_, 8);
      if (port::kLittleEndian) {
        // Write the integer big-endian so lexicographic key order matches
        // numeric order on little-endian hosts.
        for (int i = 0; i < bytes_to_fill; ++i) {
          pos[i] = (prefix >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
        }
      } else {
        memcpy(pos, static_cast<void*>(&prefix), bytes_to_fill);
      }
      if (prefix_size_ > 8) {
        // fill the rest with 0s
        memset(pos + 8, '0', prefix_size_ - 8);
      }
      pos += prefix_size_;
    }

    // Encode v into at most 8 bytes of the remaining key space, same
    // byte-order convention as the prefix above.
    int bytes_to_fill = std::min(key_size_ - static_cast<int>(pos - start), 8);
    if (port::kLittleEndian) {
      for (int i = 0; i < bytes_to_fill; ++i) {
        pos[i] = (v >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
      }
    } else {
      memcpy(pos, static_cast<void*>(&v), bytes_to_fill);
    }
    pos += bytes_to_fill;
    if (key_size_ > pos - start) {
      // Pad any remaining bytes with '0'.
      memset(pos, '0', key_size_ - (pos - start));
    }
  }
3270 | ||
f67539c2 TL |
3271 | void GenerateKeyFromIntForSeek(uint64_t v, int64_t num_keys, Slice* key) { |
3272 | GenerateKeyFromInt(v, num_keys, key); | |
3273 | if (FLAGS_seek_missing_prefix) { | |
3274 | assert(prefix_size_ > 8); | |
3275 | char* key_ptr = const_cast<char*>(key->data()); | |
3276 | // This rely on GenerateKeyFromInt filling paddings with '0's. | |
3277 | // Putting a '1' will create a non-existing prefix. | |
3278 | key_ptr[8] = '1'; | |
3279 | } | |
3280 | } | |
3281 | ||
7c673cae FG |
3282 | std::string GetPathForMultiple(std::string base_name, size_t id) { |
3283 | if (!base_name.empty()) { | |
3284 | #ifndef OS_WIN | |
3285 | if (base_name.back() != '/') { | |
3286 | base_name += '/'; | |
3287 | } | |
3288 | #else | |
3289 | if (base_name.back() != '\\') { | |
3290 | base_name += '\\'; | |
3291 | } | |
3292 | #endif | |
3293 | } | |
1e59de90 | 3294 | return base_name + std::to_string(id); |
7c673cae FG |
3295 | } |
3296 | ||
  // Cross-check the benchmark DB against a reference DB opened read-only
  // from `truth_db_name`: first every truth key must be retrievable via Get()
  // with a matching value, then a paired forward scan verifies the DB has no
  // extra entries. All checks use assert(), so this is only effective in
  // non-NDEBUG builds; a mismatch aborts the process.
  void VerifyDBFromDB(std::string& truth_db_name) {
    DBWithColumnFamilies truth_db;
    auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db);
    if (!s.ok()) {
      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
      exit(1);
    }
    ReadOptions ro;
    ro.total_order_seek = true;
    std::unique_ptr<Iterator> truth_iter(truth_db.db->NewIterator(ro));
    std::unique_ptr<Iterator> db_iter(db_.db->NewIterator(ro));
    // Verify that all the key/values in truth_db are retrievable in db with
    // ::Get
    fprintf(stderr, "Verifying db >= truth_db with ::Get...\n");
    for (truth_iter->SeekToFirst(); truth_iter->Valid(); truth_iter->Next()) {
      std::string value;
      s = db_.db->Get(ro, truth_iter->key(), &value);
      assert(s.ok());
      // TODO(myabandeh): provide debugging hints
      assert(Slice(value) == truth_iter->value());
    }
    // Verify that the db iterator does not give any extra key/value
    fprintf(stderr, "Verifying db == truth_db...\n");
    for (db_iter->SeekToFirst(), truth_iter->SeekToFirst(); db_iter->Valid();
         db_iter->Next(), truth_iter->Next()) {
      assert(truth_iter->Valid());
      assert(truth_iter->value() == db_iter->value());
    }
    // No more key should be left unchecked in truth_db
    assert(!truth_iter->Valid());
    fprintf(stderr, "...Verified\n");
  }
7c673cae | 3329 | |
20effc67 | 3330 | void ErrorExit() { |
1e59de90 | 3331 | DeleteDBs(); |
20effc67 TL |
3332 | exit(1); |
3333 | } | |
3334 | ||
7c673cae FG |
3335 | void Run() { |
3336 | if (!SanityCheck()) { | |
20effc67 | 3337 | ErrorExit(); |
7c673cae FG |
3338 | } |
3339 | Open(&open_options_); | |
1e59de90 | 3340 | PrintHeader(open_options_); |
7c673cae FG |
3341 | std::stringstream benchmark_stream(FLAGS_benchmarks); |
3342 | std::string name; | |
3343 | std::unique_ptr<ExpiredTimeFilter> filter; | |
3344 | while (std::getline(benchmark_stream, name, ',')) { | |
3345 | // Sanitize parameters | |
3346 | num_ = FLAGS_num; | |
3347 | reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads); | |
3348 | writes_ = (FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes); | |
3349 | deletes_ = (FLAGS_deletes < 0 ? FLAGS_num : FLAGS_deletes); | |
f67539c2 | 3350 | value_size = FLAGS_value_size; |
7c673cae FG |
3351 | key_size_ = FLAGS_key_size; |
3352 | entries_per_batch_ = FLAGS_batch_size; | |
494da23a | 3353 | writes_before_delete_range_ = FLAGS_writes_before_delete_range; |
7c673cae FG |
3354 | writes_per_range_tombstone_ = FLAGS_writes_per_range_tombstone; |
3355 | range_tombstone_width_ = FLAGS_range_tombstone_width; | |
3356 | max_num_range_tombstones_ = FLAGS_max_num_range_tombstones; | |
3357 | write_options_ = WriteOptions(); | |
3358 | read_random_exp_range_ = FLAGS_read_random_exp_range; | |
3359 | if (FLAGS_sync) { | |
3360 | write_options_.sync = true; | |
3361 | } | |
3362 | write_options_.disableWAL = FLAGS_disable_wal; | |
1e59de90 TL |
3363 | write_options_.rate_limiter_priority = |
3364 | FLAGS_rate_limit_auto_wal_flush ? Env::IO_USER : Env::IO_TOTAL; | |
3365 | read_options_ = ReadOptions(FLAGS_verify_checksum, true); | |
3366 | read_options_.total_order_seek = FLAGS_total_order_seek; | |
3367 | read_options_.prefix_same_as_start = FLAGS_prefix_same_as_start; | |
3368 | read_options_.rate_limiter_priority = | |
3369 | FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL; | |
3370 | read_options_.tailing = FLAGS_use_tailing_iterator; | |
3371 | read_options_.readahead_size = FLAGS_readahead_size; | |
3372 | read_options_.adaptive_readahead = FLAGS_adaptive_readahead; | |
3373 | read_options_.async_io = FLAGS_async_io; | |
3374 | read_options_.optimize_multiget_for_io = FLAGS_optimize_multiget_for_io; | |
7c673cae FG |
3375 | |
3376 | void (Benchmark::*method)(ThreadState*) = nullptr; | |
3377 | void (Benchmark::*post_process_method)() = nullptr; | |
3378 | ||
3379 | bool fresh_db = false; | |
3380 | int num_threads = FLAGS_threads; | |
3381 | ||
3382 | int num_repeat = 1; | |
3383 | int num_warmup = 0; | |
3384 | if (!name.empty() && *name.rbegin() == ']') { | |
3385 | auto it = name.find('['); | |
3386 | if (it == std::string::npos) { | |
3387 | fprintf(stderr, "unknown benchmark arguments '%s'\n", name.c_str()); | |
20effc67 | 3388 | ErrorExit(); |
7c673cae FG |
3389 | } |
3390 | std::string args = name.substr(it + 1); | |
3391 | args.resize(args.size() - 1); | |
3392 | name.resize(it); | |
3393 | ||
3394 | std::string bench_arg; | |
3395 | std::stringstream args_stream(args); | |
3396 | while (std::getline(args_stream, bench_arg, '-')) { | |
3397 | if (bench_arg.empty()) { | |
3398 | continue; | |
3399 | } | |
3400 | if (bench_arg[0] == 'X') { | |
3401 | // Repeat the benchmark n times | |
3402 | std::string num_str = bench_arg.substr(1); | |
3403 | num_repeat = std::stoi(num_str); | |
3404 | } else if (bench_arg[0] == 'W') { | |
3405 | // Warm up the benchmark for n times | |
3406 | std::string num_str = bench_arg.substr(1); | |
3407 | num_warmup = std::stoi(num_str); | |
3408 | } | |
3409 | } | |
3410 | } | |
3411 | ||
3412 | // Both fillseqdeterministic and filluniquerandomdeterministic | |
3413 | // fill the levels except the max level with UNIQUE_RANDOM | |
3414 | // and fill the max level with fillseq and filluniquerandom, respectively | |
3415 | if (name == "fillseqdeterministic" || | |
3416 | name == "filluniquerandomdeterministic") { | |
3417 | if (!FLAGS_disable_auto_compactions) { | |
3418 | fprintf(stderr, | |
3419 | "Please disable_auto_compactions in FillDeterministic " | |
3420 | "benchmark\n"); | |
20effc67 | 3421 | ErrorExit(); |
7c673cae FG |
3422 | } |
3423 | if (num_threads > 1) { | |
3424 | fprintf(stderr, | |
3425 | "filldeterministic multithreaded not supported" | |
3426 | ", use 1 thread\n"); | |
3427 | num_threads = 1; | |
3428 | } | |
3429 | fresh_db = true; | |
3430 | if (name == "fillseqdeterministic") { | |
3431 | method = &Benchmark::WriteSeqDeterministic; | |
3432 | } else { | |
3433 | method = &Benchmark::WriteUniqueRandomDeterministic; | |
3434 | } | |
3435 | } else if (name == "fillseq") { | |
3436 | fresh_db = true; | |
3437 | method = &Benchmark::WriteSeq; | |
3438 | } else if (name == "fillbatch") { | |
3439 | fresh_db = true; | |
3440 | entries_per_batch_ = 1000; | |
3441 | method = &Benchmark::WriteSeq; | |
3442 | } else if (name == "fillrandom") { | |
3443 | fresh_db = true; | |
3444 | method = &Benchmark::WriteRandom; | |
1e59de90 TL |
3445 | } else if (name == "filluniquerandom" || |
3446 | name == "fillanddeleteuniquerandom") { | |
7c673cae FG |
3447 | fresh_db = true; |
3448 | if (num_threads > 1) { | |
3449 | fprintf(stderr, | |
1e59de90 TL |
3450 | "filluniquerandom and fillanddeleteuniquerandom " |
3451 | "multithreaded not supported, use 1 thread"); | |
7c673cae FG |
3452 | num_threads = 1; |
3453 | } | |
3454 | method = &Benchmark::WriteUniqueRandom; | |
3455 | } else if (name == "overwrite") { | |
3456 | method = &Benchmark::WriteRandom; | |
3457 | } else if (name == "fillsync") { | |
3458 | fresh_db = true; | |
3459 | num_ /= 1000; | |
3460 | write_options_.sync = true; | |
3461 | method = &Benchmark::WriteRandom; | |
3462 | } else if (name == "fill100K") { | |
3463 | fresh_db = true; | |
3464 | num_ /= 1000; | |
f67539c2 | 3465 | value_size = 100 * 1000; |
7c673cae FG |
3466 | method = &Benchmark::WriteRandom; |
3467 | } else if (name == "readseq") { | |
3468 | method = &Benchmark::ReadSequential; | |
f67539c2 TL |
3469 | } else if (name == "readtorowcache") { |
3470 | if (!FLAGS_use_existing_keys || !FLAGS_row_cache_size) { | |
3471 | fprintf(stderr, | |
3472 | "Please set use_existing_keys to true and specify a " | |
3473 | "row cache size in readtorowcache benchmark\n"); | |
20effc67 | 3474 | ErrorExit(); |
f67539c2 TL |
3475 | } |
3476 | method = &Benchmark::ReadToRowCache; | |
7c673cae FG |
3477 | } else if (name == "readtocache") { |
3478 | method = &Benchmark::ReadSequential; | |
3479 | num_threads = 1; | |
3480 | reads_ = num_; | |
3481 | } else if (name == "readreverse") { | |
3482 | method = &Benchmark::ReadReverse; | |
3483 | } else if (name == "readrandom") { | |
f67539c2 TL |
3484 | if (FLAGS_multiread_stride) { |
3485 | fprintf(stderr, "entries_per_batch = %" PRIi64 "\n", | |
3486 | entries_per_batch_); | |
3487 | } | |
7c673cae FG |
3488 | method = &Benchmark::ReadRandom; |
3489 | } else if (name == "readrandomfast") { | |
3490 | method = &Benchmark::ReadRandomFast; | |
3491 | } else if (name == "multireadrandom") { | |
3492 | fprintf(stderr, "entries_per_batch = %" PRIi64 "\n", | |
3493 | entries_per_batch_); | |
3494 | method = &Benchmark::MultiReadRandom; | |
1e59de90 TL |
3495 | } else if (name == "multireadwhilewriting") { |
3496 | fprintf(stderr, "entries_per_batch = %" PRIi64 "\n", | |
3497 | entries_per_batch_); | |
3498 | num_threads++; | |
3499 | method = &Benchmark::MultiReadWhileWriting; | |
20effc67 TL |
3500 | } else if (name == "approximatesizerandom") { |
3501 | fprintf(stderr, "entries_per_batch = %" PRIi64 "\n", | |
3502 | entries_per_batch_); | |
3503 | method = &Benchmark::ApproximateSizeRandom; | |
494da23a TL |
3504 | } else if (name == "mixgraph") { |
3505 | method = &Benchmark::MixGraph; | |
7c673cae FG |
3506 | } else if (name == "readmissing") { |
3507 | ++key_size_; | |
3508 | method = &Benchmark::ReadRandom; | |
3509 | } else if (name == "newiterator") { | |
3510 | method = &Benchmark::IteratorCreation; | |
3511 | } else if (name == "newiteratorwhilewriting") { | |
3512 | num_threads++; // Add extra thread for writing | |
3513 | method = &Benchmark::IteratorCreationWhileWriting; | |
3514 | } else if (name == "seekrandom") { | |
3515 | method = &Benchmark::SeekRandom; | |
3516 | } else if (name == "seekrandomwhilewriting") { | |
3517 | num_threads++; // Add extra thread for writing | |
3518 | method = &Benchmark::SeekRandomWhileWriting; | |
3519 | } else if (name == "seekrandomwhilemerging") { | |
3520 | num_threads++; // Add extra thread for merging | |
3521 | method = &Benchmark::SeekRandomWhileMerging; | |
3522 | } else if (name == "readrandomsmall") { | |
3523 | reads_ /= 1000; | |
3524 | method = &Benchmark::ReadRandom; | |
3525 | } else if (name == "deleteseq") { | |
3526 | method = &Benchmark::DeleteSeq; | |
3527 | } else if (name == "deleterandom") { | |
3528 | method = &Benchmark::DeleteRandom; | |
3529 | } else if (name == "readwhilewriting") { | |
3530 | num_threads++; // Add extra thread for writing | |
3531 | method = &Benchmark::ReadWhileWriting; | |
3532 | } else if (name == "readwhilemerging") { | |
3533 | num_threads++; // Add extra thread for writing | |
3534 | method = &Benchmark::ReadWhileMerging; | |
11fdf7f2 TL |
3535 | } else if (name == "readwhilescanning") { |
3536 | num_threads++; // Add extra thread for scaning | |
3537 | method = &Benchmark::ReadWhileScanning; | |
7c673cae FG |
3538 | } else if (name == "readrandomwriterandom") { |
3539 | method = &Benchmark::ReadRandomWriteRandom; | |
3540 | } else if (name == "readrandommergerandom") { | |
3541 | if (FLAGS_merge_operator.empty()) { | |
3542 | fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n", | |
3543 | name.c_str()); | |
20effc67 | 3544 | ErrorExit(); |
7c673cae FG |
3545 | } |
3546 | method = &Benchmark::ReadRandomMergeRandom; | |
3547 | } else if (name == "updaterandom") { | |
3548 | method = &Benchmark::UpdateRandom; | |
11fdf7f2 TL |
3549 | } else if (name == "xorupdaterandom") { |
3550 | method = &Benchmark::XORUpdateRandom; | |
7c673cae FG |
3551 | } else if (name == "appendrandom") { |
3552 | method = &Benchmark::AppendRandom; | |
3553 | } else if (name == "mergerandom") { | |
3554 | if (FLAGS_merge_operator.empty()) { | |
3555 | fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n", | |
3556 | name.c_str()); | |
3557 | exit(1); | |
3558 | } | |
3559 | method = &Benchmark::MergeRandom; | |
3560 | } else if (name == "randomwithverify") { | |
3561 | method = &Benchmark::RandomWithVerify; | |
3562 | } else if (name == "fillseekseq") { | |
3563 | method = &Benchmark::WriteSeqSeekSeq; | |
3564 | } else if (name == "compact") { | |
3565 | method = &Benchmark::Compact; | |
11fdf7f2 TL |
3566 | } else if (name == "compactall") { |
3567 | CompactAll(); | |
1e59de90 TL |
3568 | #ifndef ROCKSDB_LITE |
3569 | } else if (name == "compact0") { | |
3570 | CompactLevel(0); | |
3571 | } else if (name == "compact1") { | |
3572 | CompactLevel(1); | |
3573 | } else if (name == "waitforcompaction") { | |
3574 | WaitForCompaction(); | |
3575 | #endif | |
3576 | } else if (name == "flush") { | |
3577 | Flush(); | |
7c673cae FG |
3578 | } else if (name == "crc32c") { |
3579 | method = &Benchmark::Crc32c; | |
3580 | } else if (name == "xxhash") { | |
3581 | method = &Benchmark::xxHash; | |
1e59de90 TL |
3582 | } else if (name == "xxhash64") { |
3583 | method = &Benchmark::xxHash64; | |
3584 | } else if (name == "xxh3") { | |
3585 | method = &Benchmark::xxh3; | |
7c673cae FG |
3586 | } else if (name == "acquireload") { |
3587 | method = &Benchmark::AcquireLoad; | |
3588 | } else if (name == "compress") { | |
3589 | method = &Benchmark::Compress; | |
3590 | } else if (name == "uncompress") { | |
3591 | method = &Benchmark::Uncompress; | |
3592 | #ifndef ROCKSDB_LITE | |
3593 | } else if (name == "randomtransaction") { | |
3594 | method = &Benchmark::RandomTransaction; | |
3595 | post_process_method = &Benchmark::RandomTransactionVerify; | |
3596 | #endif // ROCKSDB_LITE | |
3597 | } else if (name == "randomreplacekeys") { | |
3598 | fresh_db = true; | |
3599 | method = &Benchmark::RandomReplaceKeys; | |
3600 | } else if (name == "timeseries") { | |
3601 | timestamp_emulator_.reset(new TimestampEmulator()); | |
3602 | if (FLAGS_expire_style == "compaction_filter") { | |
3603 | filter.reset(new ExpiredTimeFilter(timestamp_emulator_)); | |
3604 | fprintf(stdout, "Compaction filter is used to remove expired data"); | |
3605 | open_options_.compaction_filter = filter.get(); | |
3606 | } | |
3607 | fresh_db = true; | |
3608 | method = &Benchmark::TimeSeries; | |
1e59de90 TL |
3609 | } else if (name == "block_cache_entry_stats") { |
3610 | // DB::Properties::kBlockCacheEntryStats | |
3611 | PrintStats("rocksdb.block-cache-entry-stats"); | |
7c673cae FG |
3612 | } else if (name == "stats") { |
3613 | PrintStats("rocksdb.stats"); | |
3614 | } else if (name == "resetstats") { | |
3615 | ResetStats(); | |
3616 | } else if (name == "verify") { | |
3617 | VerifyDBFromDB(FLAGS_truth_db); | |
3618 | } else if (name == "levelstats") { | |
3619 | PrintStats("rocksdb.levelstats"); | |
1e59de90 TL |
3620 | } else if (name == "memstats") { |
3621 | std::vector<std::string> keys{"rocksdb.num-immutable-mem-table", | |
3622 | "rocksdb.cur-size-active-mem-table", | |
3623 | "rocksdb.cur-size-all-mem-tables", | |
3624 | "rocksdb.size-all-mem-tables", | |
3625 | "rocksdb.num-entries-active-mem-table", | |
3626 | "rocksdb.num-entries-imm-mem-tables"}; | |
3627 | PrintStats(keys); | |
7c673cae FG |
3628 | } else if (name == "sstables") { |
3629 | PrintStats("rocksdb.sstables"); | |
f67539c2 TL |
3630 | } else if (name == "stats_history") { |
3631 | PrintStatsHistory(); | |
1e59de90 | 3632 | #ifndef ROCKSDB_LITE |
11fdf7f2 TL |
3633 | } else if (name == "replay") { |
3634 | if (num_threads > 1) { | |
3635 | fprintf(stderr, "Multi-threaded replay is not yet supported\n"); | |
20effc67 | 3636 | ErrorExit(); |
11fdf7f2 TL |
3637 | } |
3638 | if (FLAGS_trace_file == "") { | |
3639 | fprintf(stderr, "Please set --trace_file to be replayed from\n"); | |
20effc67 | 3640 | ErrorExit(); |
11fdf7f2 TL |
3641 | } |
3642 | method = &Benchmark::Replay; | |
1e59de90 | 3643 | #endif // ROCKSDB_LITE |
f67539c2 TL |
3644 | } else if (name == "getmergeoperands") { |
3645 | method = &Benchmark::GetMergeOperands; | |
1e59de90 TL |
3646 | #ifndef ROCKSDB_LITE |
3647 | } else if (name == "verifychecksum") { | |
3648 | method = &Benchmark::VerifyChecksum; | |
3649 | } else if (name == "verifyfilechecksums") { | |
3650 | method = &Benchmark::VerifyFileChecksums; | |
3651 | #endif // ROCKSDB_LITE | |
3652 | } else if (name == "readrandomoperands") { | |
3653 | read_operands_ = true; | |
3654 | method = &Benchmark::ReadRandom; | |
3655 | #ifndef ROCKSDB_LITE | |
3656 | } else if (name == "backup") { | |
3657 | method = &Benchmark::Backup; | |
3658 | } else if (name == "restore") { | |
3659 | method = &Benchmark::Restore; | |
3660 | #endif | |
7c673cae FG |
3661 | } else if (!name.empty()) { // No error message for empty name |
3662 | fprintf(stderr, "unknown benchmark '%s'\n", name.c_str()); | |
20effc67 | 3663 | ErrorExit(); |
7c673cae FG |
3664 | } |
3665 | ||
3666 | if (fresh_db) { | |
3667 | if (FLAGS_use_existing_db) { | |
3668 | fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n", | |
3669 | name.c_str()); | |
3670 | method = nullptr; | |
3671 | } else { | |
3672 | if (db_.db != nullptr) { | |
3673 | db_.DeleteDBs(); | |
3674 | DestroyDB(FLAGS_db, open_options_); | |
3675 | } | |
3676 | Options options = open_options_; | |
3677 | for (size_t i = 0; i < multi_dbs_.size(); i++) { | |
3678 | delete multi_dbs_[i].db; | |
3679 | if (!open_options_.wal_dir.empty()) { | |
3680 | options.wal_dir = GetPathForMultiple(open_options_.wal_dir, i); | |
3681 | } | |
3682 | DestroyDB(GetPathForMultiple(FLAGS_db, i), options); | |
3683 | } | |
3684 | multi_dbs_.clear(); | |
3685 | } | |
3686 | Open(&open_options_); // use open_options for the last accessed | |
3687 | } | |
3688 | ||
3689 | if (method != nullptr) { | |
3690 | fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str()); | |
11fdf7f2 TL |
3691 | |
3692 | #ifndef ROCKSDB_LITE | |
1e59de90 TL |
3693 | if (name == "backup") { |
3694 | std::cout << "Backup path: [" << FLAGS_backup_dir << "]" << std::endl; | |
3695 | } else if (name == "restore") { | |
3696 | std::cout << "Backup path: [" << FLAGS_backup_dir << "]" << std::endl; | |
3697 | std::cout << "Restore path: [" << FLAGS_restore_dir << "]" | |
3698 | << std::endl; | |
3699 | } | |
11fdf7f2 TL |
3700 | // A trace_file option can be provided both for trace and replay |
3701 | // operations. But db_bench does not support tracing and replaying at | |
3702 | // the same time, for now. So, start tracing only when it is not a | |
3703 | // replay. | |
3704 | if (FLAGS_trace_file != "" && name != "replay") { | |
3705 | std::unique_ptr<TraceWriter> trace_writer; | |
3706 | Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(), | |
3707 | FLAGS_trace_file, &trace_writer); | |
3708 | if (!s.ok()) { | |
3709 | fprintf(stderr, "Encountered an error starting a trace, %s\n", | |
3710 | s.ToString().c_str()); | |
20effc67 | 3711 | ErrorExit(); |
11fdf7f2 TL |
3712 | } |
3713 | s = db_.db->StartTrace(trace_options_, std::move(trace_writer)); | |
3714 | if (!s.ok()) { | |
3715 | fprintf(stderr, "Encountered an error starting a trace, %s\n", | |
3716 | s.ToString().c_str()); | |
20effc67 | 3717 | ErrorExit(); |
11fdf7f2 TL |
3718 | } |
3719 | fprintf(stdout, "Tracing the workload to: [%s]\n", | |
3720 | FLAGS_trace_file.c_str()); | |
3721 | } | |
f67539c2 TL |
3722 | // Start block cache tracing. |
3723 | if (!FLAGS_block_cache_trace_file.empty()) { | |
3724 | // Sanity checks. | |
3725 | if (FLAGS_block_cache_trace_sampling_frequency <= 0) { | |
3726 | fprintf(stderr, | |
3727 | "Block cache trace sampling frequency must be higher than " | |
3728 | "0.\n"); | |
20effc67 | 3729 | ErrorExit(); |
f67539c2 TL |
3730 | } |
3731 | if (FLAGS_block_cache_trace_max_trace_file_size_in_bytes <= 0) { | |
3732 | fprintf(stderr, | |
3733 | "The maximum file size for block cache tracing must be " | |
3734 | "higher than 0.\n"); | |
20effc67 | 3735 | ErrorExit(); |
f67539c2 TL |
3736 | } |
3737 | block_cache_trace_options_.max_trace_file_size = | |
3738 | FLAGS_block_cache_trace_max_trace_file_size_in_bytes; | |
3739 | block_cache_trace_options_.sampling_frequency = | |
3740 | FLAGS_block_cache_trace_sampling_frequency; | |
3741 | std::unique_ptr<TraceWriter> block_cache_trace_writer; | |
3742 | Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(), | |
3743 | FLAGS_block_cache_trace_file, | |
3744 | &block_cache_trace_writer); | |
3745 | if (!s.ok()) { | |
3746 | fprintf(stderr, | |
3747 | "Encountered an error when creating trace writer, %s\n", | |
3748 | s.ToString().c_str()); | |
20effc67 | 3749 | ErrorExit(); |
f67539c2 TL |
3750 | } |
3751 | s = db_.db->StartBlockCacheTrace(block_cache_trace_options_, | |
3752 | std::move(block_cache_trace_writer)); | |
3753 | if (!s.ok()) { | |
3754 | fprintf( | |
3755 | stderr, | |
3756 | "Encountered an error when starting block cache tracing, %s\n", | |
3757 | s.ToString().c_str()); | |
20effc67 | 3758 | ErrorExit(); |
f67539c2 TL |
3759 | } |
3760 | fprintf(stdout, "Tracing block cache accesses to: [%s]\n", | |
3761 | FLAGS_block_cache_trace_file.c_str()); | |
3762 | } | |
11fdf7f2 TL |
3763 | #endif // ROCKSDB_LITE |
3764 | ||
7c673cae FG |
3765 | if (num_warmup > 0) { |
3766 | printf("Warming up benchmark by running %d times\n", num_warmup); | |
3767 | } | |
3768 | ||
3769 | for (int i = 0; i < num_warmup; i++) { | |
3770 | RunBenchmark(num_threads, name, method); | |
3771 | } | |
3772 | ||
3773 | if (num_repeat > 1) { | |
3774 | printf("Running benchmark for %d times\n", num_repeat); | |
3775 | } | |
3776 | ||
3777 | CombinedStats combined_stats; | |
3778 | for (int i = 0; i < num_repeat; i++) { | |
3779 | Stats stats = RunBenchmark(num_threads, name, method); | |
3780 | combined_stats.AddStats(stats); | |
1e59de90 TL |
3781 | if (FLAGS_confidence_interval_only) { |
3782 | combined_stats.ReportWithConfidenceIntervals(name); | |
3783 | } else { | |
3784 | combined_stats.Report(name); | |
3785 | } | |
7c673cae FG |
3786 | } |
3787 | if (num_repeat > 1) { | |
1e59de90 | 3788 | combined_stats.ReportFinal(name); |
7c673cae FG |
3789 | } |
3790 | } | |
3791 | if (post_process_method != nullptr) { | |
3792 | (this->*post_process_method)(); | |
3793 | } | |
3794 | } | |
11fdf7f2 | 3795 | |
f67539c2 TL |
3796 | if (secondary_update_thread_) { |
3797 | secondary_update_stopped_.store(1, std::memory_order_relaxed); | |
3798 | secondary_update_thread_->join(); | |
3799 | secondary_update_thread_.reset(); | |
3800 | } | |
3801 | ||
11fdf7f2 TL |
3802 | #ifndef ROCKSDB_LITE |
3803 | if (name != "replay" && FLAGS_trace_file != "") { | |
3804 | Status s = db_.db->EndTrace(); | |
3805 | if (!s.ok()) { | |
3806 | fprintf(stderr, "Encountered an error ending the trace, %s\n", | |
3807 | s.ToString().c_str()); | |
3808 | } | |
3809 | } | |
f67539c2 TL |
3810 | if (!FLAGS_block_cache_trace_file.empty()) { |
3811 | Status s = db_.db->EndBlockCacheTrace(); | |
3812 | if (!s.ok()) { | |
3813 | fprintf(stderr, | |
3814 | "Encountered an error ending the block cache tracing, %s\n", | |
3815 | s.ToString().c_str()); | |
3816 | } | |
3817 | } | |
11fdf7f2 TL |
3818 | #endif // ROCKSDB_LITE |
3819 | ||
7c673cae FG |
3820 | if (FLAGS_statistics) { |
3821 | fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str()); | |
3822 | } | |
3823 | if (FLAGS_simcache_size >= 0) { | |
20effc67 TL |
3824 | fprintf( |
3825 | stdout, "SIMULATOR CACHE STATISTICS:\n%s\n", | |
3826 | static_cast_with_check<SimCache>(cache_.get())->ToString().c_str()); | |
7c673cae | 3827 | } |
f67539c2 TL |
3828 | |
3829 | #ifndef ROCKSDB_LITE | |
3830 | if (FLAGS_use_secondary_db) { | |
3831 | fprintf(stdout, "Secondary instance updated %" PRIu64 " times.\n", | |
3832 | secondary_db_updates_); | |
3833 | } | |
3834 | #endif // ROCKSDB_LITE | |
7c673cae FG |
3835 | } |
3836 | ||
3837 | private: | |
3838 | std::shared_ptr<TimestampEmulator> timestamp_emulator_; | |
f67539c2 TL |
3839 | std::unique_ptr<port::Thread> secondary_update_thread_; |
3840 | std::atomic<int> secondary_update_stopped_{0}; | |
3841 | #ifndef ROCKSDB_LITE | |
3842 | uint64_t secondary_db_updates_ = 0; | |
3843 | #endif // ROCKSDB_LITE | |
7c673cae FG |
// Per-thread argument bundle handed to ThreadBody via Env::StartThread.
struct ThreadArg {
  Benchmark* bm;        // benchmark object whose method is invoked
  SharedState* shared;  // state shared by all threads of one benchmark run
  ThreadState* thread;  // per-thread id and stats
  void (Benchmark::*method)(ThreadState*);  // benchmark method to execute
};
3850 | ||
// Entry point for each benchmark worker thread (started by RunBenchmark via
// Env::StartThread). `v` is a ThreadArg naming the Benchmark method to run
// and the SharedState used to rendezvous with the coordinating thread.
static void ThreadBody(void* v) {
  ThreadArg* arg = reinterpret_cast<ThreadArg*>(v);
  SharedState* shared = arg->shared;
  ThreadState* thread = arg->thread;
  {
    // Announce initialization, then block until the coordinator flips
    // shared->start. The last thread to initialize wakes the coordinator.
    MutexLock l(&shared->mu);
    shared->num_initialized++;
    if (shared->num_initialized >= shared->total) {
      shared->cv.SignalAll();
    }
    while (!shared->start) {
      shared->cv.Wait();
    }
  }

  SetPerfLevel(static_cast<PerfLevel>(shared->perf_level));
  perf_context.EnablePerLevelPerfContext();
  thread->stats.Start(thread->tid);
  // Run the selected benchmark method on this thread.
  (arg->bm->*(arg->method))(thread);
  if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
    // Attach the accumulated perf context to this thread's report.
    thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                             get_perf_context()->ToString());
  }
  thread->stats.Stop();

  {
    // Report completion; the last thread to finish wakes the coordinator.
    MutexLock l(&shared->mu);
    shared->num_done++;
    if (shared->num_done >= shared->total) {
      shared->cv.SignalAll();
    }
  }
}
3884 | ||
// Runs `method` on `n` threads for the benchmark called `name` and returns
// the merged per-thread Stats. Coordinates thread startup/shutdown through
// SharedState's mutex/condvar: all threads initialize, start together, and
// the coordinator waits for all of them to finish.
Stats RunBenchmark(int n, Slice name,
                   void (Benchmark::*method)(ThreadState*)) {
  SharedState shared;
  shared.total = n;
  shared.num_initialized = 0;
  shared.num_done = 0;
  shared.start = false;
  // Optional rate limiters shared by all worker threads.
  if (FLAGS_benchmark_write_rate_limit > 0) {
    shared.write_rate_limiter.reset(
        NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
  }
  if (FLAGS_benchmark_read_rate_limit > 0) {
    shared.read_rate_limiter.reset(NewGenericRateLimiter(
        FLAGS_benchmark_read_rate_limit, 100000 /* refill_period_us */,
        10 /* fairness */, RateLimiter::Mode::kReadsOnly));
  }

  // Optional periodic progress reporter; must outlive the worker threads.
  std::unique_ptr<ReporterAgent> reporter_agent;
  if (FLAGS_report_interval_seconds > 0) {
    reporter_agent.reset(new ReporterAgent(FLAGS_env, FLAGS_report_file,
                                           FLAGS_report_interval_seconds));
  }

  ThreadArg* arg = new ThreadArg[n];

  for (int i = 0; i < n; i++) {
#ifdef NUMA
    if (FLAGS_enable_numa) {
      // Performs a local allocation of memory to threads in numa node.
      int n_nodes = numa_num_task_nodes();  // Number of nodes in NUMA.
      numa_exit_on_error = 1;
      int numa_node = i % n_nodes;
      bitmask* nodes = numa_allocate_nodemask();
      numa_bitmask_clearall(nodes);
      numa_bitmask_setbit(nodes, numa_node);
      // numa_bind() call binds the process to the node and these
      // properties are passed on to the thread that is created in
      // StartThread method called later in the loop.
      numa_bind(nodes);
      numa_set_strict(1);
      numa_free_nodemask(nodes);
    }
#endif
    arg[i].bm = this;
    arg[i].method = method;
    arg[i].shared = &shared;
    total_thread_count_++;
    arg[i].thread = new ThreadState(i, total_thread_count_);
    arg[i].thread->stats.SetReporterAgent(reporter_agent.get());
    arg[i].thread->shared = &shared;
    FLAGS_env->StartThread(ThreadBody, &arg[i]);
  }

  // Wait until every thread has checked in, then release them all at once
  // so they start measuring simultaneously.
  shared.mu.Lock();
  while (shared.num_initialized < n) {
    shared.cv.Wait();
  }

  shared.start = true;
  shared.cv.SignalAll();
  while (shared.num_done < n) {
    shared.cv.Wait();
  }
  shared.mu.Unlock();

  // Stats for some threads can be excluded.
  Stats merge_stats;
  for (int i = 0; i < n; i++) {
    merge_stats.Merge(arg[i].thread->stats);
  }
  merge_stats.Report(name);

  for (int i = 0; i < n; i++) {
    delete arg[i].thread;
  }
  delete[] arg;

  return merge_stats;
}
3964 | ||
1e59de90 TL |
3965 | template <OperationType kOpType, typename FnType, typename... Args> |
3966 | static inline void ChecksumBenchmark(FnType fn, ThreadState* thread, | |
3967 | Args... args) { | |
3968 | const int size = FLAGS_block_size; // use --block_size option for db_bench | |
3969 | std::string labels = "(" + std::to_string(FLAGS_block_size) + " per op)"; | |
11fdf7f2 TL |
3970 | const char* label = labels.c_str(); |
3971 | ||
7c673cae | 3972 | std::string data(size, 'x'); |
1e59de90 TL |
3973 | uint64_t bytes = 0; |
3974 | uint32_t val = 0; | |
3975 | while (bytes < 5000U * uint64_t{1048576}) { // ~5GB | |
3976 | val += static_cast<uint32_t>(fn(data.data(), size, args...)); | |
3977 | thread->stats.FinishedOps(nullptr, nullptr, 1, kOpType); | |
7c673cae FG |
3978 | bytes += size; |
3979 | } | |
3980 | // Print so result is not dead | |
1e59de90 | 3981 | fprintf(stderr, "... val=0x%x\r", static_cast<unsigned int>(val)); |
7c673cae FG |
3982 | |
3983 | thread->stats.AddBytes(bytes); | |
3984 | thread->stats.AddMessage(label); | |
3985 | } | |
3986 | ||
1e59de90 TL |
// Benchmark CRC32c checksumming over --block_size sized blocks (~5GB total).
void Crc32c(ThreadState* thread) {
  ChecksumBenchmark<kCrc>(crc32c::Value, thread);
}
3990 | ||
// Benchmark 32-bit xxHash over --block_size sized blocks, fixed seed 0.
void xxHash(ThreadState* thread) {
  ChecksumBenchmark<kHash>(XXH32, thread, /*seed*/ 0);
}
7c673cae | 3994 | |
1e59de90 TL |
// Benchmark 64-bit xxHash over --block_size sized blocks, fixed seed 0.
void xxHash64(ThreadState* thread) {
  ChecksumBenchmark<kHash>(XXH64, thread, /*seed*/ 0);
}
3998 | ||
// Benchmark XXH3 (64-bit variant, default seed) over --block_size blocks.
void xxh3(ThreadState* thread) {
  ChecksumBenchmark<kHash>(XXH3_64bits, thread);
}
4002 | ||
4003 | void AcquireLoad(ThreadState* thread) { | |
4004 | int dummy; | |
4005 | std::atomic<void*> ap(&dummy); | |
4006 | int count = 0; | |
1e59de90 | 4007 | void* ptr = nullptr; |
7c673cae FG |
4008 | thread->stats.AddMessage("(each op is 1000 loads)"); |
4009 | while (count < 100000) { | |
4010 | for (int i = 0; i < 1000; i++) { | |
4011 | ptr = ap.load(std::memory_order_acquire); | |
4012 | } | |
4013 | count++; | |
4014 | thread->stats.FinishedOps(nullptr, nullptr, 1, kOthers); | |
4015 | } | |
4016 | if (ptr == nullptr) exit(1); // Disable unused variable warning. | |
4017 | } | |
4018 | ||
1e59de90 | 4019 | void Compress(ThreadState* thread) { |
7c673cae FG |
4020 | RandomGenerator gen; |
4021 | Slice input = gen.Generate(FLAGS_block_size); | |
4022 | int64_t bytes = 0; | |
4023 | int64_t produced = 0; | |
4024 | bool ok = true; | |
4025 | std::string compressed; | |
494da23a TL |
4026 | CompressionOptions opts; |
4027 | CompressionContext context(FLAGS_compression_type_e); | |
4028 | CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), | |
4029 | FLAGS_compression_type_e, | |
4030 | FLAGS_sample_for_compression); | |
7c673cae FG |
4031 | // Compress 1G |
4032 | while (ok && bytes < int64_t(1) << 30) { | |
4033 | compressed.clear(); | |
494da23a | 4034 | ok = CompressSlice(info, input, &compressed); |
7c673cae FG |
4035 | produced += compressed.size(); |
4036 | bytes += input.size(); | |
4037 | thread->stats.FinishedOps(nullptr, nullptr, 1, kCompress); | |
4038 | } | |
4039 | ||
4040 | if (!ok) { | |
4041 | thread->stats.AddMessage("(compression failure)"); | |
4042 | } else { | |
4043 | char buf[340]; | |
4044 | snprintf(buf, sizeof(buf), "(output: %.1f%%)", | |
4045 | (produced * 100.0) / bytes); | |
4046 | thread->stats.AddMessage(buf); | |
4047 | thread->stats.AddBytes(bytes); | |
4048 | } | |
4049 | } | |
4050 | ||
// Measures raw decompression throughput: compresses one --block_size block
// once, then repeatedly decompresses it until ~1GB (counted in input-block
// bytes) has been processed.
void Uncompress(ThreadState* thread) {
  RandomGenerator gen;
  Slice input = gen.Generate(FLAGS_block_size);
  std::string compressed;

  // Compression state is needed only to produce the test payload once;
  // decompression state is what the loop actually exercises.
  CompressionContext compression_ctx(FLAGS_compression_type_e);
  CompressionOptions compression_opts;
  CompressionInfo compression_info(
      compression_opts, compression_ctx, CompressionDict::GetEmptyDict(),
      FLAGS_compression_type_e, FLAGS_sample_for_compression);
  UncompressionContext uncompression_ctx(FLAGS_compression_type_e);
  UncompressionInfo uncompression_info(uncompression_ctx,
                                       UncompressionDict::GetEmptyDict(),
                                       FLAGS_compression_type_e);

  bool ok = CompressSlice(compression_info, input, &compressed);
  int64_t bytes = 0;
  size_t uncompressed_size = 0;
  while (ok && bytes < 1024 * 1048576) {
    // Format version 2 matches the block-based table's compressed layout.
    constexpr uint32_t compress_format_version = 2;

    CacheAllocationPtr uncompressed = UncompressData(
        uncompression_info, compressed.data(), compressed.size(),
        &uncompressed_size, compress_format_version);

    // A null result signals decompression failure and terminates the loop.
    ok = uncompressed.get() != nullptr;
    bytes += input.size();
    thread->stats.FinishedOps(nullptr, nullptr, 1, kUncompress);
  }

  if (!ok) {
    thread->stats.AddMessage("(compression failure)");
  } else {
    thread->stats.AddBytes(bytes);
  }
}
4087 | ||
// Returns true if the options is initialized from the specified
// options file.
// Exits the process if --options_file is set but cannot be loaded; returns
// false when no options file was specified (or in ROCKSDB_LITE builds).
bool InitializeOptionsFromFile(Options* opts) {
#ifndef ROCKSDB_LITE
  printf("Initializing RocksDB Options from the specified file\n");
  DBOptions db_opts;
  std::vector<ColumnFamilyDescriptor> cf_descs;
  if (FLAGS_options_file != "") {
    auto s = LoadOptionsFromFile(FLAGS_options_file, FLAGS_env, &db_opts,
                                 &cf_descs);
    // Env is not serialized in the options file; restore it explicitly.
    db_opts.env = FLAGS_env;
    if (s.ok()) {
      // Build the final Options from the first column family's options.
      // NOTE(review): assumes cf_descs is non-empty on success — presumably
      // LoadOptionsFromFile always yields the default CF; confirm.
      *opts = Options(db_opts, cf_descs[0].options);
      return true;
    }
    fprintf(stderr, "Unable to load options file %s --- %s\n",
            FLAGS_options_file.c_str(), s.ToString().c_str());
    exit(1);
  }
#else
  (void)opts;
#endif
  return false;
}
4112 | ||
4113 | void InitializeOptionsFromFlags(Options* opts) { | |
4114 | printf("Initializing RocksDB Options from command-line flags\n"); | |
4115 | Options& options = *opts; | |
1e59de90 TL |
4116 | ConfigOptions config_options(options); |
4117 | config_options.ignore_unsupported_options = false; | |
7c673cae FG |
4118 | |
4119 | assert(db_.db == nullptr); | |
4120 | ||
f67539c2 | 4121 | options.env = FLAGS_env; |
1e59de90 TL |
4122 | options.wal_dir = FLAGS_wal_dir; |
4123 | options.dump_malloc_stats = FLAGS_dump_malloc_stats; | |
4124 | options.stats_dump_period_sec = | |
4125 | static_cast<unsigned int>(FLAGS_stats_dump_period_sec); | |
4126 | options.stats_persist_period_sec = | |
4127 | static_cast<unsigned int>(FLAGS_stats_persist_period_sec); | |
4128 | options.persist_stats_to_disk = FLAGS_persist_stats_to_disk; | |
4129 | options.stats_history_buffer_size = | |
4130 | static_cast<size_t>(FLAGS_stats_history_buffer_size); | |
4131 | options.avoid_flush_during_recovery = FLAGS_avoid_flush_during_recovery; | |
4132 | ||
4133 | options.compression_opts.level = FLAGS_compression_level; | |
4134 | options.compression_opts.max_dict_bytes = FLAGS_compression_max_dict_bytes; | |
4135 | options.compression_opts.zstd_max_train_bytes = | |
4136 | FLAGS_compression_zstd_max_train_bytes; | |
4137 | options.compression_opts.parallel_threads = | |
4138 | FLAGS_compression_parallel_threads; | |
4139 | options.compression_opts.max_dict_buffer_bytes = | |
4140 | FLAGS_compression_max_dict_buffer_bytes; | |
4141 | options.compression_opts.use_zstd_dict_trainer = | |
4142 | FLAGS_compression_use_zstd_dict_trainer; | |
4143 | ||
7c673cae | 4144 | options.max_open_files = FLAGS_open_files; |
11fdf7f2 TL |
4145 | if (FLAGS_cost_write_buffer_to_cache || FLAGS_db_write_buffer_size != 0) { |
4146 | options.write_buffer_manager.reset( | |
4147 | new WriteBufferManager(FLAGS_db_write_buffer_size, cache_)); | |
4148 | } | |
1e59de90 | 4149 | options.arena_block_size = FLAGS_arena_block_size; |
7c673cae FG |
4150 | options.write_buffer_size = FLAGS_write_buffer_size; |
4151 | options.max_write_buffer_number = FLAGS_max_write_buffer_number; | |
4152 | options.min_write_buffer_number_to_merge = | |
1e59de90 | 4153 | FLAGS_min_write_buffer_number_to_merge; |
7c673cae FG |
4154 | options.max_write_buffer_number_to_maintain = |
4155 | FLAGS_max_write_buffer_number_to_maintain; | |
f67539c2 TL |
4156 | options.max_write_buffer_size_to_maintain = |
4157 | FLAGS_max_write_buffer_size_to_maintain; | |
11fdf7f2 | 4158 | options.max_background_jobs = FLAGS_max_background_jobs; |
7c673cae FG |
4159 | options.max_background_compactions = FLAGS_max_background_compactions; |
4160 | options.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions); | |
4161 | options.max_background_flushes = FLAGS_max_background_flushes; | |
4162 | options.compaction_style = FLAGS_compaction_style_e; | |
4163 | options.compaction_pri = FLAGS_compaction_pri_e; | |
4164 | options.allow_mmap_reads = FLAGS_mmap_read; | |
4165 | options.allow_mmap_writes = FLAGS_mmap_write; | |
4166 | options.use_direct_reads = FLAGS_use_direct_reads; | |
4167 | options.use_direct_io_for_flush_and_compaction = | |
4168 | FLAGS_use_direct_io_for_flush_and_compaction; | |
1e59de90 TL |
4169 | options.manual_wal_flush = FLAGS_manual_wal_flush; |
4170 | options.wal_compression = FLAGS_wal_compression_e; | |
7c673cae | 4171 | #ifndef ROCKSDB_LITE |
494da23a | 4172 | options.ttl = FLAGS_fifo_compaction_ttl; |
7c673cae | 4173 | options.compaction_options_fifo = CompactionOptionsFIFO( |
11fdf7f2 | 4174 | FLAGS_fifo_compaction_max_table_files_size_mb * 1024 * 1024, |
494da23a | 4175 | FLAGS_fifo_compaction_allow_compaction); |
1e59de90 | 4176 | options.compaction_options_fifo.age_for_warm = FLAGS_fifo_age_for_warm; |
7c673cae | 4177 | #endif // ROCKSDB_LITE |
1e59de90 | 4178 | options.prefix_extractor = prefix_extractor_; |
7c673cae FG |
4179 | if (FLAGS_use_uint64_comparator) { |
4180 | options.comparator = test::Uint64Comparator(); | |
4181 | if (FLAGS_key_size != 8) { | |
4182 | fprintf(stderr, "Using Uint64 comparator but key size is not 8.\n"); | |
4183 | exit(1); | |
4184 | } | |
4185 | } | |
4186 | if (FLAGS_use_stderr_info_logger) { | |
4187 | options.info_log.reset(new StderrLogger()); | |
4188 | } | |
4189 | options.memtable_huge_page_size = FLAGS_memtable_use_huge_page ? 2048 : 0; | |
4190 | options.memtable_prefix_bloom_size_ratio = FLAGS_memtable_bloom_size_ratio; | |
494da23a | 4191 | options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering; |
7c673cae FG |
4192 | if (FLAGS_memtable_insert_with_hint_prefix_size > 0) { |
4193 | options.memtable_insert_with_hint_prefix_extractor.reset( | |
4194 | NewCappedPrefixTransform( | |
4195 | FLAGS_memtable_insert_with_hint_prefix_size)); | |
4196 | } | |
4197 | options.bloom_locality = FLAGS_bloom_locality; | |
4198 | options.max_file_opening_threads = FLAGS_file_opening_threads; | |
7c673cae | 4199 | options.compaction_readahead_size = FLAGS_compaction_readahead_size; |
f67539c2 | 4200 | options.log_readahead_size = FLAGS_log_readahead_size; |
7c673cae FG |
4201 | options.random_access_max_buffer_size = FLAGS_random_access_max_buffer_size; |
4202 | options.writable_file_max_buffer_size = FLAGS_writable_file_max_buffer_size; | |
4203 | options.use_fsync = FLAGS_use_fsync; | |
4204 | options.num_levels = FLAGS_num_levels; | |
4205 | options.target_file_size_base = FLAGS_target_file_size_base; | |
4206 | options.target_file_size_multiplier = FLAGS_target_file_size_multiplier; | |
4207 | options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base; | |
4208 | options.level_compaction_dynamic_level_bytes = | |
4209 | FLAGS_level_compaction_dynamic_level_bytes; | |
4210 | options.max_bytes_for_level_multiplier = | |
4211 | FLAGS_max_bytes_for_level_multiplier; | |
1e59de90 TL |
4212 | Status s = |
4213 | CreateMemTableRepFactory(config_options, &options.memtable_factory); | |
4214 | if (!s.ok()) { | |
4215 | fprintf(stderr, "Could not create memtable factory: %s\n", | |
4216 | s.ToString().c_str()); | |
4217 | exit(1); | |
4218 | } else if ((FLAGS_prefix_size == 0) && | |
4219 | (options.memtable_factory->IsInstanceOf("prefix_hash") || | |
4220 | options.memtable_factory->IsInstanceOf("hash_linkedlist"))) { | |
4221 | fprintf(stderr, | |
4222 | "prefix_size should be non-zero if PrefixHash or " | |
4223 | "HashLinkedList memtablerep is used\n"); | |
7c673cae | 4224 | exit(1); |
7c673cae FG |
4225 | } |
4226 | if (FLAGS_use_plain_table) { | |
4227 | #ifndef ROCKSDB_LITE | |
1e59de90 TL |
4228 | if (!options.memtable_factory->IsInstanceOf("prefix_hash") && |
4229 | !options.memtable_factory->IsInstanceOf("hash_linkedlist")) { | |
4230 | fprintf(stderr, "Warning: plain table is used with %s\n", | |
4231 | options.memtable_factory->Name()); | |
7c673cae FG |
4232 | } |
4233 | ||
4234 | int bloom_bits_per_key = FLAGS_bloom_bits; | |
4235 | if (bloom_bits_per_key < 0) { | |
1e59de90 | 4236 | bloom_bits_per_key = PlainTableOptions().bloom_bits_per_key; |
7c673cae FG |
4237 | } |
4238 | ||
4239 | PlainTableOptions plain_table_options; | |
4240 | plain_table_options.user_key_len = FLAGS_key_size; | |
4241 | plain_table_options.bloom_bits_per_key = bloom_bits_per_key; | |
4242 | plain_table_options.hash_table_ratio = 0.75; | |
4243 | options.table_factory = std::shared_ptr<TableFactory>( | |
4244 | NewPlainTableFactory(plain_table_options)); | |
4245 | #else | |
4246 | fprintf(stderr, "Plain table is not supported in lite mode\n"); | |
4247 | exit(1); | |
4248 | #endif // ROCKSDB_LITE | |
4249 | } else if (FLAGS_use_cuckoo_table) { | |
4250 | #ifndef ROCKSDB_LITE | |
4251 | if (FLAGS_cuckoo_hash_ratio > 1 || FLAGS_cuckoo_hash_ratio < 0) { | |
4252 | fprintf(stderr, "Invalid cuckoo_hash_ratio\n"); | |
4253 | exit(1); | |
4254 | } | |
11fdf7f2 TL |
4255 | |
4256 | if (!FLAGS_mmap_read) { | |
4257 | fprintf(stderr, "cuckoo table format requires mmap read to operate\n"); | |
4258 | exit(1); | |
4259 | } | |
4260 | ||
f67539c2 | 4261 | ROCKSDB_NAMESPACE::CuckooTableOptions table_options; |
7c673cae FG |
4262 | table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio; |
4263 | table_options.identity_as_first_hash = FLAGS_identity_as_first_hash; | |
1e59de90 TL |
4264 | options.table_factory = |
4265 | std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options)); | |
7c673cae FG |
4266 | #else |
4267 | fprintf(stderr, "Cuckoo table is not supported in lite mode\n"); | |
4268 | exit(1); | |
4269 | #endif // ROCKSDB_LITE | |
4270 | } else { | |
4271 | BlockBasedTableOptions block_based_options; | |
1e59de90 TL |
4272 | block_based_options.checksum = |
4273 | static_cast<ChecksumType>(FLAGS_checksum_type); | |
7c673cae FG |
4274 | if (FLAGS_use_hash_search) { |
4275 | if (FLAGS_prefix_size == 0) { | |
4276 | fprintf(stderr, | |
1e59de90 | 4277 | "prefix_size not assigned when enable use_hash_search \n"); |
7c673cae FG |
4278 | exit(1); |
4279 | } | |
4280 | block_based_options.index_type = BlockBasedTableOptions::kHashSearch; | |
4281 | } else { | |
4282 | block_based_options.index_type = BlockBasedTableOptions::kBinarySearch; | |
4283 | } | |
11fdf7f2 | 4284 | if (FLAGS_partition_index_and_filters || FLAGS_partition_index) { |
20effc67 TL |
4285 | if (FLAGS_index_with_first_key) { |
4286 | fprintf(stderr, | |
4287 | "--index_with_first_key is not compatible with" | |
4288 | " partition index."); | |
4289 | } | |
11fdf7f2 TL |
4290 | if (FLAGS_use_hash_search) { |
4291 | fprintf(stderr, | |
4292 | "use_hash_search is incompatible with " | |
4293 | "partition index and is ignored"); | |
4294 | } | |
4295 | block_based_options.index_type = | |
4296 | BlockBasedTableOptions::kTwoLevelIndexSearch; | |
4297 | block_based_options.metadata_block_size = FLAGS_metadata_block_size; | |
4298 | if (FLAGS_partition_index_and_filters) { | |
4299 | block_based_options.partition_filters = true; | |
4300 | } | |
20effc67 TL |
4301 | } else if (FLAGS_index_with_first_key) { |
4302 | block_based_options.index_type = | |
4303 | BlockBasedTableOptions::kBinarySearchWithFirstKey; | |
4304 | } | |
4305 | BlockBasedTableOptions::IndexShorteningMode index_shortening = | |
4306 | block_based_options.index_shortening; | |
4307 | switch (FLAGS_index_shortening_mode) { | |
4308 | case 0: | |
4309 | index_shortening = | |
4310 | BlockBasedTableOptions::IndexShorteningMode::kNoShortening; | |
4311 | break; | |
4312 | case 1: | |
4313 | index_shortening = | |
4314 | BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators; | |
4315 | break; | |
4316 | case 2: | |
4317 | index_shortening = BlockBasedTableOptions::IndexShorteningMode:: | |
4318 | kShortenSeparatorsAndSuccessor; | |
4319 | break; | |
4320 | default: | |
4321 | fprintf(stderr, "Unknown key shortening mode\n"); | |
11fdf7f2 | 4322 | } |
20effc67 TL |
4323 | block_based_options.optimize_filters_for_memory = |
4324 | FLAGS_optimize_filters_for_memory; | |
4325 | block_based_options.index_shortening = index_shortening; | |
7c673cae FG |
4326 | if (cache_ == nullptr) { |
4327 | block_based_options.no_block_cache = true; | |
4328 | } | |
4329 | block_based_options.cache_index_and_filter_blocks = | |
4330 | FLAGS_cache_index_and_filter_blocks; | |
4331 | block_based_options.pin_l0_filter_and_index_blocks_in_cache = | |
4332 | FLAGS_pin_l0_filter_and_index_blocks_in_cache; | |
11fdf7f2 TL |
4333 | block_based_options.pin_top_level_index_and_filter = |
4334 | FLAGS_pin_top_level_index_and_filter; | |
7c673cae FG |
4335 | if (FLAGS_cache_high_pri_pool_ratio > 1e-6) { // > 0.0 + eps |
4336 | block_based_options.cache_index_and_filter_blocks_with_high_priority = | |
4337 | true; | |
4338 | } | |
1e59de90 TL |
4339 | if (FLAGS_cache_high_pri_pool_ratio + FLAGS_cache_low_pri_pool_ratio > |
4340 | 1.0) { | |
4341 | fprintf(stderr, | |
4342 | "Sum of high_pri_pool_ratio and low_pri_pool_ratio " | |
4343 | "cannot exceed 1.0.\n"); | |
4344 | } | |
7c673cae | 4345 | block_based_options.block_cache = cache_; |
1e59de90 TL |
4346 | block_based_options.cache_usage_options.options_overrides.insert( |
4347 | {CacheEntryRole::kCompressionDictionaryBuildingBuffer, | |
4348 | {/*.charged = */ FLAGS_charge_compression_dictionary_building_buffer | |
4349 | ? CacheEntryRoleOptions::Decision::kEnabled | |
4350 | : CacheEntryRoleOptions::Decision::kDisabled}}); | |
4351 | block_based_options.cache_usage_options.options_overrides.insert( | |
4352 | {CacheEntryRole::kFilterConstruction, | |
4353 | {/*.charged = */ FLAGS_charge_filter_construction | |
4354 | ? CacheEntryRoleOptions::Decision::kEnabled | |
4355 | : CacheEntryRoleOptions::Decision::kDisabled}}); | |
4356 | block_based_options.cache_usage_options.options_overrides.insert( | |
4357 | {CacheEntryRole::kBlockBasedTableReader, | |
4358 | {/*.charged = */ FLAGS_charge_table_reader | |
4359 | ? CacheEntryRoleOptions::Decision::kEnabled | |
4360 | : CacheEntryRoleOptions::Decision::kDisabled}}); | |
4361 | block_based_options.cache_usage_options.options_overrides.insert( | |
4362 | {CacheEntryRole::kFileMetadata, | |
4363 | {/*.charged = */ FLAGS_charge_file_metadata | |
4364 | ? CacheEntryRoleOptions::Decision::kEnabled | |
4365 | : CacheEntryRoleOptions::Decision::kDisabled}}); | |
4366 | block_based_options.cache_usage_options.options_overrides.insert( | |
4367 | {CacheEntryRole::kBlobCache, | |
4368 | {/*.charged = */ FLAGS_charge_blob_cache | |
4369 | ? CacheEntryRoleOptions::Decision::kEnabled | |
4370 | : CacheEntryRoleOptions::Decision::kDisabled}}); | |
7c673cae FG |
4371 | block_based_options.block_cache_compressed = compressed_cache_; |
4372 | block_based_options.block_size = FLAGS_block_size; | |
4373 | block_based_options.block_restart_interval = FLAGS_block_restart_interval; | |
4374 | block_based_options.index_block_restart_interval = | |
4375 | FLAGS_index_block_restart_interval; | |
11fdf7f2 TL |
4376 | block_based_options.format_version = |
4377 | static_cast<uint32_t>(FLAGS_format_version); | |
7c673cae | 4378 | block_based_options.read_amp_bytes_per_bit = FLAGS_read_amp_bytes_per_bit; |
11fdf7f2 TL |
4379 | block_based_options.enable_index_compression = |
4380 | FLAGS_enable_index_compression; | |
4381 | block_based_options.block_align = FLAGS_block_align; | |
1e59de90 TL |
4382 | block_based_options.whole_key_filtering = FLAGS_whole_key_filtering; |
4383 | block_based_options.max_auto_readahead_size = | |
4384 | FLAGS_max_auto_readahead_size; | |
4385 | block_based_options.initial_auto_readahead_size = | |
4386 | FLAGS_initial_auto_readahead_size; | |
4387 | block_based_options.num_file_reads_for_auto_readahead = | |
4388 | FLAGS_num_file_reads_for_auto_readahead; | |
4389 | BlockBasedTableOptions::PrepopulateBlockCache prepopulate_block_cache = | |
4390 | block_based_options.prepopulate_block_cache; | |
4391 | switch (FLAGS_prepopulate_block_cache) { | |
4392 | case 0: | |
4393 | prepopulate_block_cache = | |
4394 | BlockBasedTableOptions::PrepopulateBlockCache::kDisable; | |
4395 | break; | |
4396 | case 1: | |
4397 | prepopulate_block_cache = | |
4398 | BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly; | |
4399 | break; | |
4400 | default: | |
4401 | fprintf(stderr, "Unknown prepopulate block cache mode\n"); | |
4402 | } | |
4403 | block_based_options.prepopulate_block_cache = prepopulate_block_cache; | |
11fdf7f2 TL |
4404 | if (FLAGS_use_data_block_hash_index) { |
4405 | block_based_options.data_block_index_type = | |
f67539c2 | 4406 | ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinaryAndHash; |
11fdf7f2 TL |
4407 | } else { |
4408 | block_based_options.data_block_index_type = | |
f67539c2 | 4409 | ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinarySearch; |
11fdf7f2 TL |
4410 | } |
4411 | block_based_options.data_block_hash_table_util_ratio = | |
4412 | FLAGS_data_block_hash_table_util_ratio; | |
7c673cae FG |
4413 | if (FLAGS_read_cache_path != "") { |
4414 | #ifndef ROCKSDB_LITE | |
4415 | Status rc_status; | |
4416 | ||
4417 | // Read cache need to be provided with a the Logger, we will put all | |
4418 | // reac cache logs in the read cache path in a file named rc_LOG | |
4419 | rc_status = FLAGS_env->CreateDirIfMissing(FLAGS_read_cache_path); | |
4420 | std::shared_ptr<Logger> read_cache_logger; | |
4421 | if (rc_status.ok()) { | |
4422 | rc_status = FLAGS_env->NewLogger(FLAGS_read_cache_path + "/rc_LOG", | |
4423 | &read_cache_logger); | |
4424 | } | |
4425 | ||
4426 | if (rc_status.ok()) { | |
4427 | PersistentCacheConfig rc_cfg(FLAGS_env, FLAGS_read_cache_path, | |
4428 | FLAGS_read_cache_size, | |
4429 | read_cache_logger); | |
4430 | ||
4431 | rc_cfg.enable_direct_reads = FLAGS_read_cache_direct_read; | |
4432 | rc_cfg.enable_direct_writes = FLAGS_read_cache_direct_write; | |
4433 | rc_cfg.writer_qdepth = 4; | |
4434 | rc_cfg.writer_dispatch_size = 4 * 1024; | |
4435 | ||
4436 | auto pcache = std::make_shared<BlockCacheTier>(rc_cfg); | |
4437 | block_based_options.persistent_cache = pcache; | |
4438 | rc_status = pcache->Open(); | |
4439 | } | |
4440 | ||
4441 | if (!rc_status.ok()) { | |
4442 | fprintf(stderr, "Error initializing read cache, %s\n", | |
4443 | rc_status.ToString().c_str()); | |
4444 | exit(1); | |
4445 | } | |
4446 | #else | |
4447 | fprintf(stderr, "Read cache is not supported in LITE\n"); | |
4448 | exit(1); | |
4449 | ||
4450 | #endif | |
4451 | } | |
1e59de90 TL |
4452 | |
4453 | if (FLAGS_use_blob_cache) { | |
4454 | if (FLAGS_use_shared_block_and_blob_cache) { | |
4455 | options.blob_cache = cache_; | |
4456 | } else { | |
4457 | if (FLAGS_blob_cache_size > 0) { | |
4458 | LRUCacheOptions co; | |
4459 | co.capacity = FLAGS_blob_cache_size; | |
4460 | co.num_shard_bits = FLAGS_blob_cache_numshardbits; | |
4461 | co.memory_allocator = GetCacheAllocator(); | |
4462 | ||
4463 | options.blob_cache = NewLRUCache(co); | |
4464 | } else { | |
4465 | fprintf( | |
4466 | stderr, | |
4467 | "Unable to create a standalone blob cache if blob_cache_size " | |
4468 | "<= 0.\n"); | |
4469 | exit(1); | |
4470 | } | |
4471 | } | |
4472 | switch (FLAGS_prepopulate_blob_cache) { | |
4473 | case 0: | |
4474 | options.prepopulate_blob_cache = PrepopulateBlobCache::kDisable; | |
4475 | break; | |
4476 | case 1: | |
4477 | options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly; | |
4478 | break; | |
4479 | default: | |
4480 | fprintf(stderr, "Unknown prepopulate blob cache mode\n"); | |
4481 | exit(1); | |
4482 | } | |
4483 | ||
4484 | fprintf(stdout, | |
4485 | "Integrated BlobDB: blob cache enabled" | |
4486 | ", block and blob caches shared: %d", | |
4487 | FLAGS_use_shared_block_and_blob_cache); | |
4488 | if (!FLAGS_use_shared_block_and_blob_cache) { | |
4489 | fprintf(stdout, | |
4490 | ", blob cache size %" PRIu64 | |
4491 | ", blob cache num shard bits: %d", | |
4492 | FLAGS_blob_cache_size, FLAGS_blob_cache_numshardbits); | |
4493 | } | |
4494 | fprintf(stdout, ", blob cache prepopulated: %d\n", | |
4495 | FLAGS_prepopulate_blob_cache); | |
4496 | } else { | |
4497 | fprintf(stdout, "Integrated BlobDB: blob cache disabled\n"); | |
4498 | } | |
4499 | ||
7c673cae FG |
4500 | options.table_factory.reset( |
4501 | NewBlockBasedTableFactory(block_based_options)); | |
4502 | } | |
4503 | if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) { | |
4504 | if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() != | |
f67539c2 | 4505 | static_cast<unsigned int>(FLAGS_num_levels)) { |
7c673cae | 4506 | fprintf(stderr, "Insufficient number of fanouts specified %d\n", |
f67539c2 TL |
4507 | static_cast<int>( |
4508 | FLAGS_max_bytes_for_level_multiplier_additional_v.size())); | |
7c673cae FG |
4509 | exit(1); |
4510 | } | |
4511 | options.max_bytes_for_level_multiplier_additional = | |
1e59de90 | 4512 | FLAGS_max_bytes_for_level_multiplier_additional_v; |
7c673cae FG |
4513 | } |
4514 | options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger; | |
4515 | options.level0_file_num_compaction_trigger = | |
4516 | FLAGS_level0_file_num_compaction_trigger; | |
4517 | options.level0_slowdown_writes_trigger = | |
1e59de90 | 4518 | FLAGS_level0_slowdown_writes_trigger; |
7c673cae | 4519 | options.compression = FLAGS_compression_type_e; |
1e59de90 TL |
4520 | if (FLAGS_simulate_hybrid_fs_file != "") { |
4521 | options.bottommost_temperature = Temperature::kWarm; | |
4522 | } | |
4523 | options.preclude_last_level_data_seconds = | |
4524 | FLAGS_preclude_last_level_data_seconds; | |
4525 | options.preserve_internal_time_seconds = | |
4526 | FLAGS_preserve_internal_time_seconds; | |
494da23a | 4527 | options.sample_for_compression = FLAGS_sample_for_compression; |
7c673cae FG |
4528 | options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds; |
4529 | options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB; | |
4530 | options.max_total_wal_size = FLAGS_max_total_wal_size; | |
4531 | ||
4532 | if (FLAGS_min_level_to_compress >= 0) { | |
4533 | assert(FLAGS_min_level_to_compress <= FLAGS_num_levels); | |
4534 | options.compression_per_level.resize(FLAGS_num_levels); | |
4535 | for (int i = 0; i < FLAGS_min_level_to_compress; i++) { | |
4536 | options.compression_per_level[i] = kNoCompression; | |
4537 | } | |
1e59de90 | 4538 | for (int i = FLAGS_min_level_to_compress; i < FLAGS_num_levels; i++) { |
7c673cae FG |
4539 | options.compression_per_level[i] = FLAGS_compression_type_e; |
4540 | } | |
4541 | } | |
7c673cae FG |
4542 | options.soft_pending_compaction_bytes_limit = |
4543 | FLAGS_soft_pending_compaction_bytes_limit; | |
4544 | options.hard_pending_compaction_bytes_limit = | |
4545 | FLAGS_hard_pending_compaction_bytes_limit; | |
4546 | options.delayed_write_rate = FLAGS_delayed_write_rate; | |
4547 | options.allow_concurrent_memtable_write = | |
4548 | FLAGS_allow_concurrent_memtable_write; | |
1e59de90 TL |
4549 | options.experimental_mempurge_threshold = |
4550 | FLAGS_experimental_mempurge_threshold; | |
11fdf7f2 TL |
4551 | options.inplace_update_support = FLAGS_inplace_update_support; |
4552 | options.inplace_update_num_locks = FLAGS_inplace_update_num_locks; | |
7c673cae FG |
4553 | options.enable_write_thread_adaptive_yield = |
4554 | FLAGS_enable_write_thread_adaptive_yield; | |
11fdf7f2 | 4555 | options.enable_pipelined_write = FLAGS_enable_pipelined_write; |
f67539c2 | 4556 | options.unordered_write = FLAGS_unordered_write; |
7c673cae FG |
4557 | options.write_thread_max_yield_usec = FLAGS_write_thread_max_yield_usec; |
4558 | options.write_thread_slow_yield_usec = FLAGS_write_thread_slow_yield_usec; | |
7c673cae FG |
4559 | options.table_cache_numshardbits = FLAGS_table_cache_numshardbits; |
4560 | options.max_compaction_bytes = FLAGS_max_compaction_bytes; | |
4561 | options.disable_auto_compactions = FLAGS_disable_auto_compactions; | |
4562 | options.optimize_filters_for_hits = FLAGS_optimize_filters_for_hits; | |
1e59de90 TL |
4563 | options.paranoid_checks = FLAGS_paranoid_checks; |
4564 | options.force_consistency_checks = FLAGS_force_consistency_checks; | |
4565 | options.check_flush_compaction_key_order = | |
4566 | FLAGS_check_flush_compaction_key_order; | |
20effc67 | 4567 | options.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds; |
1e59de90 | 4568 | options.ttl = FLAGS_ttl_seconds; |
7c673cae FG |
4569 | // fill storage options |
4570 | options.advise_random_on_open = FLAGS_advise_random_on_open; | |
4571 | options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e; | |
4572 | options.use_adaptive_mutex = FLAGS_use_adaptive_mutex; | |
4573 | options.bytes_per_sync = FLAGS_bytes_per_sync; | |
4574 | options.wal_bytes_per_sync = FLAGS_wal_bytes_per_sync; | |
4575 | ||
4576 | // merge operator options | |
1e59de90 TL |
4577 | if (!FLAGS_merge_operator.empty()) { |
4578 | s = MergeOperator::CreateFromString(config_options, FLAGS_merge_operator, | |
4579 | &options.merge_operator); | |
4580 | if (!s.ok()) { | |
4581 | fprintf(stderr, "invalid merge operator[%s]: %s\n", | |
4582 | FLAGS_merge_operator.c_str(), s.ToString().c_str()); | |
4583 | exit(1); | |
4584 | } | |
7c673cae FG |
4585 | } |
4586 | options.max_successive_merges = FLAGS_max_successive_merges; | |
4587 | options.report_bg_io_stats = FLAGS_report_bg_io_stats; | |
4588 | ||
4589 | // set universal style compaction configurations, if applicable | |
4590 | if (FLAGS_universal_size_ratio != 0) { | |
4591 | options.compaction_options_universal.size_ratio = | |
1e59de90 | 4592 | FLAGS_universal_size_ratio; |
7c673cae FG |
4593 | } |
4594 | if (FLAGS_universal_min_merge_width != 0) { | |
4595 | options.compaction_options_universal.min_merge_width = | |
1e59de90 | 4596 | FLAGS_universal_min_merge_width; |
7c673cae FG |
4597 | } |
4598 | if (FLAGS_universal_max_merge_width != 0) { | |
4599 | options.compaction_options_universal.max_merge_width = | |
1e59de90 | 4600 | FLAGS_universal_max_merge_width; |
7c673cae FG |
4601 | } |
4602 | if (FLAGS_universal_max_size_amplification_percent != 0) { | |
4603 | options.compaction_options_universal.max_size_amplification_percent = | |
1e59de90 | 4604 | FLAGS_universal_max_size_amplification_percent; |
7c673cae FG |
4605 | } |
4606 | if (FLAGS_universal_compression_size_percent != -1) { | |
4607 | options.compaction_options_universal.compression_size_percent = | |
1e59de90 | 4608 | FLAGS_universal_compression_size_percent; |
7c673cae FG |
4609 | } |
4610 | options.compaction_options_universal.allow_trivial_move = | |
4611 | FLAGS_universal_allow_trivial_move; | |
1e59de90 TL |
4612 | options.compaction_options_universal.incremental = |
4613 | FLAGS_universal_incremental; | |
7c673cae FG |
4614 | if (FLAGS_thread_status_per_interval > 0) { |
4615 | options.enable_thread_tracking = true; | |
4616 | } | |
7c673cae | 4617 | |
20effc67 TL |
4618 | if (FLAGS_user_timestamp_size > 0) { |
4619 | if (FLAGS_user_timestamp_size != 8) { | |
4620 | fprintf(stderr, "Only 64 bits timestamps are supported.\n"); | |
4621 | exit(1); | |
4622 | } | |
1e59de90 TL |
4623 | options.comparator = test::BytewiseComparatorWithU64TsWrapper(); |
4624 | } | |
4625 | ||
4626 | options.allow_data_in_errors = FLAGS_allow_data_in_errors; | |
4627 | options.track_and_verify_wals_in_manifest = | |
4628 | FLAGS_track_and_verify_wals_in_manifest; | |
4629 | ||
4630 | // Integrated BlobDB | |
4631 | options.enable_blob_files = FLAGS_enable_blob_files; | |
4632 | options.min_blob_size = FLAGS_min_blob_size; | |
4633 | options.blob_file_size = FLAGS_blob_file_size; | |
4634 | options.blob_compression_type = | |
4635 | StringToCompressionType(FLAGS_blob_compression_type.c_str()); | |
4636 | options.enable_blob_garbage_collection = | |
4637 | FLAGS_enable_blob_garbage_collection; | |
4638 | options.blob_garbage_collection_age_cutoff = | |
4639 | FLAGS_blob_garbage_collection_age_cutoff; | |
4640 | options.blob_garbage_collection_force_threshold = | |
4641 | FLAGS_blob_garbage_collection_force_threshold; | |
4642 | options.blob_compaction_readahead_size = | |
4643 | FLAGS_blob_compaction_readahead_size; | |
4644 | options.blob_file_starting_level = FLAGS_blob_file_starting_level; | |
20effc67 | 4645 | |
7c673cae FG |
4646 | #ifndef ROCKSDB_LITE |
4647 | if (FLAGS_readonly && FLAGS_transaction_db) { | |
4648 | fprintf(stderr, "Cannot use readonly flag with transaction_db\n"); | |
4649 | exit(1); | |
4650 | } | |
f67539c2 TL |
4651 | if (FLAGS_use_secondary_db && |
4652 | (FLAGS_transaction_db || FLAGS_optimistic_transaction_db)) { | |
4653 | fprintf(stderr, "Cannot use use_secondary_db flag with transaction_db\n"); | |
4654 | exit(1); | |
4655 | } | |
7c673cae | 4656 | #endif // ROCKSDB_LITE |
1e59de90 TL |
4657 | options.memtable_protection_bytes_per_key = |
4658 | FLAGS_memtable_protection_bytes_per_key; | |
7c673cae FG |
4659 | } |
4660 | ||
  // Applies settings common to both configuration paths (OPTIONS file and
  // command-line flags), then opens the database(s). Called by Open() after
  // either InitializeOptionsFromFile() or InitializeOptionsFromFlags().
  void InitializeOptionsGeneral(Options* opts) {
    // Be careful about what is set here to avoid accidentally overwriting
    // settings already configured by OPTIONS file. Only configure settings that
    // are needed for the benchmark to run, settings for shared objects that
    // were not configured already, settings that require dynamically invoking
    // APIs, and settings for the benchmark itself.
    Options& options = *opts;

    // Always set these since they are harmless when not needed and prevent
    // a guaranteed failure when they are needed.
    options.create_missing_column_families = true;
    options.create_if_missing = true;

    // Only install the benchmark-wide statistics object if the OPTIONS file
    // did not already configure one.
    if (options.statistics == nullptr) {
      options.statistics = dbstats;
    }

    // nullptr if the configured table factory is not block-based; in that
    // case the block below is skipped entirely.
    auto table_options =
        options.table_factory->GetOptions<BlockBasedTableOptions>();
    if (table_options != nullptr) {
      if (FLAGS_cache_size > 0) {
        // This violates this function's rules on when to set options. But we
        // have to do it because the case of unconfigured block cache in OPTIONS
        // file is indistinguishable (it is sanitized to 8MB by this point, not
        // nullptr), and our regression tests assume this will be the shared
        // block cache, even with OPTIONS file provided.
        table_options->block_cache = cache_;
      }
      if (table_options->filter_policy == nullptr) {
        if (FLAGS_bloom_bits < 0) {
          // Negative flag: fall back to whatever the library default is.
          table_options->filter_policy = BlockBasedTableOptions().filter_policy;
        } else if (FLAGS_bloom_bits == 0) {
          // Zero explicitly disables the filter.
          table_options->filter_policy.reset();
        } else {
          table_options->filter_policy.reset(
              FLAGS_use_ribbon_filter ? NewRibbonFilterPolicy(FLAGS_bloom_bits)
                                      : NewBloomFilterPolicy(FLAGS_bloom_bits));
        }
      }
    }

    // Row cache: created here (not in the flags path) because it requires
    // dynamically invoking NewLRUCache().
    if (options.row_cache == nullptr) {
      if (FLAGS_row_cache_size) {
        if (FLAGS_cache_numshardbits >= 1) {
          options.row_cache =
              NewLRUCache(FLAGS_row_cache_size, FLAGS_cache_numshardbits);
        } else {
          options.row_cache = NewLRUCache(FLAGS_row_cache_size);
        }
      }
    }

    // Respect a custom Env from the OPTIONS file; otherwise use the one
    // selected by command-line flags.
    if (options.env == Env::Default()) {
      options.env = FLAGS_env;
    }
    if (FLAGS_enable_io_prio) {
      options.env->LowerThreadPoolIOPriority(Env::LOW);
      options.env->LowerThreadPoolIOPriority(Env::HIGH);
    }
    if (FLAGS_enable_cpu_prio) {
      options.env->LowerThreadPoolCPUPriority(Env::LOW);
      options.env->LowerThreadPoolCPUPriority(Env::HIGH);
    }

    // Seed the sine-modulated write rate with its value at t = 0.
    if (FLAGS_sine_write_rate) {
      FLAGS_benchmark_write_rate_limit = static_cast<uint64_t>(SineRate(0));
    }

    if (options.rate_limiter == nullptr) {
      if (FLAGS_rate_limiter_bytes_per_sec > 0) {
        options.rate_limiter.reset(NewGenericRateLimiter(
            FLAGS_rate_limiter_bytes_per_sec,
            FLAGS_rate_limiter_refill_period_us, 10 /* fairness */,
            // TODO: replace this with a more general FLAG for deciding
            // RateLimiter::Mode as now we also rate-limit foreground reads e.g,
            // Get()/MultiGet()
            FLAGS_rate_limit_bg_reads ? RateLimiter::Mode::kReadsOnly
                                      : RateLimiter::Mode::kWritesOnly,
            FLAGS_rate_limiter_auto_tuned));
      }
    }

    // Benchmark's own event listener is always attached.
    options.listeners.emplace_back(listener_);

    if (options.file_checksum_gen_factory == nullptr) {
      if (FLAGS_file_checksum) {
        options.file_checksum_gen_factory.reset(
            new FileChecksumGenCrc32cFactory());
      }
    }

    // Options are now final: open either a single DB or FLAGS_num_multi_db
    // independent DBs, each with its own path (and wal_dir, if one was set).
    if (FLAGS_num_multi_db <= 1) {
      OpenDb(options, FLAGS_db, &db_);
    } else {
      multi_dbs_.clear();
      multi_dbs_.resize(FLAGS_num_multi_db);
      auto wal_dir = options.wal_dir;
      for (int i = 0; i < FLAGS_num_multi_db; i++) {
        if (!wal_dir.empty()) {
          options.wal_dir = GetPathForMultiple(wal_dir, i);
        }
        OpenDb(options, GetPathForMultiple(FLAGS_db, i), &multi_dbs_[i]);
      }
      // Restore the original wal_dir after the per-DB overrides above.
      options.wal_dir = wal_dir;
    }

    // KeepFilter is a noop filter, this can be used to test compaction filter
    if (options.compaction_filter == nullptr) {
      if (FLAGS_use_keep_filter) {
        options.compaction_filter = new KeepFilter();
        fprintf(stdout, "A noop compaction filter is used\n");
      }
    }

    // Pre-load every existing key so benchmarks can draw from real data.
    // NOTE: overrides FLAGS_num with the actual key count.
    if (FLAGS_use_existing_keys) {
      // Only work on single database
      assert(db_.db != nullptr);
      ReadOptions read_opts;  // before read_options_ initialized
      read_opts.total_order_seek = true;
      Iterator* iter = db_.db->NewIterator(read_opts);
      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
        keys_.emplace_back(iter->key().ToString());
      }
      delete iter;
      FLAGS_num = keys_.size();
    }
  }
4788 | ||
4789 | void Open(Options* opts) { | |
4790 | if (!InitializeOptionsFromFile(opts)) { | |
4791 | InitializeOptionsFromFlags(opts); | |
4792 | } | |
4793 | ||
4794 | InitializeOptionsGeneral(opts); | |
4795 | } | |
4796 | ||
  // Opens a single DB at db_name with the given (by-value, locally mutable)
  // options, selecting the open path from the mode flags: read-only,
  // optimistic/pessimistic transactions, stacked BlobDB, secondary instance,
  // or a plain DB::Open. Exits the process on any open failure.
  void OpenDb(Options options, const std::string& db_name,
              DBWithColumnFamilies* db) {
    // Timestamp before the open so the elapsed time can be reported below.
    uint64_t open_start = FLAGS_report_open_timing ? FLAGS_env->NowNanos() : 0;
    Status s;
    // Open with column families if necessary.
    if (FLAGS_num_column_families > 1) {
      // Only the first num_hot column families are created up front; the
      // rest of db->cfh is sized but left for later creation.
      size_t num_hot = FLAGS_num_column_families;
      if (FLAGS_num_hot_column_families > 0 &&
          FLAGS_num_hot_column_families < FLAGS_num_column_families) {
        num_hot = FLAGS_num_hot_column_families;
      } else {
        FLAGS_num_hot_column_families = FLAGS_num_column_families;
      }
      std::vector<ColumnFamilyDescriptor> column_families;
      for (size_t i = 0; i < num_hot; i++) {
        column_families.push_back(ColumnFamilyDescriptor(
            ColumnFamilyName(i), ColumnFamilyOptions(options)));
      }
      // Optional comma-separated per-CF access percentages; must sum to 100
      // and provide exactly one entry per hot column family.
      std::vector<int> cfh_idx_to_prob;
      if (!FLAGS_column_family_distribution.empty()) {
        std::stringstream cf_prob_stream(FLAGS_column_family_distribution);
        std::string cf_prob;
        int sum = 0;
        while (std::getline(cf_prob_stream, cf_prob, ',')) {
          cfh_idx_to_prob.push_back(std::stoi(cf_prob));
          sum += cfh_idx_to_prob.back();
        }
        if (sum != 100) {
          fprintf(stderr, "column_family_distribution items must sum to 100\n");
          exit(1);
        }
        if (cfh_idx_to_prob.size() != num_hot) {
          fprintf(stderr,
                  "got %" ROCKSDB_PRIszt
                  " column_family_distribution items; expected "
                  "%" ROCKSDB_PRIszt "\n",
                  cfh_idx_to_prob.size(), num_hot);
          exit(1);
        }
      }
#ifndef ROCKSDB_LITE
      if (FLAGS_readonly) {
        s = DB::OpenForReadOnly(options, db_name, column_families, &db->cfh,
                                &db->db);
      } else if (FLAGS_optimistic_transaction_db) {
        s = OptimisticTransactionDB::Open(options, db_name, column_families,
                                          &db->cfh, &db->opt_txn_db);
        if (s.ok()) {
          db->db = db->opt_txn_db->GetBaseDB();
        }
      } else if (FLAGS_transaction_db) {
        TransactionDB* ptr;
        TransactionDBOptions txn_db_options;
        // unordered_write requires two write queues plus WRITE_PREPARED
        // policy with concurrency control skipped.
        if (options.unordered_write) {
          options.two_write_queues = true;
          txn_db_options.skip_concurrency_control = true;
          txn_db_options.write_policy = WRITE_PREPARED;
        }
        s = TransactionDB::Open(options, txn_db_options, db_name,
                                column_families, &db->cfh, &ptr);
        if (s.ok()) {
          db->db = ptr;
        }
      } else {
        s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
      }
#else
      s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
#endif  // ROCKSDB_LITE
      // Reserve handle slots for all CFs, but record how many exist now.
      db->cfh.resize(FLAGS_num_column_families);
      db->num_created = num_hot;
      db->num_hot = num_hot;
      db->cfh_idx_to_prob = std::move(cfh_idx_to_prob);
#ifndef ROCKSDB_LITE
    } else if (FLAGS_readonly) {
      s = DB::OpenForReadOnly(options, db_name, &db->db);
    } else if (FLAGS_optimistic_transaction_db) {
      s = OptimisticTransactionDB::Open(options, db_name, &db->opt_txn_db);
      if (s.ok()) {
        db->db = db->opt_txn_db->GetBaseDB();
      }
    } else if (FLAGS_transaction_db) {
      TransactionDB* ptr = nullptr;
      TransactionDBOptions txn_db_options;
      if (options.unordered_write) {
        options.two_write_queues = true;
        txn_db_options.skip_concurrency_control = true;
        txn_db_options.write_policy = WRITE_PREPARED;
      }
      // Logger is created explicitly before the open on this path — keep
      // this ordering (TransactionDB::Open runs only if it succeeds).
      s = CreateLoggerFromOptions(db_name, options, &options.info_log);
      if (s.ok()) {
        s = TransactionDB::Open(options, txn_db_options, db_name, &ptr);
      }
      if (s.ok()) {
        db->db = ptr;
      }
    } else if (FLAGS_use_blob_db) {
      // Stacked BlobDB
      blob_db::BlobDBOptions blob_db_options;
      blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
      blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff;
      blob_db_options.is_fifo = FLAGS_blob_db_is_fifo;
      blob_db_options.max_db_size = FLAGS_blob_db_max_db_size;
      blob_db_options.ttl_range_secs = FLAGS_blob_db_ttl_range_secs;
      blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size;
      blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
      blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
      blob_db_options.compression = FLAGS_blob_db_compression_type_e;
      blob_db::BlobDB* ptr = nullptr;
      s = blob_db::BlobDB::Open(options, blob_db_options, db_name, &ptr);
      if (s.ok()) {
        db->db = ptr;
      }
    } else if (FLAGS_use_secondary_db) {
      if (FLAGS_secondary_path.empty()) {
        std::string default_secondary_path;
        FLAGS_env->GetTestDirectory(&default_secondary_path);
        default_secondary_path += "/dbbench_secondary";
        FLAGS_secondary_path = default_secondary_path;
      }
      s = DB::OpenAsSecondary(options, db_name, FLAGS_secondary_path, &db->db);
      // Background thread periodically catches the secondary up with the
      // primary until secondary_update_stopped_ is set elsewhere.
      // NOTE(review): the lambda captures `this` and `db` — it assumes the
      // Benchmark object and the DBWithColumnFamilies outlive the thread.
      if (s.ok() && FLAGS_secondary_update_interval > 0) {
        secondary_update_thread_.reset(new port::Thread(
            [this](int interval, DBWithColumnFamilies* _db) {
              while (0 == secondary_update_stopped_.load(
                              std::memory_order_relaxed)) {
                Status secondary_update_status =
                    _db->db->TryCatchUpWithPrimary();
                if (!secondary_update_status.ok()) {
                  fprintf(stderr, "Failed to catch up with primary: %s\n",
                          secondary_update_status.ToString().c_str());
                  break;
                }
                ++secondary_db_updates_;
                FLAGS_env->SleepForMicroseconds(interval * 1000000);
              }
            },
            FLAGS_secondary_update_interval, db));
      }
#endif  // ROCKSDB_LITE
    } else {
      s = DB::Open(options, db_name, &db->db);
    }
    if (FLAGS_report_open_timing) {
      std::cout << "OpenDb: "
                << (FLAGS_env->NowNanos() - open_start) / 1000000.0
                << " milliseconds\n";
    }
    // Any open failure is fatal for the benchmark.
    if (!s.ok()) {
      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
      exit(1);
    }
  }
4950 | ||
  // Key-ordering modes for the write benchmarks:
  //   RANDOM        - key indices drawn uniformly at random (repeats possible)
  //   SEQUENTIAL    - monotonically increasing key indices
  //   UNIQUE_RANDOM - a shuffled permutation; each index produced exactly once
  enum WriteMode { RANDOM, SEQUENTIAL, UNIQUE_RANDOM };
7c673cae FG |
4952 | |
4953 | void WriteSeqDeterministic(ThreadState* thread) { | |
4954 | DoDeterministicCompact(thread, open_options_.compaction_style, SEQUENTIAL); | |
4955 | } | |
4956 | ||
4957 | void WriteUniqueRandomDeterministic(ThreadState* thread) { | |
4958 | DoDeterministicCompact(thread, open_options_.compaction_style, | |
4959 | UNIQUE_RANDOM); | |
4960 | } | |
4961 | ||
1e59de90 | 4962 | void WriteSeq(ThreadState* thread) { DoWrite(thread, SEQUENTIAL); } |
7c673cae | 4963 | |
1e59de90 | 4964 | void WriteRandom(ThreadState* thread) { DoWrite(thread, RANDOM); } |
7c673cae FG |
4965 | |
4966 | void WriteUniqueRandom(ThreadState* thread) { | |
4967 | DoWrite(thread, UNIQUE_RANDOM); | |
4968 | } | |
4969 | ||
4970 | class KeyGenerator { | |
4971 | public: | |
11fdf7f2 TL |
4972 | KeyGenerator(Random64* rand, WriteMode mode, uint64_t num, |
4973 | uint64_t /*num_per_set*/ = 64 * 1024) | |
4974 | : rand_(rand), mode_(mode), num_(num), next_(0) { | |
7c673cae FG |
4975 | if (mode_ == UNIQUE_RANDOM) { |
4976 | // NOTE: if memory consumption of this approach becomes a concern, | |
4977 | // we can either break it into pieces and only random shuffle a section | |
4978 | // each time. Alternatively, use a bit map implementation | |
4979 | // (https://reviews.facebook.net/differential/diff/54627/) | |
4980 | values_.resize(num_); | |
4981 | for (uint64_t i = 0; i < num_; ++i) { | |
4982 | values_[i] = i; | |
4983 | } | |
20effc67 | 4984 | RandomShuffle(values_.begin(), values_.end(), |
1e59de90 | 4985 | static_cast<uint32_t>(seed_base)); |
7c673cae FG |
4986 | } |
4987 | } | |
4988 | ||
4989 | uint64_t Next() { | |
4990 | switch (mode_) { | |
4991 | case SEQUENTIAL: | |
4992 | return next_++; | |
4993 | case RANDOM: | |
4994 | return rand_->Next() % num_; | |
4995 | case UNIQUE_RANDOM: | |
11fdf7f2 | 4996 | assert(next_ < num_); |
7c673cae FG |
4997 | return values_[next_++]; |
4998 | } | |
4999 | assert(false); | |
5000 | return std::numeric_limits<uint64_t>::max(); | |
5001 | } | |
5002 | ||
1e59de90 TL |
5003 | // Only available for UNIQUE_RANDOM mode. |
5004 | uint64_t Fetch(uint64_t index) { | |
5005 | assert(mode_ == UNIQUE_RANDOM); | |
5006 | assert(index < values_.size()); | |
5007 | return values_[index]; | |
5008 | } | |
5009 | ||
7c673cae FG |
5010 | private: |
5011 | Random64* rand_; | |
5012 | WriteMode mode_; | |
5013 | const uint64_t num_; | |
5014 | uint64_t next_; | |
5015 | std::vector<uint64_t> values_; | |
5016 | }; | |
5017 | ||
1e59de90 | 5018 | DB* SelectDB(ThreadState* thread) { return SelectDBWithCfh(thread)->db; } |
7c673cae FG |
5019 | |
5020 | DBWithColumnFamilies* SelectDBWithCfh(ThreadState* thread) { | |
5021 | return SelectDBWithCfh(thread->rand.Next()); | |
5022 | } | |
5023 | ||
5024 | DBWithColumnFamilies* SelectDBWithCfh(uint64_t rand_int) { | |
5025 | if (db_.db != nullptr) { | |
5026 | return &db_; | |
1e59de90 | 5027 | } else { |
7c673cae FG |
5028 | return &multi_dbs_[rand_int % multi_dbs_.size()]; |
5029 | } | |
5030 | } | |
5031 | ||
11fdf7f2 | 5032 | double SineRate(double x) { |
1e59de90 | 5033 | return FLAGS_sine_a * sin((FLAGS_sine_b * x) + FLAGS_sine_c) + FLAGS_sine_d; |
11fdf7f2 TL |
5034 | } |
5035 | ||
7c673cae FG |
  // Core write loop shared by fillseq/fillrandom/filluniquerandom and
  // related benchmarks. Drives one KeyGenerator per DB, groups writes into
  // WriteBatches of entries_per_batch_ entries, and optionally mixes in:
  //  - overwrites of recently-inserted keys (--overwrite_probability),
  //  - a disposable/persistent entry simulation
  //    (--disposable_entries_batch_size), and
  //  - range tombstones (--writes_per_range_tombstone).
  // For stacked BlobDB, entries are written directly via Put/PutWithTTL
  // instead of through the batch.
  void DoWrite(ThreadState* thread, WriteMode write_mode) {
    // RANDOM workloads can be time-bounded; other modes run to completion.
    const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0;
    const int64_t num_ops = writes_ == 0 ? num_ : writes_;

    size_t num_key_gens = 1;
    if (db_.db == nullptr) {
      num_key_gens = multi_dbs_.size();
    }
    std::vector<std::unique_ptr<KeyGenerator>> key_gens(num_key_gens);
    int64_t max_ops = num_ops * num_key_gens;
    int64_t ops_per_stage = max_ops;
    if (FLAGS_num_column_families > 1 && FLAGS_num_hot_column_families > 0) {
      ops_per_stage = (max_ops - 1) / (FLAGS_num_column_families /
                                       FLAGS_num_hot_column_families) +
                      1;
    }

    Duration duration(test_duration, max_ops, ops_per_stage);
    // Extra headroom so range-tombstone begin keys drawn via Next() do not
    // exhaust a UNIQUE_RANDOM generator before num_ data keys are written.
    const uint64_t num_per_key_gen = num_ + max_num_range_tombstones_;
    for (size_t i = 0; i < num_key_gens; i++) {
      key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode,
                                         num_per_key_gen, ops_per_stage));
    }

    if (num_ != FLAGS_num) {
      char msg[100];
      snprintf(msg, sizeof(msg), "(%" PRIu64 " ops)", num_);
      thread->stats.AddMessage(msg);
    }

    RandomGenerator gen;
    WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
                     FLAGS_write_batch_protection_bytes_per_key,
                     user_timestamp_size_);
    Status s;
    int64_t bytes = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<const char[]> begin_key_guard;
    Slice begin_key = AllocateKey(&begin_key_guard);
    std::unique_ptr<const char[]> end_key_guard;
    Slice end_key = AllocateKey(&end_key_guard);
    // p is the effective overwrite probability (clamped to [0.0, 1.0]).
    double p = 0.0;
    uint64_t num_overwrites = 0, num_unique_keys = 0, num_selective_deletes = 0;
    // If user set overwrite_probability flag,
    // check if value is in [0.0,1.0].
    if (FLAGS_overwrite_probability > 0.0) {
      p = FLAGS_overwrite_probability > 1.0 ? 1.0 : FLAGS_overwrite_probability;
      // If overwrite set by user, and UNIQUE_RANDOM mode on,
      // the overwrite_window_size must be > 0.
      if (write_mode == UNIQUE_RANDOM && FLAGS_overwrite_window_size == 0) {
        fprintf(stderr,
                "Overwrite_window_size must be strictly greater than 0.\n");
        ErrorExit();
      }
    }

    // Default_random_engine provides slightly
    // improved throughput over mt19937.
    std::default_random_engine overwrite_gen{
        static_cast<unsigned int>(seed_base)};
    std::bernoulli_distribution overwrite_decider(p);

    // Inserted key window is filled with the last N
    // keys previously inserted into the DB (with
    // N=FLAGS_overwrite_window_size).
    // We use a deque struct because:
    // - random access is O(1)
    // - insertion/removal at beginning/end is also O(1).
    std::deque<int64_t> inserted_key_window;
    Random64 reservoir_id_gen(seed_base);

    // --- Variables used in disposable/persistent keys simulation:
    // The following variables are used when
    // disposable_entries_batch_size is >0. We simulate a workload
    // where the following sequence is repeated multiple times:
    // "A set of keys S1 is inserted ('disposable entries'), then after
    // some delay another set of keys S2 is inserted ('persistent entries')
    // and the first set of keys S1 is deleted. S2 artificially represents
    // the insertion of hypothetical results from some undefined computation
    // done on the first set of keys S1. The next sequence can start as soon
    // as the last disposable entry in the set S1 of this sequence is
    // inserted, if the delay is non negligible"
    bool skip_for_loop = false, is_disposable_entry = true;
    std::vector<uint64_t> disposable_entries_index(num_key_gens, 0);
    std::vector<uint64_t> persistent_ent_and_del_index(num_key_gens, 0);
    const uint64_t kNumDispAndPersEntries =
        FLAGS_disposable_entries_batch_size +
        FLAGS_persistent_entries_batch_size;
    if (kNumDispAndPersEntries > 0) {
      if ((write_mode != UNIQUE_RANDOM) || (writes_per_range_tombstone_ > 0) ||
          (p > 0.0)) {
        fprintf(
            stderr,
            "Disposable/persistent deletes are not compatible with overwrites "
            "and DeleteRanges; and are only supported in filluniquerandom.\n");
        ErrorExit();
      }
      if (FLAGS_disposable_entries_value_size < 0 ||
          FLAGS_persistent_entries_value_size < 0) {
        fprintf(
            stderr,
            "disposable_entries_value_size and persistent_entries_value_size"
            "have to be positive.\n");
        ErrorExit();
      }
    }
    Random rnd_disposable_entry(static_cast<uint32_t>(seed_base));
    std::string random_value;
    // Queue that stores scheduled timestamp of disposable entries deletes,
    // along with starting index of disposable entry keys to delete.
    std::vector<std::queue<std::pair<uint64_t, uint64_t>>> disposable_entries_q(
        num_key_gens);
    // --- End of variables used in disposable/persistent keys simulation.

    std::vector<std::unique_ptr<const char[]>> expanded_key_guards;
    std::vector<Slice> expanded_keys;
    if (FLAGS_expand_range_tombstones) {
      expanded_key_guards.resize(range_tombstone_width_);
      for (auto& expanded_key_guard : expanded_key_guards) {
        expanded_keys.emplace_back(AllocateKey(&expanded_key_guard));
      }
    }

    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

    int64_t stage = 0;
    int64_t num_written = 0;
    // Boundary at which a SEQUENTIAL multi-DB load advances to the next DB.
    int64_t next_seq_db_at = num_ops;
    size_t id = 0;
    int64_t num_range_deletions = 0;

    while ((num_per_key_gen != 0) && !duration.Done(entries_per_batch_)) {
      if (duration.GetStage() != stage) {
        stage = duration.GetStage();
        if (db_.db != nullptr) {
          db_.CreateNewCf(open_options_, stage);
        } else {
          for (auto& db : multi_dbs_) {
            db.CreateNewCf(open_options_, stage);
          }
        }
      }

      if (write_mode != SEQUENTIAL) {
        id = thread->rand.Next() % num_key_gens;
      } else {
        // When doing a sequential load with multiple databases, load them in
        // order rather than all at the same time to avoid:
        // 1) long delays between flushing memtables
        // 2) flushing memtables for all of them at the same point in time
        // 3) not putting the same number of keys in each database
        if (num_written >= next_seq_db_at) {
          next_seq_db_at += num_ops;
          id++;
          if (id >= num_key_gens) {
            fprintf(stderr, "Logic error. Filled all databases\n");
            ErrorExit();
          }
        }
      }
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(id);

      batch.Clear();
      int64_t batch_bytes = 0;

      for (int64_t j = 0; j < entries_per_batch_; j++) {
        int64_t rand_num = 0;
        if ((write_mode == UNIQUE_RANDOM) && (p > 0.0)) {
          // With probability p, overwrite a key drawn from the window of
          // recently-inserted keys; otherwise insert a fresh unique key and
          // slide it into the window.
          if ((inserted_key_window.size() > 0) &&
              overwrite_decider(overwrite_gen)) {
            num_overwrites++;
            rand_num = inserted_key_window[reservoir_id_gen.Next() %
                                           inserted_key_window.size()];
          } else {
            num_unique_keys++;
            rand_num = key_gens[id]->Next();
            if (inserted_key_window.size() < FLAGS_overwrite_window_size) {
              inserted_key_window.push_back(rand_num);
            } else {
              inserted_key_window.pop_front();
              inserted_key_window.push_back(rand_num);
            }
          }
        } else if (kNumDispAndPersEntries > 0) {
          // Check if queue is non-empty and if we need to insert
          // 'persistent' KV entries (KV entries that are never deleted)
          // and delete disposable entries previously inserted.
          if (!disposable_entries_q[id].empty() &&
              (disposable_entries_q[id].front().first <
               FLAGS_env->NowMicros())) {
            // If we need to perform a "merge op" pattern,
            // we first write all the persistent KV entries not targeted
            // by deletes, and then we write the disposable entries deletes.
            if (persistent_ent_and_del_index[id] <
                FLAGS_persistent_entries_batch_size) {
              // Generate key to insert.
              rand_num =
                  key_gens[id]->Fetch(disposable_entries_q[id].front().second +
                                      FLAGS_disposable_entries_batch_size +
                                      persistent_ent_and_del_index[id]);
              persistent_ent_and_del_index[id]++;
              is_disposable_entry = false;
              skip_for_loop = false;
            } else if (persistent_ent_and_del_index[id] <
                       kNumDispAndPersEntries) {
              // Find key of the entry to delete.
              rand_num =
                  key_gens[id]->Fetch(disposable_entries_q[id].front().second +
                                      (persistent_ent_and_del_index[id] -
                                       FLAGS_persistent_entries_batch_size));
              persistent_ent_and_del_index[id]++;
              GenerateKeyFromInt(rand_num, FLAGS_num, &key);
              // For the delete operation, everything happens here and we
              // skip the rest of the for-loop, which is designed for
              // inserts.
              if (FLAGS_num_column_families <= 1) {
                batch.Delete(key);
              } else {
                // We use same rand_num as seed for key and column family so
                // that we can deterministically find the cfh corresponding to a
                // particular key while reading the key.
                batch.Delete(db_with_cfh->GetCfh(rand_num), key);
              }
              // A delete only includes Key+Timestamp (no value).
              batch_bytes += key_size_ + user_timestamp_size_;
              bytes += key_size_ + user_timestamp_size_;
              num_selective_deletes++;
              // Skip rest of the for-loop (j=0, j<entries_per_batch_,j++).
              skip_for_loop = true;
            } else {
              assert(false);  // should never reach this point.
            }
            // If disposable_entries_q needs to be updated (ie: when a selective
            // insert+delete was successfully completed, pop the job out of the
            // queue).
            if (!disposable_entries_q[id].empty() &&
                (disposable_entries_q[id].front().first <
                 FLAGS_env->NowMicros()) &&
                persistent_ent_and_del_index[id] == kNumDispAndPersEntries) {
              disposable_entries_q[id].pop();
              persistent_ent_and_del_index[id] = 0;
            }

            // If we are deleting disposable entries, skip the rest of the
            // for-loop since there is no key-value inserts at this moment in
            // time.
            if (skip_for_loop) {
              continue;
            }

          }
          // If no job is in the queue, then we keep inserting disposable KV
          // entries that will be deleted later by a series of deletes.
          else {
            rand_num = key_gens[id]->Fetch(disposable_entries_index[id]);
            disposable_entries_index[id]++;
            is_disposable_entry = true;
            if ((disposable_entries_index[id] %
                 FLAGS_disposable_entries_batch_size) == 0) {
              // Skip the persistent KV entries inserts for now
              disposable_entries_index[id] +=
                  FLAGS_persistent_entries_batch_size;
            }
          }
        } else {
          rand_num = key_gens[id]->Next();
        }
        GenerateKeyFromInt(rand_num, FLAGS_num, &key);
        Slice val;
        if (kNumDispAndPersEntries > 0) {
          random_value = rnd_disposable_entry.RandomString(
              is_disposable_entry ? FLAGS_disposable_entries_value_size
                                  : FLAGS_persistent_entries_value_size);
          val = Slice(random_value);
          num_unique_keys++;
        } else {
          val = gen.Generate();
        }
        if (use_blob_db_) {
#ifndef ROCKSDB_LITE
          // Stacked BlobDB
          blob_db::BlobDB* blobdb =
              static_cast<blob_db::BlobDB*>(db_with_cfh->db);
          if (FLAGS_blob_db_max_ttl_range > 0) {
            // NOTE(review): TTL uses rand(), not the per-thread generator,
            // so it is not deterministic across runs — confirm intended.
            int ttl = rand() % FLAGS_blob_db_max_ttl_range;
            s = blobdb->PutWithTTL(write_options_, key, val, ttl);
          } else {
            s = blobdb->Put(write_options_, key, val);
          }
#endif  //  ROCKSDB_LITE
        } else if (FLAGS_num_column_families <= 1) {
          batch.Put(key, val);
        } else {
          // We use same rand_num as seed for key and column family so that we
          // can deterministically find the cfh corresponding to a particular
          // key while reading the key.
          batch.Put(db_with_cfh->GetCfh(rand_num), key, val);
        }
        batch_bytes += val.size() + key_size_ + user_timestamp_size_;
        bytes += val.size() + key_size_ + user_timestamp_size_;
        ++num_written;

        // If all disposable entries have been inserted, then we need to
        // add in the job queue a call for 'persistent entry insertions +
        // disposable entry deletions'.
        if (kNumDispAndPersEntries > 0 && is_disposable_entry &&
            ((disposable_entries_index[id] % kNumDispAndPersEntries) == 0)) {
          // Queue contains [timestamp, starting_idx],
          // timestamp = current_time + delay (minimum absolute time when to
          // start inserting the selective deletes) starting_idx = index in the
          // keygen of the rand_num to generate the key of the first KV entry to
          // delete (= key of the first selective delete).
          disposable_entries_q[id].push(std::make_pair(
              FLAGS_env->NowMicros() +
                  FLAGS_disposable_entries_delete_delay /* timestamp */,
              disposable_entries_index[id] - kNumDispAndPersEntries
              /*starting idx*/));
        }
        if (writes_per_range_tombstone_ > 0 &&
            num_written > writes_before_delete_range_ &&
            (num_written - writes_before_delete_range_) /
                    writes_per_range_tombstone_ <=
                max_num_range_tombstones_ &&
            (num_written - writes_before_delete_range_) %
                    writes_per_range_tombstone_ ==
                0) {
          num_range_deletions++;
          int64_t begin_num = key_gens[id]->Next();
          if (FLAGS_expand_range_tombstones) {
            // Emulate the range tombstone as point deletes over its width.
            for (int64_t offset = 0; offset < range_tombstone_width_;
                 ++offset) {
              GenerateKeyFromInt(begin_num + offset, FLAGS_num,
                                 &expanded_keys[offset]);
              if (use_blob_db_) {
#ifndef ROCKSDB_LITE
                // Stacked BlobDB
                s = db_with_cfh->db->Delete(write_options_,
                                            expanded_keys[offset]);
#endif  //  ROCKSDB_LITE
              } else if (FLAGS_num_column_families <= 1) {
                batch.Delete(expanded_keys[offset]);
              } else {
                batch.Delete(db_with_cfh->GetCfh(rand_num),
                             expanded_keys[offset]);
              }
            }
          } else {
            GenerateKeyFromInt(begin_num, FLAGS_num, &begin_key);
            GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num,
                               &end_key);
            if (use_blob_db_) {
#ifndef ROCKSDB_LITE
              // Stacked BlobDB
              s = db_with_cfh->db->DeleteRange(
                  write_options_, db_with_cfh->db->DefaultColumnFamily(),
                  begin_key, end_key);
#endif  //  ROCKSDB_LITE
            } else if (FLAGS_num_column_families <= 1) {
              batch.DeleteRange(begin_key, end_key);
            } else {
              batch.DeleteRange(db_with_cfh->GetCfh(rand_num), begin_key,
                                end_key);
            }
          }
        }
      }
      if (thread->shared->write_rate_limiter.get() != nullptr) {
        thread->shared->write_rate_limiter->Request(
            batch_bytes, Env::IO_HIGH, nullptr /* stats */,
            RateLimiter::OpType::kWrite);
        // Set time at which last op finished to Now() to hide latency and
        // sleep from rate limiter. Also, do the check once per batch, not
        // once per write.
        thread->stats.ResetLastOpTime();
      }
      if (user_timestamp_size_ > 0) {
        Slice user_ts = mock_app_clock_->Allocate(ts_guard.get());
        s = batch.UpdateTimestamps(
            user_ts, [this](uint32_t) { return user_timestamp_size_; });
        if (!s.ok()) {
          fprintf(stderr, "assign timestamp to write batch: %s\n",
                  s.ToString().c_str());
          ErrorExit();
        }
      }
      if (!use_blob_db_) {
        // Not stacked BlobDB
        s = db_with_cfh->db->Write(write_options_, &batch);
      }
      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db,
                                entries_per_batch_, kWrite);
      if (FLAGS_sine_write_rate) {
        // Periodically replace the rate limiter so the allowed write rate
        // follows a sine curve over elapsed time.
        uint64_t now = FLAGS_env->NowMicros();

        uint64_t usecs_since_last;
        if (now > thread->stats.GetSineInterval()) {
          usecs_since_last = now - thread->stats.GetSineInterval();
        } else {
          usecs_since_last = 0;
        }

        if (usecs_since_last >
            (FLAGS_sine_write_rate_interval_milliseconds * uint64_t{1000})) {
          double usecs_since_start =
              static_cast<double>(now - thread->stats.GetStart());
          thread->stats.ResetSineInterval();
          uint64_t write_rate =
              static_cast<uint64_t>(SineRate(usecs_since_start / 1000000.0));
          thread->shared->write_rate_limiter.reset(
              NewGenericRateLimiter(write_rate));
        }
      }
      if (!s.ok()) {
        // Give a failed write a chance to recover (e.g. after background
        // error) before treating it as fatal.
        s = listener_->WaitForRecovery(600000000) ? Status::OK() : s;
      }

      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        ErrorExit();
      }
    }
    if ((write_mode == UNIQUE_RANDOM) && (p > 0.0)) {
      fprintf(stdout,
              "Number of unique keys inserted: %" PRIu64
              ".\nNumber of overwrites: %" PRIu64 "\n",
              num_unique_keys, num_overwrites);
    } else if (kNumDispAndPersEntries > 0) {
      fprintf(stdout,
              "Number of unique keys inserted (disposable+persistent): %" PRIu64
              ".\nNumber of 'disposable entry delete': %" PRIu64 "\n",
              num_written, num_selective_deletes);
    }
    if (num_range_deletions > 0) {
      std::cout << "Number of range deletions: " << num_range_deletions
                << std::endl;
    }
    thread->stats.AddBytes(bytes);
  }
5479 | ||
5480 | Status DoDeterministicCompact(ThreadState* thread, | |
5481 | CompactionStyle compaction_style, | |
5482 | WriteMode write_mode) { | |
5483 | #ifndef ROCKSDB_LITE | |
5484 | ColumnFamilyMetaData meta; | |
5485 | std::vector<DB*> db_list; | |
5486 | if (db_.db != nullptr) { | |
5487 | db_list.push_back(db_.db); | |
5488 | } else { | |
5489 | for (auto& db : multi_dbs_) { | |
5490 | db_list.push_back(db.db); | |
5491 | } | |
5492 | } | |
5493 | std::vector<Options> options_list; | |
5494 | for (auto db : db_list) { | |
5495 | options_list.push_back(db->GetOptions()); | |
5496 | if (compaction_style != kCompactionStyleFIFO) { | |
5497 | db->SetOptions({{"disable_auto_compactions", "1"}, | |
5498 | {"level0_slowdown_writes_trigger", "400000000"}, | |
5499 | {"level0_stop_writes_trigger", "400000000"}}); | |
5500 | } else { | |
5501 | db->SetOptions({{"disable_auto_compactions", "1"}}); | |
5502 | } | |
5503 | } | |
5504 | ||
5505 | assert(!db_list.empty()); | |
5506 | auto num_db = db_list.size(); | |
5507 | size_t num_levels = static_cast<size_t>(open_options_.num_levels); | |
5508 | size_t output_level = open_options_.num_levels - 1; | |
5509 | std::vector<std::vector<std::vector<SstFileMetaData>>> sorted_runs(num_db); | |
5510 | std::vector<size_t> num_files_at_level0(num_db, 0); | |
5511 | if (compaction_style == kCompactionStyleLevel) { | |
5512 | if (num_levels == 0) { | |
5513 | return Status::InvalidArgument("num_levels should be larger than 1"); | |
5514 | } | |
5515 | bool should_stop = false; | |
5516 | while (!should_stop) { | |
5517 | if (sorted_runs[0].empty()) { | |
5518 | DoWrite(thread, write_mode); | |
5519 | } else { | |
5520 | DoWrite(thread, UNIQUE_RANDOM); | |
5521 | } | |
5522 | for (size_t i = 0; i < num_db; i++) { | |
5523 | auto db = db_list[i]; | |
5524 | db->Flush(FlushOptions()); | |
5525 | db->GetColumnFamilyMetaData(&meta); | |
5526 | if (num_files_at_level0[i] == meta.levels[0].files.size() || | |
5527 | writes_ == 0) { | |
5528 | should_stop = true; | |
5529 | continue; | |
5530 | } | |
5531 | sorted_runs[i].emplace_back( | |
5532 | meta.levels[0].files.begin(), | |
5533 | meta.levels[0].files.end() - num_files_at_level0[i]); | |
5534 | num_files_at_level0[i] = meta.levels[0].files.size(); | |
5535 | if (sorted_runs[i].back().size() == 1) { | |
5536 | should_stop = true; | |
5537 | continue; | |
5538 | } | |
5539 | if (sorted_runs[i].size() == output_level) { | |
5540 | auto& L1 = sorted_runs[i].back(); | |
5541 | L1.erase(L1.begin(), L1.begin() + L1.size() / 3); | |
5542 | should_stop = true; | |
5543 | continue; | |
5544 | } | |
5545 | } | |
1e59de90 TL |
5546 | writes_ /= |
5547 | static_cast<int64_t>(open_options_.max_bytes_for_level_multiplier); | |
7c673cae FG |
5548 | } |
5549 | for (size_t i = 0; i < num_db; i++) { | |
5550 | if (sorted_runs[i].size() < num_levels - 1) { | |
1e59de90 TL |
5551 | fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n", |
5552 | num_levels); | |
7c673cae FG |
5553 | exit(1); |
5554 | } | |
5555 | } | |
5556 | for (size_t i = 0; i < num_db; i++) { | |
5557 | auto db = db_list[i]; | |
5558 | auto compactionOptions = CompactionOptions(); | |
11fdf7f2 | 5559 | compactionOptions.compression = FLAGS_compression_type_e; |
7c673cae FG |
5560 | auto options = db->GetOptions(); |
5561 | MutableCFOptions mutable_cf_options(options); | |
5562 | for (size_t j = 0; j < sorted_runs[i].size(); j++) { | |
1e59de90 TL |
5563 | compactionOptions.output_file_size_limit = MaxFileSizeForLevel( |
5564 | mutable_cf_options, static_cast<int>(output_level), | |
5565 | compaction_style); | |
7c673cae | 5566 | std::cout << sorted_runs[i][j].size() << std::endl; |
1e59de90 TL |
5567 | db->CompactFiles( |
5568 | compactionOptions, | |
5569 | {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name}, | |
5570 | static_cast<int>(output_level - j) /*level*/); | |
7c673cae FG |
5571 | } |
5572 | } | |
5573 | } else if (compaction_style == kCompactionStyleUniversal) { | |
5574 | auto ratio = open_options_.compaction_options_universal.size_ratio; | |
5575 | bool should_stop = false; | |
5576 | while (!should_stop) { | |
5577 | if (sorted_runs[0].empty()) { | |
5578 | DoWrite(thread, write_mode); | |
5579 | } else { | |
5580 | DoWrite(thread, UNIQUE_RANDOM); | |
5581 | } | |
5582 | for (size_t i = 0; i < num_db; i++) { | |
5583 | auto db = db_list[i]; | |
5584 | db->Flush(FlushOptions()); | |
5585 | db->GetColumnFamilyMetaData(&meta); | |
5586 | if (num_files_at_level0[i] == meta.levels[0].files.size() || | |
5587 | writes_ == 0) { | |
5588 | should_stop = true; | |
5589 | continue; | |
5590 | } | |
5591 | sorted_runs[i].emplace_back( | |
5592 | meta.levels[0].files.begin(), | |
5593 | meta.levels[0].files.end() - num_files_at_level0[i]); | |
5594 | num_files_at_level0[i] = meta.levels[0].files.size(); | |
5595 | if (sorted_runs[i].back().size() == 1) { | |
5596 | should_stop = true; | |
5597 | continue; | |
5598 | } | |
5599 | num_files_at_level0[i] = meta.levels[0].files.size(); | |
5600 | } | |
1e59de90 TL |
5601 | writes_ = static_cast<int64_t>(writes_ * static_cast<double>(100) / |
5602 | (ratio + 200)); | |
7c673cae FG |
5603 | } |
5604 | for (size_t i = 0; i < num_db; i++) { | |
5605 | if (sorted_runs[i].size() < num_levels) { | |
1e59de90 TL |
5606 | fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n", |
5607 | num_levels); | |
7c673cae FG |
5608 | exit(1); |
5609 | } | |
5610 | } | |
5611 | for (size_t i = 0; i < num_db; i++) { | |
5612 | auto db = db_list[i]; | |
5613 | auto compactionOptions = CompactionOptions(); | |
11fdf7f2 | 5614 | compactionOptions.compression = FLAGS_compression_type_e; |
7c673cae FG |
5615 | auto options = db->GetOptions(); |
5616 | MutableCFOptions mutable_cf_options(options); | |
5617 | for (size_t j = 0; j < sorted_runs[i].size(); j++) { | |
1e59de90 TL |
5618 | compactionOptions.output_file_size_limit = MaxFileSizeForLevel( |
5619 | mutable_cf_options, static_cast<int>(output_level), | |
5620 | compaction_style); | |
7c673cae FG |
5621 | db->CompactFiles( |
5622 | compactionOptions, | |
5623 | {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name}, | |
5624 | (output_level > j ? static_cast<int>(output_level - j) | |
5625 | : 0) /*level*/); | |
5626 | } | |
5627 | } | |
5628 | } else if (compaction_style == kCompactionStyleFIFO) { | |
5629 | if (num_levels != 1) { | |
5630 | return Status::InvalidArgument( | |
1e59de90 | 5631 | "num_levels should be 1 for FIFO compaction"); |
7c673cae FG |
5632 | } |
5633 | if (FLAGS_num_multi_db != 0) { | |
5634 | return Status::InvalidArgument("Doesn't support multiDB"); | |
5635 | } | |
5636 | auto db = db_list[0]; | |
5637 | std::vector<std::string> file_names; | |
5638 | while (true) { | |
5639 | if (sorted_runs[0].empty()) { | |
5640 | DoWrite(thread, write_mode); | |
5641 | } else { | |
5642 | DoWrite(thread, UNIQUE_RANDOM); | |
5643 | } | |
5644 | db->Flush(FlushOptions()); | |
5645 | db->GetColumnFamilyMetaData(&meta); | |
5646 | auto total_size = meta.levels[0].size; | |
5647 | if (total_size >= | |
1e59de90 | 5648 | db->GetOptions().compaction_options_fifo.max_table_files_size) { |
7c673cae FG |
5649 | for (auto file_meta : meta.levels[0].files) { |
5650 | file_names.emplace_back(file_meta.name); | |
5651 | } | |
5652 | break; | |
5653 | } | |
5654 | } | |
5655 | // TODO(shuzhang1989): Investigate why CompactFiles not working | |
5656 | // auto compactionOptions = CompactionOptions(); | |
5657 | // db->CompactFiles(compactionOptions, file_names, 0); | |
5658 | auto compactionOptions = CompactRangeOptions(); | |
5659 | db->CompactRange(compactionOptions, nullptr, nullptr); | |
5660 | } else { | |
5661 | fprintf(stdout, | |
5662 | "%-12s : skipped (-compaction_stype=kCompactionStyleNone)\n", | |
5663 | "filldeterministic"); | |
5664 | return Status::InvalidArgument("None compaction is not supported"); | |
5665 | } | |
5666 | ||
5667 | // Verify seqno and key range | |
5668 | // Note: the seqno get changed at the max level by implementation | |
5669 | // optimization, so skip the check of the max level. | |
5670 | #ifndef NDEBUG | |
5671 | for (size_t k = 0; k < num_db; k++) { | |
5672 | auto db = db_list[k]; | |
5673 | db->GetColumnFamilyMetaData(&meta); | |
5674 | // verify the number of sorted runs | |
5675 | if (compaction_style == kCompactionStyleLevel) { | |
5676 | assert(num_levels - 1 == sorted_runs[k].size()); | |
5677 | } else if (compaction_style == kCompactionStyleUniversal) { | |
5678 | assert(meta.levels[0].files.size() + num_levels - 1 == | |
5679 | sorted_runs[k].size()); | |
5680 | } else if (compaction_style == kCompactionStyleFIFO) { | |
5681 | // TODO(gzh): FIFO compaction | |
5682 | db->GetColumnFamilyMetaData(&meta); | |
5683 | auto total_size = meta.levels[0].size; | |
5684 | assert(total_size <= | |
1e59de90 TL |
5685 | db->GetOptions().compaction_options_fifo.max_table_files_size); |
5686 | break; | |
7c673cae FG |
5687 | } |
5688 | ||
5689 | // verify smallest/largest seqno and key range of each sorted run | |
5690 | auto max_level = num_levels - 1; | |
5691 | int level; | |
5692 | for (size_t i = 0; i < sorted_runs[k].size(); i++) { | |
5693 | level = static_cast<int>(max_level - i); | |
5694 | SequenceNumber sorted_run_smallest_seqno = kMaxSequenceNumber; | |
5695 | SequenceNumber sorted_run_largest_seqno = 0; | |
5696 | std::string sorted_run_smallest_key, sorted_run_largest_key; | |
5697 | bool first_key = true; | |
5698 | for (auto fileMeta : sorted_runs[k][i]) { | |
5699 | sorted_run_smallest_seqno = | |
5700 | std::min(sorted_run_smallest_seqno, fileMeta.smallest_seqno); | |
5701 | sorted_run_largest_seqno = | |
5702 | std::max(sorted_run_largest_seqno, fileMeta.largest_seqno); | |
5703 | if (first_key || | |
5704 | db->DefaultColumnFamily()->GetComparator()->Compare( | |
5705 | fileMeta.smallestkey, sorted_run_smallest_key) < 0) { | |
5706 | sorted_run_smallest_key = fileMeta.smallestkey; | |
5707 | } | |
5708 | if (first_key || | |
5709 | db->DefaultColumnFamily()->GetComparator()->Compare( | |
5710 | fileMeta.largestkey, sorted_run_largest_key) > 0) { | |
5711 | sorted_run_largest_key = fileMeta.largestkey; | |
5712 | } | |
5713 | first_key = false; | |
5714 | } | |
5715 | if (compaction_style == kCompactionStyleLevel || | |
5716 | (compaction_style == kCompactionStyleUniversal && level > 0)) { | |
5717 | SequenceNumber level_smallest_seqno = kMaxSequenceNumber; | |
5718 | SequenceNumber level_largest_seqno = 0; | |
5719 | for (auto fileMeta : meta.levels[level].files) { | |
5720 | level_smallest_seqno = | |
5721 | std::min(level_smallest_seqno, fileMeta.smallest_seqno); | |
5722 | level_largest_seqno = | |
5723 | std::max(level_largest_seqno, fileMeta.largest_seqno); | |
5724 | } | |
5725 | assert(sorted_run_smallest_key == | |
5726 | meta.levels[level].files.front().smallestkey); | |
5727 | assert(sorted_run_largest_key == | |
5728 | meta.levels[level].files.back().largestkey); | |
5729 | if (level != static_cast<int>(max_level)) { | |
5730 | // compaction at max_level would change sequence number | |
5731 | assert(sorted_run_smallest_seqno == level_smallest_seqno); | |
5732 | assert(sorted_run_largest_seqno == level_largest_seqno); | |
5733 | } | |
5734 | } else if (compaction_style == kCompactionStyleUniversal) { | |
5735 | // level <= 0 means sorted runs on level 0 | |
5736 | auto level0_file = | |
5737 | meta.levels[0].files[sorted_runs[k].size() - 1 - i]; | |
5738 | assert(sorted_run_smallest_key == level0_file.smallestkey); | |
5739 | assert(sorted_run_largest_key == level0_file.largestkey); | |
5740 | if (level != static_cast<int>(max_level)) { | |
5741 | assert(sorted_run_smallest_seqno == level0_file.smallest_seqno); | |
5742 | assert(sorted_run_largest_seqno == level0_file.largest_seqno); | |
5743 | } | |
5744 | } | |
5745 | } | |
5746 | } | |
5747 | #endif | |
5748 | // print the size of each sorted_run | |
5749 | for (size_t k = 0; k < num_db; k++) { | |
5750 | auto db = db_list[k]; | |
5751 | fprintf(stdout, | |
1e59de90 TL |
5752 | "---------------------- DB %" ROCKSDB_PRIszt |
5753 | " LSM ---------------------\n", | |
5754 | k); | |
7c673cae FG |
5755 | db->GetColumnFamilyMetaData(&meta); |
5756 | for (auto& levelMeta : meta.levels) { | |
5757 | if (levelMeta.files.empty()) { | |
5758 | continue; | |
5759 | } | |
5760 | if (levelMeta.level == 0) { | |
5761 | for (auto& fileMeta : levelMeta.files) { | |
1e59de90 | 5762 | fprintf(stdout, "Level[%d]: %s(size: %" PRIi64 " bytes)\n", |
7c673cae FG |
5763 | levelMeta.level, fileMeta.name.c_str(), fileMeta.size); |
5764 | } | |
5765 | } else { | |
5766 | fprintf(stdout, "Level[%d]: %s - %s(total size: %" PRIi64 " bytes)\n", | |
5767 | levelMeta.level, levelMeta.files.front().name.c_str(), | |
5768 | levelMeta.files.back().name.c_str(), levelMeta.size); | |
5769 | } | |
5770 | } | |
5771 | } | |
5772 | for (size_t i = 0; i < num_db; i++) { | |
5773 | db_list[i]->SetOptions( | |
5774 | {{"disable_auto_compactions", | |
5775 | std::to_string(options_list[i].disable_auto_compactions)}, | |
5776 | {"level0_slowdown_writes_trigger", | |
5777 | std::to_string(options_list[i].level0_slowdown_writes_trigger)}, | |
5778 | {"level0_stop_writes_trigger", | |
5779 | std::to_string(options_list[i].level0_stop_writes_trigger)}}); | |
5780 | } | |
5781 | return Status::OK(); | |
5782 | #else | |
11fdf7f2 TL |
5783 | (void)thread; |
5784 | (void)compaction_style; | |
5785 | (void)write_mode; | |
7c673cae FG |
5786 | fprintf(stderr, "Rocksdb Lite doesn't support filldeterministic\n"); |
5787 | return Status::NotSupported( | |
5788 | "Rocksdb Lite doesn't support filldeterministic"); | |
5789 | #endif // ROCKSDB_LITE | |
5790 | } | |
5791 | ||
5792 | void ReadSequential(ThreadState* thread) { | |
5793 | if (db_.db != nullptr) { | |
5794 | ReadSequential(thread, db_.db); | |
5795 | } else { | |
5796 | for (const auto& db_with_cfh : multi_dbs_) { | |
5797 | ReadSequential(thread, db_with_cfh.db); | |
5798 | } | |
5799 | } | |
5800 | } | |
5801 | ||
5802 | void ReadSequential(ThreadState* thread, DB* db) { | |
1e59de90 | 5803 | ReadOptions options = read_options_; |
20effc67 TL |
5804 | std::unique_ptr<char[]> ts_guard; |
5805 | Slice ts; | |
5806 | if (user_timestamp_size_ > 0) { | |
5807 | ts_guard.reset(new char[user_timestamp_size_]); | |
5808 | ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get()); | |
5809 | options.timestamp = &ts; | |
5810 | } | |
7c673cae | 5811 | |
1e59de90 TL |
5812 | options.adaptive_readahead = FLAGS_adaptive_readahead; |
5813 | options.async_io = FLAGS_async_io; | |
5814 | ||
7c673cae FG |
5815 | Iterator* iter = db->NewIterator(options); |
5816 | int64_t i = 0; | |
5817 | int64_t bytes = 0; | |
5818 | for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) { | |
5819 | bytes += iter->key().size() + iter->value().size(); | |
5820 | thread->stats.FinishedOps(nullptr, db, 1, kRead); | |
5821 | ++i; | |
5822 | ||
5823 | if (thread->shared->read_rate_limiter.get() != nullptr && | |
5824 | i % 1024 == 1023) { | |
5825 | thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH, | |
11fdf7f2 TL |
5826 | nullptr /* stats */, |
5827 | RateLimiter::OpType::kRead); | |
7c673cae FG |
5828 | } |
5829 | } | |
5830 | ||
5831 | delete iter; | |
5832 | thread->stats.AddBytes(bytes); | |
f67539c2 TL |
5833 | } |
5834 | ||
  // Warms the row cache: issues one point Get for every key id in
  // [0, FLAGS_num), in order. Aborts the process on any error other than
  // NotFound. Byte and found counts are published through the thread stats.
  void ReadToRowCache(ThreadState* thread) {
    int64_t read = 0;      // Gets issued so far
    int64_t found = 0;     // Gets that returned a value
    int64_t bytes = 0;     // total key+value bytes read
    int64_t key_rand = 0;  // next key id to read
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    PinnableSlice pinnable_val;

    while (key_rand < FLAGS_num) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      // We use same key_rand as seed for key and column family so that we can
      // deterministically find the cfh corresponding to a particular key, as it
      // is done in DoWrite method.
      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
      key_rand++;
      read++;
      Status s;
      if (FLAGS_num_column_families > 1) {
        // NOTE(review): GetCfh() is called with the already-incremented
        // key_rand, i.e. one past the id used to generate `key` — confirm
        // this matches the cfh chosen by DoWrite for the same key.
        s = db_with_cfh->db->Get(read_options_, db_with_cfh->GetCfh(key_rand),
                                 key, &pinnable_val);
      } else {
        pinnable_val.Reset();
        s = db_with_cfh->db->Get(read_options_,
                                 db_with_cfh->db->DefaultColumnFamily(), key,
                                 &pinnable_val);
      }

      if (s.ok()) {
        found++;
        bytes += key.size() + pinnable_val.size();
      } else if (!s.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
        abort();
      }

      // Charge the rate limiter in batches of 256 reads.
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          read % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
             read);

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
5887 | ||
5888 | void ReadReverse(ThreadState* thread) { | |
5889 | if (db_.db != nullptr) { | |
5890 | ReadReverse(thread, db_.db); | |
5891 | } else { | |
5892 | for (const auto& db_with_cfh : multi_dbs_) { | |
5893 | ReadReverse(thread, db_with_cfh.db); | |
5894 | } | |
5895 | } | |
5896 | } | |
5897 | ||
5898 | void ReadReverse(ThreadState* thread, DB* db) { | |
1e59de90 | 5899 | Iterator* iter = db->NewIterator(read_options_); |
7c673cae FG |
5900 | int64_t i = 0; |
5901 | int64_t bytes = 0; | |
5902 | for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) { | |
5903 | bytes += iter->key().size() + iter->value().size(); | |
5904 | thread->stats.FinishedOps(nullptr, db, 1, kRead); | |
5905 | ++i; | |
5906 | if (thread->shared->read_rate_limiter.get() != nullptr && | |
5907 | i % 1024 == 1023) { | |
5908 | thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH, | |
11fdf7f2 TL |
5909 | nullptr /* stats */, |
5910 | RateLimiter::OpType::kRead); | |
7c673cae FG |
5911 | } |
5912 | } | |
5913 | delete iter; | |
5914 | thread->stats.AddBytes(bytes); | |
5915 | } | |
5916 | ||
  // Measures raw point-Get throughput. Key ids are drawn with a bit-mask from
  // [0, 2^ceil(log2(FLAGS_num))), so some ids intentionally fall outside the
  // key space; those are tallied as "non-exist keys" in the final message.
  void ReadRandomFast(ThreadState* thread) {
    int64_t read = 0;      // Gets issued
    int64_t found = 0;     // Gets that returned a value
    int64_t nonexist = 0;  // generated ids >= FLAGS_num
    ReadOptions options = read_options_;
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::string value;
    Slice ts;
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    DB* db = SelectDBWithCfh(thread)->db;

    // Round FLAGS_num up to the next power of two so key ids can be drawn
    // with a cheap mask (pot - 1) instead of a modulo.
    int64_t pot = 1;
    while (pot < FLAGS_num) {
      pot <<= 1;
    }

    Duration duration(FLAGS_duration, reads_);
    do {
      // Work in batches of 100 ops between rate-limiter / stats updates.
      for (int i = 0; i < 100; ++i) {
        int64_t key_rand = thread->rand.Next() & (pot - 1);
        GenerateKeyFromInt(key_rand, FLAGS_num, &key);
        ++read;
        std::string ts_ret;
        std::string* ts_ptr = nullptr;
        if (user_timestamp_size_ > 0) {
          // A fresh read timestamp per Get.
          ts = mock_app_clock_->GetTimestampForRead(thread->rand,
                                                    ts_guard.get());
          options.timestamp = &ts;
          ts_ptr = &ts_ret;
        }
        auto status = db->Get(options, key, &value, ts_ptr);
        if (status.ok()) {
          ++found;
        } else if (!status.IsNotFound()) {
          fprintf(stderr, "Get returned an error: %s\n",
                  status.ToString().c_str());
          abort();
        }
        if (key_rand >= FLAGS_num) {
          ++nonexist;
        }
      }
      if (thread->shared->read_rate_limiter.get() != nullptr) {
        thread->shared->read_rate_limiter->Request(
            100, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(nullptr, db, 100, kRead);
    } while (!duration.Done(100));

    char msg[100];
    snprintf(msg, sizeof(msg),
             "(%" PRIu64 " of %" PRIu64
             " found, "
             "issued %" PRIu64 " non-exist keys)\n",
             found, read, nonexist);

    thread->stats.AddMessage(msg);
  }
5980 | ||
5981 | int64_t GetRandomKey(Random64* rand) { | |
5982 | uint64_t rand_int = rand->Next(); | |
5983 | int64_t key_rand; | |
5984 | if (read_random_exp_range_ == 0) { | |
5985 | key_rand = rand_int % FLAGS_num; | |
5986 | } else { | |
5987 | const uint64_t kBigInt = static_cast<uint64_t>(1U) << 62; | |
5988 | long double order = -static_cast<long double>(rand_int % kBigInt) / | |
5989 | static_cast<long double>(kBigInt) * | |
5990 | read_random_exp_range_; | |
5991 | long double exp_ran = std::exp(order); | |
5992 | uint64_t rand_num = | |
5993 | static_cast<int64_t>(exp_ran * static_cast<long double>(FLAGS_num)); | |
5994 | // Map to a different number to avoid locality. | |
5995 | const uint64_t kBigPrime = 0x5bd1e995; | |
5996 | // Overflow is like %(2^64). Will have little impact of results. | |
5997 | key_rand = static_cast<int64_t>((rand_num * kBigPrime) % FLAGS_num); | |
5998 | } | |
5999 | return key_rand; | |
6000 | } | |
6001 | ||
  // Random point-lookup benchmark. Issues Get() — or GetMergeOperands() when
  // read_operands_ is set — against randomly chosen keys until the configured
  // duration / read count is exhausted. Key selection optionally follows a
  // strided pattern (FLAGS_multiread_stride) so batches of nearby keys are
  // touched together.
  void ReadRandom(ThreadState* thread) {
    int64_t read = 0;   // Gets issued
    int64_t found = 0;  // Gets that returned a value
    int64_t bytes = 0;  // total key+value(+timestamp) bytes read
    int num_keys = 0;   // position within the current strided batch
    int64_t key_rand = 0;
    ReadOptions options = read_options_;
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    PinnableSlice pinnable_val;
    std::vector<PinnableSlice> pinnable_vals;
    if (read_operands_) {
      // Start off with a small-ish value that'll be increased later if
      // `GetMergeOperands()` tells us it is not large enough.
      pinnable_vals.resize(8);
    }
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(1)) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      // We use same key_rand as seed for key and column family so that we can
      // deterministically find the cfh corresponding to a particular key, as it
      // is done in DoWrite method.
      if (entries_per_batch_ > 1 && FLAGS_multiread_stride) {
        // Strided mode: pick a fresh base key every entries_per_batch_ ops,
        // clamping so the whole stride run stays inside [0, FLAGS_num).
        if (++num_keys == entries_per_batch_) {
          num_keys = 0;
          key_rand = GetRandomKey(&thread->rand);
          if ((key_rand + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
              FLAGS_num) {
            key_rand = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
          }
        } else {
          key_rand += FLAGS_multiread_stride;
        }
      } else {
        key_rand = GetRandomKey(&thread->rand);
      }
      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
      read++;
      std::string ts_ret;
      std::string* ts_ptr = nullptr;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
        options.timestamp = &ts;
        ts_ptr = &ts_ret;
      }
      Status s;
      pinnable_val.Reset();
      for (size_t i = 0; i < pinnable_vals.size(); ++i) {
        pinnable_vals[i].Reset();
      }
      ColumnFamilyHandle* cfh;
      if (FLAGS_num_column_families > 1) {
        cfh = db_with_cfh->GetCfh(key_rand);
      } else {
        cfh = db_with_cfh->db->DefaultColumnFamily();
      }
      if (read_operands_) {
        GetMergeOperandsOptions get_merge_operands_options;
        get_merge_operands_options.expected_max_number_of_operands =
            static_cast<int>(pinnable_vals.size());
        int number_of_operands;
        s = db_with_cfh->db->GetMergeOperands(
            options, cfh, key, pinnable_vals.data(),
            &get_merge_operands_options, &number_of_operands);
        if (s.IsIncomplete()) {
          // Should only happen a few times when we encounter a key that had
          // more merge operands than any key seen so far. Production use case
          // would typically retry in such event to get all the operands so do
          // that here.
          pinnable_vals.resize(number_of_operands);
          get_merge_operands_options.expected_max_number_of_operands =
              static_cast<int>(pinnable_vals.size());
          s = db_with_cfh->db->GetMergeOperands(
              options, cfh, key, pinnable_vals.data(),
              &get_merge_operands_options, &number_of_operands);
        }
      } else {
        s = db_with_cfh->db->Get(options, cfh, key, &pinnable_val, ts_ptr);
      }

      if (s.ok()) {
        found++;
        bytes += key.size() + pinnable_val.size() + user_timestamp_size_;
        for (size_t i = 0; i < pinnable_vals.size(); ++i) {
          bytes += pinnable_vals[i].size();
          pinnable_vals[i].Reset();
        }
      } else if (!s.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
        abort();
      }

      // Charge the rate limiter in batches of 256 reads.
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          read % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
             read);

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
6116 | ||
  // Calls MultiGet over a list of keys from a random distribution.
  // The number of keys found is reported via the thread stats message
  // (the function itself returns void).
  void MultiReadRandom(ThreadState* thread) {
    int64_t read = 0;            // keys requested
    int64_t bytes = 0;           // total key+value(+timestamp) bytes read
    int64_t num_multireads = 0;  // MultiGet calls issued
    int64_t found = 0;           // keys that returned a value
    ReadOptions options = read_options_;
    std::vector<Slice> keys;
    std::vector<std::unique_ptr<const char[]>> key_guards;
    std::vector<std::string> values(entries_per_batch_);
    PinnableSlice* pin_values = new PinnableSlice[entries_per_batch_];
    std::unique_ptr<PinnableSlice[]> pin_values_guard(pin_values);
    std::vector<Status> stat_list(entries_per_batch_);
    // Pre-allocate one key buffer per batch slot; the guards own the memory.
    while (static_cast<int64_t>(keys.size()) < entries_per_batch_) {
      key_guards.push_back(std::unique_ptr<const char[]>());
      keys.push_back(AllocateKey(&key_guards.back()));
    }

    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(entries_per_batch_)) {
      DB* db = SelectDB(thread);
      if (FLAGS_multiread_stride) {
        // Strided batch: consecutive keys separated by the stride, clamped
        // so the run stays inside [0, FLAGS_num).
        int64_t key = GetRandomKey(&thread->rand);
        if ((key + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
            static_cast<int64_t>(FLAGS_num)) {
          key = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
        }
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          GenerateKeyFromInt(key, FLAGS_num, &keys[i]);
          key += FLAGS_multiread_stride;
        }
      } else {
        // Independent random key per batch slot.
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          GenerateKeyFromInt(GetRandomKey(&thread->rand), FLAGS_num, &keys[i]);
        }
      }
      Slice ts;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
        options.timestamp = &ts;
      }
      if (!FLAGS_multiread_batched) {
        // Status-vector MultiGet overload.
        std::vector<Status> statuses = db->MultiGet(options, keys, &values);
        assert(static_cast<int64_t>(statuses.size()) == entries_per_batch_);

        read += entries_per_batch_;
        num_multireads++;
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          if (statuses[i].ok()) {
            bytes += keys[i].size() + values[i].size() + user_timestamp_size_;
            ++found;
          } else if (!statuses[i].IsNotFound()) {
            fprintf(stderr, "MultiGet returned an error: %s\n",
                    statuses[i].ToString().c_str());
            abort();
          }
        }
      } else {
        // Batched (array-based) MultiGet overload with PinnableSlice outputs.
        db->MultiGet(options, db->DefaultColumnFamily(), keys.size(),
                     keys.data(), pin_values, stat_list.data());

        read += entries_per_batch_;
        num_multireads++;
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          if (stat_list[i].ok()) {
            bytes +=
                keys[i].size() + pin_values[i].size() + user_timestamp_size_;
            ++found;
          } else if (!stat_list[i].IsNotFound()) {
            fprintf(stderr, "MultiGet returned an error: %s\n",
                    stat_list[i].ToString().c_str());
            abort();
          }
          // Reset slots for reuse on the next batch.
          stat_list[i] = Status::OK();
          pin_values[i].Reset();
        }
      }
      // Charge the rate limiter once per 256 MultiGet calls.
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          num_multireads % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256 * entries_per_batch_, Env::IO_HIGH, nullptr /* stats */,
            RateLimiter::OpType::kRead);
      }
      thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kRead);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found,
             read);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
6215 | ||
20effc67 TL |
6216 | // Calls ApproximateSize over random key ranges. |
6217 | void ApproximateSizeRandom(ThreadState* thread) { | |
6218 | int64_t size_sum = 0; | |
6219 | int64_t num_sizes = 0; | |
6220 | const size_t batch_size = entries_per_batch_; | |
6221 | std::vector<Range> ranges; | |
6222 | std::vector<Slice> lkeys; | |
6223 | std::vector<std::unique_ptr<const char[]>> lkey_guards; | |
6224 | std::vector<Slice> rkeys; | |
6225 | std::vector<std::unique_ptr<const char[]>> rkey_guards; | |
6226 | std::vector<uint64_t> sizes; | |
6227 | while (ranges.size() < batch_size) { | |
6228 | // Ugly without C++17 return from emplace_back | |
6229 | lkey_guards.emplace_back(); | |
6230 | rkey_guards.emplace_back(); | |
6231 | lkeys.emplace_back(AllocateKey(&lkey_guards.back())); | |
6232 | rkeys.emplace_back(AllocateKey(&rkey_guards.back())); | |
6233 | ranges.emplace_back(lkeys.back(), rkeys.back()); | |
6234 | sizes.push_back(0); | |
6235 | } | |
6236 | Duration duration(FLAGS_duration, reads_); | |
6237 | while (!duration.Done(1)) { | |
6238 | DB* db = SelectDB(thread); | |
6239 | for (size_t i = 0; i < batch_size; ++i) { | |
6240 | int64_t lkey = GetRandomKey(&thread->rand); | |
6241 | int64_t rkey = GetRandomKey(&thread->rand); | |
6242 | if (lkey > rkey) { | |
6243 | std::swap(lkey, rkey); | |
6244 | } | |
6245 | GenerateKeyFromInt(lkey, FLAGS_num, &lkeys[i]); | |
6246 | GenerateKeyFromInt(rkey, FLAGS_num, &rkeys[i]); | |
6247 | } | |
6248 | db->GetApproximateSizes(&ranges[0], static_cast<int>(entries_per_batch_), | |
6249 | &sizes[0]); | |
6250 | num_sizes += entries_per_batch_; | |
6251 | for (int64_t size : sizes) { | |
6252 | size_sum += size; | |
6253 | } | |
6254 | thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kOthers); | |
6255 | } | |
6256 | ||
6257 | char msg[100]; | |
6258 | snprintf(msg, sizeof(msg), "(Avg approx size=%g)", | |
6259 | static_cast<double>(size_sum) / static_cast<double>(num_sizes)); | |
6260 | thread->stats.AddMessage(msg); | |
6261 | } | |
6262 | ||
f67539c2 | 6263 | // The inverse function of Pareto distribution |
494da23a TL |
6264 | int64_t ParetoCdfInversion(double u, double theta, double k, double sigma) { |
6265 | double ret; | |
6266 | if (k == 0.0) { | |
6267 | ret = theta - sigma * std::log(u); | |
6268 | } else { | |
6269 | ret = theta + sigma * (std::pow(u, -1 * k) - 1) / k; | |
6270 | } | |
6271 | return static_cast<int64_t>(ceil(ret)); | |
6272 | } | |
f67539c2 | 6273 | // The inverse function of power distribution (y=ax^b) |
494da23a TL |
6274 | int64_t PowerCdfInversion(double u, double a, double b) { |
6275 | double ret; | |
6276 | ret = std::pow((u / a), (1 / b)); | |
6277 | return static_cast<int64_t>(ceil(ret)); | |
6278 | } | |
6279 | ||
6280 | // Add the noice to the QPS | |
6281 | double AddNoise(double origin, double noise_ratio) { | |
6282 | if (noise_ratio < 0.0 || noise_ratio > 1.0) { | |
6283 | return origin; | |
6284 | } | |
6285 | int band_int = static_cast<int>(FLAGS_sine_a); | |
6286 | double delta = (rand() % band_int - band_int / 2) * noise_ratio; | |
6287 | if (origin + delta < 0) { | |
6288 | return origin; | |
6289 | } else { | |
6290 | return (origin + delta); | |
6291 | } | |
6292 | } | |
6293 | ||
f67539c2 | 6294 | // Decide the ratio of different query types |
494da23a TL |
6295 | // 0 Get, 1 Put, 2 Seek, 3 SeekForPrev, 4 Delete, 5 SingleDelete, 6 merge |
6296 | class QueryDecider { | |
6297 | public: | |
6298 | std::vector<int> type_; | |
6299 | std::vector<double> ratio_; | |
6300 | int range_; | |
6301 | ||
6302 | QueryDecider() {} | |
6303 | ~QueryDecider() {} | |
6304 | ||
6305 | Status Initiate(std::vector<double> ratio_input) { | |
6306 | int range_max = 1000; | |
6307 | double sum = 0.0; | |
6308 | for (auto& ratio : ratio_input) { | |
6309 | sum += ratio; | |
6310 | } | |
6311 | range_ = 0; | |
6312 | for (auto& ratio : ratio_input) { | |
6313 | range_ += static_cast<int>(ceil(range_max * (ratio / sum))); | |
6314 | type_.push_back(range_); | |
6315 | ratio_.push_back(ratio / sum); | |
6316 | } | |
6317 | return Status::OK(); | |
6318 | } | |
6319 | ||
6320 | int GetType(int64_t rand_num) { | |
6321 | if (rand_num < 0) { | |
6322 | rand_num = rand_num * (-1); | |
6323 | } | |
6324 | assert(range_ != 0); | |
6325 | int pos = static_cast<int>(rand_num % range_); | |
6326 | for (int i = 0; i < static_cast<int>(type_.size()); i++) { | |
6327 | if (pos < type_[i]) { | |
6328 | return i; | |
6329 | } | |
6330 | } | |
6331 | return 0; | |
6332 | } | |
6333 | }; | |
6334 | ||
f67539c2 TL |
  // KeyrangeUnit is the struct of a keyrange. It is used in a keyrange vector
  // to transfer a random value to one keyrange based on the hotness.
  struct KeyrangeUnit {
    // First slot of this keyrange in the [0, amplify) selection space.
    int64_t keyrange_start;
    // Number of slots occupied, i.e. this keyrange's access-probability share.
    int64_t keyrange_access;
    // Number of keys contained in this keyrange.
    int64_t keyrange_keys;
  };
6342 | ||
6343 | // From our observations, the prefix hotness (key-range hotness) follows | |
6344 | // the two-term-exponential distribution: f(x) = a*exp(b*x) + c*exp(d*x). | |
6345 | // However, we cannot directly use the inverse function to decide a | |
6346 | // key-range from a random distribution. To achieve it, we create a list of | |
6347 | // KeyrangeUnit, each KeyrangeUnit occupies a range of integers whose size is | |
6348 | // decided based on the hotness of the key-range. When a random value is | |
6349 | // generated based on uniform distribution, we map it to the KeyrangeUnit Vec | |
6350 | // and one KeyrangeUnit is selected. The probability of a KeyrangeUnit being | |
6351 | // selected is the same as the hotness of this KeyrangeUnit. After that, the | |
6352 | // key can be randomly allocated to the key-range of this KeyrangeUnit, or we | |
6353 | // can be based on the power distribution (y=ax^b) to generate the offset of | |
6354 | // the key in the selected key-range. In this way, we generate the keyID | |
6355 | // based on the hotness of the prefix and also the key hotness distribution. | |
  // Maps uniformly random draws onto hot/cold key-ranges following a
  // two-term exponential distribution, then picks a key inside the chosen
  // range (uniformly or via a power distribution).
  class GenerateTwoTermExpKeys {
   public:
    // Avoid uninitialized warning-as-error in some compilers
    // Sum of all keyrange_access spans; a random value is taken modulo this
    // to select one KeyrangeUnit.
    int64_t keyrange_rand_max_ = 0;
    // Number of keys covered by each key-range (total_keys / keyrange_num_).
    int64_t keyrange_size_ = 0;
    // Number of key-ranges the key space is partitioned into (>= 1).
    int64_t keyrange_num_ = 0;
    // One unit per key-range; shuffled so hotness is not ordered by position.
    std::vector<KeyrangeUnit> keyrange_set_;

    // Initiate the KeyrangeUnit vector and calculate the size of each
    // KeyrangeUnit.
    // Currently always returns Status::OK(); the Status return exists for
    // interface symmetry with the other initiation helpers.
    Status InitiateExpDistribution(int64_t total_keys, double prefix_a,
                                   double prefix_b, double prefix_c,
                                   double prefix_d) {
      int64_t amplify = 0;
      int64_t keyrange_start = 0;
      if (FLAGS_keyrange_num <= 0) {
        keyrange_num_ = 1;
      } else {
        keyrange_num_ = FLAGS_keyrange_num;
      }
      keyrange_size_ = total_keys / keyrange_num_;

      // Calculate the key-range shares size based on the input parameters
      for (int64_t pfx = keyrange_num_; pfx >= 1; pfx--) {
        // Step 1. Calculate the probability that this key range will be
        // accessed in a query. It is based on the two-term exponential
        // distribution
        double keyrange_p = prefix_a * std::exp(prefix_b * pfx) +
                            prefix_c * std::exp(prefix_d * pfx);
        // Treat probabilities below 1e-16 as zero to avoid denormal noise.
        if (keyrange_p < std::pow(10.0, -16.0)) {
          keyrange_p = 0.0;
        }
        // Step 2. Calculate the amplify
        // In order to allocate a query to a key-range based on the random
        // number generated for this query, we need to extend the probability
        // of each key range from [0,1] to [0, amplify]. Amplify is calculated
        // by 1/(smallest key-range probability). In this way, we ensure that
        // all key-ranges are assigned with an Integer that >=0
        if (amplify == 0 && keyrange_p > 0) {
          amplify = static_cast<int64_t>(std::floor(1 / keyrange_p)) + 1;
        }

        // Step 3. For each key-range, we calculate its position in the
        // [0, amplify] range, including the start, the size (keyrange_access)
        KeyrangeUnit p_unit;
        p_unit.keyrange_start = keyrange_start;
        if (0.0 >= keyrange_p) {
          p_unit.keyrange_access = 0;
        } else {
          p_unit.keyrange_access =
              static_cast<int64_t>(std::floor(amplify * keyrange_p));
        }
        p_unit.keyrange_keys = keyrange_size_;
        keyrange_set_.push_back(p_unit);
        keyrange_start += p_unit.keyrange_access;
      }
      keyrange_rand_max_ = keyrange_start;
      // NOTE(review): if every keyrange_p underflows to 0,
      // keyrange_rand_max_ stays 0 and DistGetKeyID's modulo would divide by
      // zero — confirm callers always pass parameters yielding a positive
      // probability.

      // Step 4. Shuffle the key-ranges randomly
      // Since the access probability is calculated from small to large,
      // If we do not re-allocate them, hot key-ranges are always at the end
      // and cold key-ranges are at the begin of the key space. Therefore, the
      // key-ranges are shuffled and the rand seed is only decide by the
      // key-range hotness distribution. With the same distribution parameters
      // the shuffle results are the same.
      // NOTE(review): the loop bound is FLAGS_keyrange_num (not
      // keyrange_num_), so when the flag is <= 0 no shuffling happens.
      Random64 rand_loca(keyrange_rand_max_);
      for (int64_t i = 0; i < FLAGS_keyrange_num; i++) {
        int64_t pos = rand_loca.Next() % FLAGS_keyrange_num;
        assert(i >= 0 && i < static_cast<int64_t>(keyrange_set_.size()) &&
               pos >= 0 && pos < static_cast<int64_t>(keyrange_set_.size()));
        std::swap(keyrange_set_[i], keyrange_set_[pos]);
      }

      // Step 5. Recalculate the prefix start position after shuffling
      int64_t offset = 0;
      for (auto& p_unit : keyrange_set_) {
        p_unit.keyrange_start = offset;
        offset += p_unit.keyrange_access;
      }

      return Status::OK();
    }

    // Generate the Key ID according to the input ini_rand and key distribution
    // (key_dist_a/key_dist_b are the power-distribution parameters; when
    // either is 0.0 the in-range offset is chosen uniformly).
    int64_t DistGetKeyID(int64_t ini_rand, double key_dist_a,
                         double key_dist_b) {
      int64_t keyrange_rand = ini_rand % keyrange_rand_max_;

      // Calculate and select one key-range that contains the new key:
      // binary search over the (sorted) keyrange_start offsets.
      int64_t start = 0, end = static_cast<int64_t>(keyrange_set_.size());
      while (start + 1 < end) {
        int64_t mid = start + (end - start) / 2;
        assert(mid >= 0 && mid < static_cast<int64_t>(keyrange_set_.size()));
        if (keyrange_rand < keyrange_set_[mid].keyrange_start) {
          end = mid;
        } else {
          start = mid;
        }
      }
      int64_t keyrange_id = start;

      // Select one key in the key-range and compose the keyID
      int64_t key_offset = 0, key_seed;
      if (key_dist_a == 0.0 || key_dist_b == 0.0) {
        // No power distribution configured: uniform offset inside the range.
        key_offset = ini_rand % keyrange_size_;
      } else {
        // Invert the power CDF y = a * x^b to derive a deterministic seed,
        // then draw the offset from that seed.
        double u =
            static_cast<double>(ini_rand % keyrange_size_) / keyrange_size_;
        key_seed = static_cast<int64_t>(
            ceil(std::pow((u / key_dist_a), (1 / key_dist_b))));
        Random64 rand_key(key_seed);
        key_offset = rand_key.Next() % keyrange_size_;
      }
      return keyrange_size_ * keyrange_id + key_offset;
    }
  };
6472 | ||
1e59de90 | 6473 | // The social graph workload mixed with Get, Put, Iterator queries. |
f67539c2 TL |
6474 | // The value size and iterator length follow Pareto distribution. |
6475 | // The overall key access follow power distribution. If user models the | |
6476 | // workload based on different key-ranges (or different prefixes), user | |
6477 | // can use two-term-exponential distribution to fit the workload. User | |
1e59de90 | 6478 | // needs to decide the ratio between Get, Put, Iterator queries before |
f67539c2 | 6479 | // starting the benchmark. |
  void MixGraph(ThreadState* thread) {
    // Per-operation counters reported in the final stats message.
    int64_t gets = 0;
    int64_t puts = 0;
    int64_t get_found = 0;
    int64_t seek = 0;
    int64_t seek_found = 0;
    int64_t bytes = 0;
    double total_scan_length = 0;
    double total_val_size = 0;
    // Hard cap for generated value sizes (may be lowered by the flag below).
    const int64_t default_value_max = 1 * 1024 * 1024;
    int64_t value_max = default_value_max;
    int64_t scan_len_max = FLAGS_mix_max_scan_len;
    double write_rate = 1000000.0;
    double read_rate = 1000000.0;
    bool use_prefix_modeling = false;
    bool use_random_modeling = false;
    GenerateTwoTermExpKeys gen_exp;
    std::vector<double> ratio{FLAGS_mix_get_ratio, FLAGS_mix_put_ratio,
                              FLAGS_mix_seek_ratio};
    // NOTE(review): 1 MiB scratch buffer lives on this thread's stack.
    char value_buffer[default_value_max];
    QueryDecider query;
    RandomGenerator gen;
    Status s;
    if (value_max > FLAGS_mix_max_value_size) {
      value_max = FLAGS_mix_max_value_size;
    }

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    PinnableSlice pinnable_val;
    query.Initiate(ratio);

    // the limit of qps initiation
    if (FLAGS_sine_mix_rate) {
      thread->shared->read_rate_limiter.reset(
          NewGenericRateLimiter(static_cast<int64_t>(read_rate)));
      thread->shared->write_rate_limiter.reset(
          NewGenericRateLimiter(static_cast<int64_t>(write_rate)));
    }

    // Decide if user wants to use prefix based key generation
    if (FLAGS_keyrange_dist_a != 0.0 || FLAGS_keyrange_dist_b != 0.0 ||
        FLAGS_keyrange_dist_c != 0.0 || FLAGS_keyrange_dist_d != 0.0) {
      use_prefix_modeling = true;
      // NOTE(review): returned Status is ignored; it is currently always OK.
      gen_exp.InitiateExpDistribution(
          FLAGS_num, FLAGS_keyrange_dist_a, FLAGS_keyrange_dist_b,
          FLAGS_keyrange_dist_c, FLAGS_keyrange_dist_d);
    }
    // With either power-distribution parameter unset, fall back to plain
    // uniform key selection.
    if (FLAGS_key_dist_a == 0 || FLAGS_key_dist_b == 0) {
      use_random_modeling = true;
    }

    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(1)) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      int64_t ini_rand, rand_v, key_rand, key_seed;
      ini_rand = GetRandomKey(&thread->rand);
      rand_v = ini_rand % FLAGS_num;
      double u = static_cast<double>(rand_v) / FLAGS_num;

      // Generate the keyID based on the key hotness and prefix hotness
      if (use_random_modeling) {
        key_rand = ini_rand;
      } else if (use_prefix_modeling) {
        key_rand =
            gen_exp.DistGetKeyID(ini_rand, FLAGS_key_dist_a, FLAGS_key_dist_b);
      } else {
        key_seed = PowerCdfInversion(u, FLAGS_key_dist_a, FLAGS_key_dist_b);
        Random64 rand(key_seed);
        key_rand = static_cast<int64_t>(rand.Next()) % FLAGS_num;
      }
      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
      // 0 = Get, 1 = Put, 2 = Seek (see the ratio vector above).
      int query_type = query.GetType(rand_v);

      // change the qps
      uint64_t now = FLAGS_env->NowMicros();
      uint64_t usecs_since_last;
      if (now > thread->stats.GetSineInterval()) {
        usecs_since_last = now - thread->stats.GetSineInterval();
      } else {
        usecs_since_last = 0;
      }

      // Periodically re-derive read/write rates from a noisy sine curve so
      // the benchmark QPS varies over time.
      if (FLAGS_sine_mix_rate &&
          usecs_since_last >
              (FLAGS_sine_mix_rate_interval_milliseconds * uint64_t{1000})) {
        double usecs_since_start =
            static_cast<double>(now - thread->stats.GetStart());
        thread->stats.ResetSineInterval();
        double mix_rate_with_noise = AddNoise(
            SineRate(usecs_since_start / 1000000.0), FLAGS_sine_mix_rate_noise);
        read_rate = mix_rate_with_noise * (query.ratio_[0] + query.ratio_[2]);
        write_rate = mix_rate_with_noise * query.ratio_[1];

        if (read_rate > 0) {
          thread->shared->read_rate_limiter->SetBytesPerSecond(
              static_cast<int64_t>(read_rate));
        }
        if (write_rate > 0) {
          thread->shared->write_rate_limiter->SetBytesPerSecond(
              static_cast<int64_t>(write_rate));
        }
      }
      // Start the query
      if (query_type == 0) {
        // the Get query
        gets++;
        if (FLAGS_num_column_families > 1) {
          s = db_with_cfh->db->Get(read_options_, db_with_cfh->GetCfh(key_rand),
                                   key, &pinnable_val);
        } else {
          pinnable_val.Reset();
          s = db_with_cfh->db->Get(read_options_,
                                   db_with_cfh->db->DefaultColumnFamily(), key,
                                   &pinnable_val);
        }

        if (s.ok()) {
          get_found++;
          bytes += key.size() + pinnable_val.size();
        } else if (!s.IsNotFound()) {
          fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
          abort();
        }

        // Rate-limit reads in batches of 100 ops to amortize limiter cost.
        if (thread->shared->read_rate_limiter && (gets + seek) % 100 == 0) {
          thread->shared->read_rate_limiter->Request(100, Env::IO_HIGH,
                                                     nullptr /*stats*/);
        }
        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
      } else if (query_type == 1) {
        // the Put query
        puts++;
        // Value size follows a Pareto distribution, clamped to [10,
        // value_max); oversized draws are wrapped via modulo rather than
        // clamped to the max.
        int64_t val_size = ParetoCdfInversion(u, FLAGS_value_theta,
                                              FLAGS_value_k, FLAGS_value_sigma);
        if (val_size < 10) {
          val_size = 10;
        } else if (val_size > value_max) {
          val_size = val_size % value_max;
        }
        total_val_size += val_size;

        s = db_with_cfh->db->Put(
            write_options_, key,
            gen.Generate(static_cast<unsigned int>(val_size)));
        if (!s.ok()) {
          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
          ErrorExit();
        }

        if (thread->shared->write_rate_limiter && puts % 100 == 0) {
          thread->shared->write_rate_limiter->Request(100, Env::IO_HIGH,
                                                      nullptr /*stats*/);
        }
        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kWrite);
      } else if (query_type == 2) {
        // Seek query
        if (db_with_cfh->db != nullptr) {
          Iterator* single_iter = nullptr;
          single_iter = db_with_cfh->db->NewIterator(read_options_);
          if (single_iter != nullptr) {
            single_iter->Seek(key);
            seek++;
            if (single_iter->Valid() && single_iter->key().compare(key) == 0) {
              seek_found++;
            }
            // Scan length also follows a Pareto distribution, wrapped into
            // [0, scan_len_max).
            int64_t scan_length =
                ParetoCdfInversion(u, FLAGS_iter_theta, FLAGS_iter_k,
                                   FLAGS_iter_sigma) %
                scan_len_max;
            for (int64_t j = 0; j < scan_length && single_iter->Valid(); j++) {
              // Copy the value out so the read is not optimized away.
              Slice value = single_iter->value();
              memcpy(value_buffer, value.data(),
                     std::min(value.size(), sizeof(value_buffer)));
              bytes += single_iter->key().size() + single_iter->value().size();
              single_iter->Next();
              assert(single_iter->status().ok());
              total_scan_length++;
            }
          }
          delete single_iter;
        }
        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kSeek);
      }
    }
    char msg[256];
    // NOTE(review): if no Put or Seek was issued, puts/seek is 0 and the
    // averages below print as nan/inf (double division, not UB).
    snprintf(msg, sizeof(msg),
             "( Gets:%" PRIu64 " Puts:%" PRIu64 " Seek:%" PRIu64
             ", reads %" PRIu64 " in %" PRIu64
             " found, "
             "avg size: %.1f value, %.1f scan)\n",
             gets, puts, seek, get_found + seek_found, gets + seek,
             total_val_size / puts, total_scan_length / seek);

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
6677 | ||
7c673cae FG |
6678 | void IteratorCreation(ThreadState* thread) { |
6679 | Duration duration(FLAGS_duration, reads_); | |
1e59de90 | 6680 | ReadOptions options = read_options_; |
20effc67 TL |
6681 | std::unique_ptr<char[]> ts_guard; |
6682 | if (user_timestamp_size_ > 0) { | |
6683 | ts_guard.reset(new char[user_timestamp_size_]); | |
6684 | } | |
7c673cae FG |
6685 | while (!duration.Done(1)) { |
6686 | DB* db = SelectDB(thread); | |
20effc67 TL |
6687 | Slice ts; |
6688 | if (user_timestamp_size_ > 0) { | |
6689 | ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get()); | |
6690 | options.timestamp = &ts; | |
6691 | } | |
7c673cae FG |
6692 | Iterator* iter = db->NewIterator(options); |
6693 | delete iter; | |
6694 | thread->stats.FinishedOps(nullptr, db, 1, kOthers); | |
6695 | } | |
6696 | } | |
6697 | ||
6698 | void IteratorCreationWhileWriting(ThreadState* thread) { | |
6699 | if (thread->tid > 0) { | |
6700 | IteratorCreation(thread); | |
6701 | } else { | |
6702 | BGWriter(thread, kWrite); | |
6703 | } | |
6704 | } | |
6705 | ||
  // Seeks to random keys, then advances the iterator up to FLAGS_seek_nexts
  // times (Prev when FLAGS_reverse_iterator). Supports tailing iterators,
  // scan-distance bounds, and auto-prefix mode.
  void SeekRandom(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t bytes = 0;
    ReadOptions options = read_options_;
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      // One read timestamp is chosen up front and reused for every seek.
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
      options.timestamp = &ts;
    }

    // Tailing iterators are created once (per DB) and reused across the
    // whole run; non-tailing mode creates a fresh iterator per seek.
    std::vector<Iterator*> tailing_iters;
    if (FLAGS_use_tailing_iterator) {
      if (db_.db != nullptr) {
        tailing_iters.push_back(db_.db->NewIterator(options));
      } else {
        for (const auto& db_with_cfh : multi_dbs_) {
          tailing_iters.push_back(db_with_cfh.db->NewIterator(options));
        }
      }
    }
    // Set after the tailing iterators are built, so it only affects the
    // per-seek iterators created below.
    options.auto_prefix_mode = FLAGS_auto_prefix_mode;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    std::unique_ptr<const char[]> upper_bound_key_guard;
    Slice upper_bound = AllocateKey(&upper_bound_key_guard);
    std::unique_ptr<const char[]> lower_bound_key_guard;
    Slice lower_bound = AllocateKey(&lower_bound_key_guard);

    Duration duration(FLAGS_duration, reads_);
    char value_buffer[256];
    while (!duration.Done(1)) {
      int64_t seek_pos = thread->rand.Next() % FLAGS_num;
      GenerateKeyFromIntForSeek(static_cast<uint64_t>(seek_pos), FLAGS_num,
                                &key);
      if (FLAGS_max_scan_distance != 0) {
        // Bound the scan to max_scan_distance keys from the seek position,
        // on the side the iteration direction will travel.
        if (FLAGS_reverse_iterator) {
          GenerateKeyFromInt(
              static_cast<uint64_t>(std::max(
                  static_cast<int64_t>(0), seek_pos - FLAGS_max_scan_distance)),
              FLAGS_num, &lower_bound);
          options.iterate_lower_bound = &lower_bound;
        } else {
          auto min_num =
              std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance);
          GenerateKeyFromInt(static_cast<uint64_t>(min_num), FLAGS_num,
                             &upper_bound);
          options.iterate_upper_bound = &upper_bound;
        }
      } else if (FLAGS_auto_prefix_mode && prefix_extractor_ &&
                 !FLAGS_reverse_iterator) {
        // Set upper bound to next prefix: copy the seek key's prefix and
        // increment its last byte in place.
        auto mutable_upper_bound = const_cast<char*>(upper_bound.data());
        std::memcpy(mutable_upper_bound, key.data(), prefix_size_);
        mutable_upper_bound[prefix_size_ - 1]++;
        upper_bound = Slice(upper_bound.data(), prefix_size_);
        options.iterate_upper_bound = &upper_bound;
      }

      // Pick an Iterator to use: random DB in multi-DB mode, otherwise DB 0.
      uint64_t db_idx_to_use =
          (db_.db == nullptr)
              ? (uint64_t{thread->rand.Next()} % multi_dbs_.size())
              : 0;
      std::unique_ptr<Iterator> single_iter;
      Iterator* iter_to_use;
      if (FLAGS_use_tailing_iterator) {
        iter_to_use = tailing_iters[db_idx_to_use];
      } else {
        if (db_.db != nullptr) {
          single_iter.reset(db_.db->NewIterator(options));
        } else {
          single_iter.reset(multi_dbs_[db_idx_to_use].db->NewIterator(options));
        }
        iter_to_use = single_iter.get();
      }

      iter_to_use->Seek(key);
      read++;
      if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) {
        found++;
      }

      for (int j = 0; j < FLAGS_seek_nexts && iter_to_use->Valid(); ++j) {
        // Copy out iterator's value to make sure we read them.
        Slice value = iter_to_use->value();
        memcpy(value_buffer, value.data(),
               std::min(value.size(), sizeof(value_buffer)));
        bytes += iter_to_use->key().size() + iter_to_use->value().size();

        if (!FLAGS_reverse_iterator) {
          iter_to_use->Next();
        } else {
          iter_to_use->Prev();
        }
        assert(iter_to_use->status().ok());
      }

      // Rate-limit in 256-op batches to amortize limiter overhead.
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          read % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
    }
    for (auto iter : tailing_iters) {
      delete iter;
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
             read);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
6826 | ||
6827 | void SeekRandomWhileWriting(ThreadState* thread) { | |
6828 | if (thread->tid > 0) { | |
6829 | SeekRandom(thread); | |
6830 | } else { | |
6831 | BGWriter(thread, kWrite); | |
6832 | } | |
6833 | } | |
6834 | ||
6835 | void SeekRandomWhileMerging(ThreadState* thread) { | |
6836 | if (thread->tid > 0) { | |
6837 | SeekRandom(thread); | |
6838 | } else { | |
6839 | BGWriter(thread, kMerge); | |
6840 | } | |
6841 | } | |
6842 | ||
  // Issues deletes in batches of entries_per_batch_. With seq=true keys are
  // deleted in increasing order; otherwise keys are chosen at random from
  // [0, FLAGS_num).
  void DoDelete(ThreadState* thread, bool seq) {
    WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
                     FLAGS_write_batch_protection_bytes_per_key,
                     user_timestamp_size_);
    // Sequential mode is bounded purely by deletes_ (duration 0).
    Duration duration(seq ? 0 : FLAGS_duration, deletes_);
    int64_t i = 0;
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

    while (!duration.Done(entries_per_batch_)) {
      DB* db = SelectDB(thread);
      // The batch object is reused across iterations; clear it first.
      batch.Clear();
      for (int64_t j = 0; j < entries_per_batch_; ++j) {
        const int64_t k = seq ? i + j : (thread->rand.Next() % FLAGS_num);
        GenerateKeyFromInt(k, FLAGS_num, &key);
        batch.Delete(key);
      }
      Status s;
      if (user_timestamp_size_ > 0) {
        // Stamp every entry in the batch with a fresh timestamp.
        ts = mock_app_clock_->Allocate(ts_guard.get());
        s = batch.UpdateTimestamps(
            ts, [this](uint32_t) { return user_timestamp_size_; });
        if (!s.ok()) {
          fprintf(stderr, "assign timestamp: %s\n", s.ToString().c_str());
          ErrorExit();
        }
      }
      s = db->Write(write_options_, &batch);
      // Ops are recorded before the write status is checked below.
      thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kDelete);
      if (!s.ok()) {
        fprintf(stderr, "del error: %s\n", s.ToString().c_str());
        exit(1);
      }
      i += entries_per_batch_;
    }
  }
6884 | ||
1e59de90 | 6885 | void DeleteSeq(ThreadState* thread) { DoDelete(thread, true); } |
7c673cae | 6886 | |
1e59de90 | 6887 | void DeleteRandom(ThreadState* thread) { DoDelete(thread, false); } |
7c673cae FG |
6888 | |
6889 | void ReadWhileWriting(ThreadState* thread) { | |
6890 | if (thread->tid > 0) { | |
6891 | ReadRandom(thread); | |
6892 | } else { | |
6893 | BGWriter(thread, kWrite); | |
6894 | } | |
6895 | } | |
6896 | ||
1e59de90 TL |
6897 | void MultiReadWhileWriting(ThreadState* thread) { |
6898 | if (thread->tid > 0) { | |
6899 | MultiReadRandom(thread); | |
6900 | } else { | |
6901 | BGWriter(thread, kWrite); | |
6902 | } | |
6903 | } | |
6904 | ||
7c673cae FG |
6905 | void ReadWhileMerging(ThreadState* thread) { |
6906 | if (thread->tid > 0) { | |
6907 | ReadRandom(thread); | |
6908 | } else { | |
6909 | BGWriter(thread, kMerge); | |
6910 | } | |
6911 | } | |
6912 | ||
  // Background writer used by the *WhileWriting/*WhileMerging benchmarks.
  // write_merge selects Put vs Merge. Periodically injects range tombstones
  // (or their expanded point-delete equivalents) based on the
  // writes_per_range_tombstone_ cadence.
  void BGWriter(ThreadState* thread, enum OperationType write_merge) {
    // Special thread that keeps writing until other threads are done.
    RandomGenerator gen;
    int64_t bytes = 0;

    std::unique_ptr<RateLimiter> write_rate_limiter;
    if (FLAGS_benchmark_write_rate_limit > 0) {
      write_rate_limiter.reset(
          NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
    }

    // Don't merge stats from this thread with the readers.
    thread->stats.SetExcludeFromMerge();

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    std::unique_ptr<const char[]> begin_key_guard;
    Slice begin_key = AllocateKey(&begin_key_guard);
    std::unique_ptr<const char[]> end_key_guard;
    Slice end_key = AllocateKey(&end_key_guard);
    uint64_t num_range_deletions = 0;
    std::vector<std::unique_ptr<const char[]>> expanded_key_guards;
    std::vector<Slice> expanded_keys;
    if (FLAGS_expand_range_tombstones) {
      // Pre-allocate one key buffer per point delete inside a tombstone span.
      expanded_key_guards.resize(range_tombstone_width_);
      for (auto& expanded_key_guard : expanded_key_guards) {
        expanded_keys.emplace_back(AllocateKey(&expanded_key_guard));
      }
    }
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    uint32_t written = 0;
    bool hint_printed = false;

    while (true) {
      DB* db = SelectDB(thread);
      {
        // Check shutdown conditions under the shared mutex.
        MutexLock l(&thread->shared->mu);
        if (FLAGS_finish_after_writes && written == writes_) {
          fprintf(stderr, "Exiting the writer after %u writes...\n", written);
          break;
        }
        if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
          // Other threads have finished
          if (FLAGS_finish_after_writes) {
            // Wait for the writes to be finished
            if (!hint_printed) {
              fprintf(stderr, "Reads are finished. Have %d more writes to do\n",
                      static_cast<int>(writes_) - written);
              hint_printed = true;
            }
          } else {
            // Finish the write immediately
            break;
          }
        }
      }

      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      Status s;

      Slice val = gen.Generate();
      Slice ts;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->Allocate(ts_guard.get());
      }
      if (write_merge == kWrite) {
        if (user_timestamp_size_ == 0) {
          s = db->Put(write_options_, key, val);
        } else {
          s = db->Put(write_options_, key, ts, val);
        }
      } else {
        s = db->Merge(write_options_, key, val);
      }
      // Restore write_options_
      written++;

      if (!s.ok()) {
        fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes += key.size() + val.size() + user_timestamp_size_;
      thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);

      if (FLAGS_benchmark_write_rate_limit > 0) {
        write_rate_limiter->Request(key.size() + val.size(), Env::IO_HIGH,
                                    nullptr /* stats */,
                                    RateLimiter::OpType::kWrite);
      }

      // Emit a range tombstone every writes_per_range_tombstone_ writes
      // (after a warm-up of writes_before_delete_range_), up to
      // max_num_range_tombstones_ tombstones in total.
      if (writes_per_range_tombstone_ > 0 &&
          written > writes_before_delete_range_ &&
          (written - writes_before_delete_range_) /
                  writes_per_range_tombstone_ <=
              max_num_range_tombstones_ &&
          (written - writes_before_delete_range_) %
                  writes_per_range_tombstone_ ==
              0) {
        num_range_deletions++;
        int64_t begin_num = thread->rand.Next() % FLAGS_num;
        if (FLAGS_expand_range_tombstones) {
          // Emit the span as individual point deletes instead of one
          // DeleteRange.
          for (int64_t offset = 0; offset < range_tombstone_width_; ++offset) {
            GenerateKeyFromInt(begin_num + offset, FLAGS_num,
                               &expanded_keys[offset]);
            if (!db->Delete(write_options_, expanded_keys[offset]).ok()) {
              fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
              exit(1);
            }
          }
        } else {
          GenerateKeyFromInt(begin_num, FLAGS_num, &begin_key);
          GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num,
                             &end_key);
          if (!db->DeleteRange(write_options_, db->DefaultColumnFamily(),
                               begin_key, end_key)
                   .ok()) {
            fprintf(stderr, "deleterange error: %s\n", s.ToString().c_str());
            exit(1);
          }
        }
        thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
        // TODO: DeleteRange is not included in calculation of bytes/rate
        // limiter request
      }
    }
    if (num_range_deletions > 0) {
      std::cout << "Number of range deletions: " << num_range_deletions
                << std::endl;
    }
    thread->stats.AddBytes(bytes);
  }
7047 | ||
11fdf7f2 TL |
7048 | void ReadWhileScanning(ThreadState* thread) { |
7049 | if (thread->tid > 0) { | |
7050 | ReadRandom(thread); | |
7051 | } else { | |
7052 | BGScan(thread); | |
7053 | } | |
7054 | } | |
7055 | ||
  // Continuously walks a single iterator forward, wrapping back to the first
  // key when it runs off the end. Single-DB only.
  void BGScan(ThreadState* thread) {
    if (FLAGS_num_multi_db > 0) {
      fprintf(stderr, "Not supporting multiple DBs.\n");
      abort();
    }
    assert(db_.db != nullptr);
    ReadOptions read_options = read_options_;
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
      read_options.timestamp = &ts;
    }
    // One iterator is reused for the entire run.
    Iterator* iter = db_.db->NewIterator(read_options);

    fprintf(stderr, "num reads to do %" PRIu64 "\n", reads_);
    Duration duration(FLAGS_duration, reads_);
    uint64_t num_seek_to_first = 0;
    uint64_t num_next = 0;
    while (!duration.Done(1)) {
      if (!iter->Valid()) {
        // Invalid means exhausted (or error, checked below next round):
        // wrap around to the smallest key.
        iter->SeekToFirst();
        num_seek_to_first++;
      } else if (!iter->status().ok()) {
        fprintf(stderr, "Iterator error: %s\n",
                iter->status().ToString().c_str());
        abort();
      } else {
        iter->Next();
        num_next++;
      }

      thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
    }
    delete iter;
  }
7093 | ||
7c673cae FG |
  // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
  // in DB atomically i.e in a single batch. Also refer GetMany.
  // Returns the Status of the batched Write.
  Status PutMany(DB* db, const WriteOptions& writeoptions, const Slice& key,
                 const Slice& value) {
    // Suffixes are applied in a scrambled order ("2","1","0") on purpose, so
    // GetMany's snapshot check cannot rely on insertion order.
    std::string suffixes[3] = {"2", "1", "0"};
    std::string keys[3];

    WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
                     FLAGS_write_batch_protection_bytes_per_key,
                     user_timestamp_size_);
    Status s;
    for (int i = 0; i < 3; i++) {
      keys[i] = key.ToString() + suffixes[i];
      batch.Put(keys[i], value);
    }

    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      // Stamp all three entries with the same freshly-allocated timestamp.
      ts_guard.reset(new char[user_timestamp_size_]);
      Slice ts = mock_app_clock_->Allocate(ts_guard.get());
      s = batch.UpdateTimestamps(
          ts, [this](uint32_t) { return user_timestamp_size_; });
      if (!s.ok()) {
        fprintf(stderr, "assign timestamp to batch: %s\n",
                s.ToString().c_str());
        ErrorExit();
      }
    }

    s = db->Write(writeoptions, &batch);
    return s;
  }
7126 | ||
7c673cae FG |
  // Given a key K, this deletes (K+"0", V), (K+"1", V), (K+"2", V)
  // in DB atomically i.e in a single batch. Also refer GetMany.
  // Returns the Status of the batched Write.
  Status DeleteMany(DB* db, const WriteOptions& writeoptions,
                    const Slice& key) {
    // Scrambled suffix order, mirroring PutMany.
    std::string suffixes[3] = {"1", "2", "0"};
    std::string keys[3];

    WriteBatch batch(0, 0, FLAGS_write_batch_protection_bytes_per_key,
                     user_timestamp_size_);
    Status s;
    for (int i = 0; i < 3; i++) {
      keys[i] = key.ToString() + suffixes[i];
      batch.Delete(keys[i]);
    }

    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      // Stamp all three deletes with the same freshly-allocated timestamp.
      ts_guard.reset(new char[user_timestamp_size_]);
      Slice ts = mock_app_clock_->Allocate(ts_guard.get());
      s = batch.UpdateTimestamps(
          ts, [this](uint32_t) { return user_timestamp_size_; });
      if (!s.ok()) {
        fprintf(stderr, "assign timestamp to batch: %s\n",
                s.ToString().c_str());
        ErrorExit();
      }
    }

    s = db->Write(writeoptions, &batch);
    return s;
  }
7158 | ||
  // Given a key K and value V, this gets values for K+"0", K+"1" and K+"2"
  // in the same snapshot, and verifies that all the values are identical.
  // ASSUMES that PutMany was used to put (K, V) into the DB.
  //
  // On mismatch or per-key Get error it logs to stderr and keeps going so a
  // run can surface multiple inconsistencies; the returned Status is that of
  // the LAST Get performed (so an early error can be masked by a later OK).
  Status GetMany(DB* db, const Slice& key, std::string* value) {
    std::string suffixes[3] = {"0", "1", "2"};
    std::string keys[3];
    Slice key_slices[3];
    std::string values[3];
    ReadOptions readoptionscopy = read_options_;

    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      // One timestamp for all three reads so they see the same point in
      // (mock) time.
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->Allocate(ts_guard.get());
      readoptionscopy.timestamp = &ts;
    }

    // A shared snapshot guarantees the three reads observe a consistent view
    // even while other threads are writing.
    readoptionscopy.snapshot = db->GetSnapshot();
    Status s;
    for (int i = 0; i < 3; i++) {
      keys[i] = key.ToString() + suffixes[i];
      key_slices[i] = keys[i];
      s = db->Get(readoptionscopy, key_slices[i], value);
      if (!s.ok() && !s.IsNotFound()) {
        fprintf(stderr, "get error: %s\n", s.ToString().c_str());
        values[i] = "";
        // we continue after error rather than exiting so that we can
        // find more errors if any
      } else if (s.IsNotFound()) {
        // Missing entries compare as empty; three NotFounds are consistent.
        values[i] = "";
      } else {
        values[i] = *value;
      }
    }
    db->ReleaseSnapshot(readoptionscopy.snapshot);

    if ((values[0] != values[1]) || (values[1] != values[2])) {
      fprintf(stderr, "inconsistent values for key %s: %s, %s, %s\n",
              key.ToString().c_str(), values[0].c_str(), values[1].c_str(),
              values[2].c_str());
      // we continue after error rather than exiting so that we can
      // find more errors if any
    }

    return s;
  }
7206 | ||
  // Differs from readrandomwriterandom in the following ways:
  // (a) Uses GetMany/PutMany to read/write key values. Refer to those funcs.
  // (b) Does deletes as well (per FLAGS_deletepercent)
  // (c) In order to achieve high % of 'found' during lookups, and to do
  // multiple writes (including puts and deletes) it uses upto
  // FLAGS_numdistinct distinct keys instead of FLAGS_num distinct keys.
  // (d) Does not have a MultiGet option.
  void RandomWithVerify(ThreadState* thread) {
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    // Per-batch remaining op budgets: each 100-op batch does all its gets,
    // then all its puts, then all its deletes, per the weights below.
    int get_weight = 0;
    int put_weight = 0;
    int delete_weight = 0;
    int64_t gets_done = 0;
    int64_t puts_done = 0;
    int64_t deletes_done = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    // the number of iterations is the larger of read_ or write_
    for (int64_t i = 0; i < readwrites_; i++) {
      DB* db = SelectDB(thread);
      if (get_weight == 0 && put_weight == 0 && delete_weight == 0) {
        // one batch completed, reinitialize for next batch
        get_weight = FLAGS_readwritepercent;
        delete_weight = FLAGS_deletepercent;
        put_weight = 100 - get_weight - delete_weight;
      }
      // Small key domain (FLAGS_numdistinct) keeps the 'found' ratio high.
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_numdistinct,
                         FLAGS_numdistinct, &key);
      if (get_weight > 0) {
        // do all the gets first
        Status s = GetMany(db, key, &value);
        if (!s.ok() && !s.IsNotFound()) {
          fprintf(stderr, "getmany error: %s\n", s.ToString().c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          found++;
        }
        get_weight--;
        gets_done++;
        thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
      } else if (put_weight > 0) {
        // then do all the corresponding number of puts
        // for all the gets we have done earlier
        Status s = PutMany(db, write_options_, key, gen.Generate());
        if (!s.ok()) {
          fprintf(stderr, "putmany error: %s\n", s.ToString().c_str());
          exit(1);
        }
        put_weight--;
        puts_done++;
        thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
      } else if (delete_weight > 0) {
        Status s = DeleteMany(db, write_options_, key);
        if (!s.ok()) {
          fprintf(stderr, "deletemany error: %s\n", s.ToString().c_str());
          exit(1);
        }
        delete_weight--;
        deletes_done++;
        thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
      }
    }
    char msg[128];
    snprintf(msg, sizeof(msg),
             "( get:%" PRIu64 " put:%" PRIu64 " del:%" PRIu64 " total:%" PRIu64
             " found:%" PRIu64 ")",
             gets_done, puts_done, deletes_done, readwrites_, found);
    thread->stats.AddMessage(msg);
  }
7281 | ||
  // This is different from ReadWhileWriting because it does not use
  // an extra thread.
  //
  // Interleaves Gets and Puts on random keys in batches of 100 ops:
  // FLAGS_readwritepercent gets first, then the remaining puts.
  void ReadRandomWriteRandom(ThreadState* thread) {
    ReadOptions options = read_options_;
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    // Per-batch remaining op budgets; refilled when both hit zero.
    int get_weight = 0;
    int put_weight = 0;
    int64_t reads_done = 0;
    int64_t writes_done = 0;
    Duration duration(FLAGS_duration, readwrites_);

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    // Scratch buffer reused for every user-defined timestamp.
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

    // the number of iterations is the larger of read_ or write_
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      if (get_weight == 0 && put_weight == 0) {
        // one batch completed, reinitialize for next batch
        get_weight = FLAGS_readwritepercent;
        put_weight = 100 - get_weight;
      }
      if (get_weight > 0) {
        // do all the gets first
        Slice ts;
        if (user_timestamp_size_ > 0) {
          ts = mock_app_clock_->GetTimestampForRead(thread->rand,
                                                    ts_guard.get());
          options.timestamp = &ts;
        }
        Status s = db->Get(options, key, &value);
        if (!s.ok() && !s.IsNotFound()) {
          fprintf(stderr, "get error: %s\n", s.ToString().c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          found++;
        }
        get_weight--;
        reads_done++;
        thread->stats.FinishedOps(nullptr, db, 1, kRead);
      } else if (put_weight > 0) {
        // then do all the corresponding number of puts
        // for all the gets we have done earlier
        Status s;
        if (user_timestamp_size_ > 0) {
          // Writes always use a freshly allocated (newest) timestamp.
          Slice ts = mock_app_clock_->Allocate(ts_guard.get());
          s = db->Put(write_options_, key, ts, gen.Generate());
        } else {
          s = db->Put(write_options_, key, gen.Generate());
        }
        if (!s.ok()) {
          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
          ErrorExit();
        }
        put_weight--;
        writes_done++;
        thread->stats.FinishedOps(nullptr, db, 1, kWrite);
      }
    }
    char msg[100];
    snprintf(msg, sizeof(msg),
             "( reads:%" PRIu64 " writes:%" PRIu64 " total:%" PRIu64
             " found:%" PRIu64 ")",
             reads_done, writes_done, readwrites_, found);
    thread->stats.AddMessage(msg);
  }
7357 | ||
  //
  // Read-modify-write for random keys
  void UpdateRandom(ThreadState* thread) {
    ReadOptions options = read_options_;
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int64_t bytes = 0;
    Duration duration(FLAGS_duration, readwrites_);

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    // the number of iterations is the larger of read_ or write_
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      Slice ts;
      if (user_timestamp_size_ > 0) {
        // Read with newest timestamp because we are doing rmw.
        ts = mock_app_clock_->Allocate(ts_guard.get());
        options.timestamp = &ts;
      }

      auto status = db->Get(options, key, &value);
      if (status.ok()) {
        ++found;
        bytes += key.size() + value.size() + user_timestamp_size_;
      } else if (!status.IsNotFound()) {
        // Any error other than NotFound is fatal for this benchmark.
        fprintf(stderr, "Get returned an error: %s\n",
                status.ToString().c_str());
        abort();
      }

      // Throttle before the write half of the rmw, sized by the data just
      // read (value is empty if the key was not found).
      if (thread->shared->write_rate_limiter) {
        thread->shared->write_rate_limiter->Request(
            key.size() + value.size(), Env::IO_HIGH, nullptr /*stats*/,
            RateLimiter::OpType::kWrite);
      }

      Slice val = gen.Generate();
      Status s;
      if (user_timestamp_size_ > 0) {
        // Allocate a second, newer timestamp for the write-back.
        ts = mock_app_clock_->Allocate(ts_guard.get());
        s = db->Put(write_options_, key, ts, val);
      } else {
        s = db->Put(write_options_, key, val);
      }
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes += key.size() + val.size() + user_timestamp_size_;
      thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
    }
    char msg[100];
    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
             readwrites_, found);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
7422 | ||
11fdf7f2 TL |
7423 | // Read-XOR-write for random keys. Xors the existing value with a randomly |
7424 | // generated value, and stores the result. Assuming A in the array of bytes | |
7425 | // representing the existing value, we generate an array B of the same size, | |
7426 | // then compute C = A^B as C[i]=A[i]^B[i], and store C | |
7427 | void XORUpdateRandom(ThreadState* thread) { | |
1e59de90 | 7428 | ReadOptions options = read_options_; |
11fdf7f2 TL |
7429 | RandomGenerator gen; |
7430 | std::string existing_value; | |
7431 | int64_t found = 0; | |
7432 | Duration duration(FLAGS_duration, readwrites_); | |
7433 | ||
7434 | BytesXOROperator xor_operator; | |
7435 | ||
7436 | std::unique_ptr<const char[]> key_guard; | |
7437 | Slice key = AllocateKey(&key_guard); | |
20effc67 TL |
7438 | std::unique_ptr<char[]> ts_guard; |
7439 | if (user_timestamp_size_ > 0) { | |
7440 | ts_guard.reset(new char[user_timestamp_size_]); | |
7441 | } | |
11fdf7f2 TL |
7442 | // the number of iterations is the larger of read_ or write_ |
7443 | while (!duration.Done(1)) { | |
7444 | DB* db = SelectDB(thread); | |
7445 | GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); | |
20effc67 TL |
7446 | Slice ts; |
7447 | if (user_timestamp_size_ > 0) { | |
7448 | ts = mock_app_clock_->Allocate(ts_guard.get()); | |
7449 | options.timestamp = &ts; | |
7450 | } | |
11fdf7f2 TL |
7451 | |
7452 | auto status = db->Get(options, key, &existing_value); | |
7453 | if (status.ok()) { | |
7454 | ++found; | |
7455 | } else if (!status.IsNotFound()) { | |
7456 | fprintf(stderr, "Get returned an error: %s\n", | |
7457 | status.ToString().c_str()); | |
7458 | exit(1); | |
7459 | } | |
7460 | ||
1e59de90 TL |
7461 | Slice value = |
7462 | gen.Generate(static_cast<unsigned int>(existing_value.size())); | |
11fdf7f2 TL |
7463 | std::string new_value; |
7464 | ||
7465 | if (status.ok()) { | |
7466 | Slice existing_value_slice = Slice(existing_value); | |
7467 | xor_operator.XOR(&existing_value_slice, value, &new_value); | |
7468 | } else { | |
7469 | xor_operator.XOR(nullptr, value, &new_value); | |
7470 | } | |
7471 | ||
1e59de90 | 7472 | Status s; |
20effc67 TL |
7473 | if (user_timestamp_size_ > 0) { |
7474 | ts = mock_app_clock_->Allocate(ts_guard.get()); | |
1e59de90 TL |
7475 | s = db->Put(write_options_, key, ts, Slice(new_value)); |
7476 | } else { | |
7477 | s = db->Put(write_options_, key, Slice(new_value)); | |
20effc67 | 7478 | } |
11fdf7f2 TL |
7479 | if (!s.ok()) { |
7480 | fprintf(stderr, "put error: %s\n", s.ToString().c_str()); | |
20effc67 | 7481 | ErrorExit(); |
11fdf7f2 TL |
7482 | } |
7483 | thread->stats.FinishedOps(nullptr, db, 1); | |
7484 | } | |
7485 | char msg[100]; | |
1e59de90 TL |
7486 | snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")", |
7487 | readwrites_, found); | |
11fdf7f2 TL |
7488 | thread->stats.AddMessage(msg); |
7489 | } | |
7490 | ||
7c673cae FG |
  // Read-modify-write for random keys.
  // Each operation causes the key grow by value_size (simulating an append).
  // Generally used for benchmarking against merges of similar type
  void AppendRandom(ThreadState* thread) {
    ReadOptions options = read_options_;
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int64_t bytes = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    // The number of iterations is the larger of read_ or write_
    Duration duration(FLAGS_duration, readwrites_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      Slice ts;
      if (user_timestamp_size_ > 0) {
        // Newest timestamp for the read half of the rmw.
        ts = mock_app_clock_->Allocate(ts_guard.get());
        options.timestamp = &ts;
      }

      auto status = db->Get(options, key, &value);
      if (status.ok()) {
        ++found;
        bytes += key.size() + value.size() + user_timestamp_size_;
      } else if (!status.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n",
                status.ToString().c_str());
        abort();
      } else {
        // If not existing, then just assume an empty string of data
        value.clear();
      }

      // Update the value (by appending data)
      Slice operand = gen.Generate();
      if (value.size() > 0) {
        // Use a delimiter to match the semantics for StringAppendOperator
        value.append(1, ',');
      }
      value.append(operand.data(), operand.size());

      Status s;
      if (user_timestamp_size_ > 0) {
        // Fresh (newer) timestamp for the write-back.
        ts = mock_app_clock_->Allocate(ts_guard.get());
        s = db->Put(write_options_, key, ts, value);
      } else {
        // Write back to the database
        s = db->Put(write_options_, key, value);
      }
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        ErrorExit();
      }
      bytes += key.size() + value.size() + user_timestamp_size_;
      thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
             readwrites_, found);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
7561 | ||
7562 | // Read-modify-write for random keys (using MergeOperator) | |
7563 | // The merge operator to use should be defined by FLAGS_merge_operator | |
7564 | // Adjust FLAGS_value_size so that the keys are reasonable for this operator | |
7565 | // Assumes that the merge operator is non-null (i.e.: is well-defined) | |
7566 | // | |
7567 | // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8 | |
7568 | // to simulate random additions over 64-bit integers using merge. | |
7569 | // | |
7570 | // The number of merges on the same key can be controlled by adjusting | |
7571 | // FLAGS_merge_keys. | |
7572 | void MergeRandom(ThreadState* thread) { | |
7573 | RandomGenerator gen; | |
7574 | int64_t bytes = 0; | |
7575 | std::unique_ptr<const char[]> key_guard; | |
7576 | Slice key = AllocateKey(&key_guard); | |
7577 | // The number of iterations is the larger of read_ or write_ | |
7578 | Duration duration(FLAGS_duration, readwrites_); | |
7579 | while (!duration.Done(1)) { | |
11fdf7f2 TL |
7580 | DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread); |
7581 | int64_t key_rand = thread->rand.Next() % merge_keys_; | |
7582 | GenerateKeyFromInt(key_rand, merge_keys_, &key); | |
7c673cae | 7583 | |
11fdf7f2 | 7584 | Status s; |
f67539c2 | 7585 | Slice val = gen.Generate(); |
11fdf7f2 TL |
7586 | if (FLAGS_num_column_families > 1) { |
7587 | s = db_with_cfh->db->Merge(write_options_, | |
1e59de90 | 7588 | db_with_cfh->GetCfh(key_rand), key, val); |
11fdf7f2 | 7589 | } else { |
1e59de90 TL |
7590 | s = db_with_cfh->db->Merge( |
7591 | write_options_, db_with_cfh->db->DefaultColumnFamily(), key, val); | |
11fdf7f2 | 7592 | } |
7c673cae FG |
7593 | |
7594 | if (!s.ok()) { | |
7595 | fprintf(stderr, "merge error: %s\n", s.ToString().c_str()); | |
7596 | exit(1); | |
7597 | } | |
f67539c2 | 7598 | bytes += key.size() + val.size(); |
11fdf7f2 | 7599 | thread->stats.FinishedOps(nullptr, db_with_cfh->db, 1, kMerge); |
7c673cae FG |
7600 | } |
7601 | ||
7602 | // Print some statistics | |
7603 | char msg[100]; | |
7604 | snprintf(msg, sizeof(msg), "( updates:%" PRIu64 ")", readwrites_); | |
7605 | thread->stats.AddBytes(bytes); | |
7606 | thread->stats.AddMessage(msg); | |
7607 | } | |
7608 | ||
  // Read and merge random keys. The amount of reads and merges are controlled
  // by adjusting FLAGS_num and FLAGS_mergereadpercent. The number of distinct
  // keys (and thus also the number of reads and merges on the same key) can be
  // adjusted with FLAGS_merge_keys.
  //
  // As with MergeRandom, the merge operator to use should be defined by
  // FLAGS_merge_operator.
  void ReadRandomMergeRandom(ThreadState* thread) {
    RandomGenerator gen;
    std::string value;
    int64_t num_hits = 0;
    int64_t num_gets = 0;
    int64_t num_merges = 0;
    // Largest merged value observed across all reads, for the final report.
    size_t max_length = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    // the number of iterations is the larger of read_ or write_
    Duration duration(FLAGS_duration, readwrites_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);

      // FLAGS_mergereadpercent of the ops are merges, the rest are gets.
      bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent;

      if (do_merge) {
        Status s = db->Merge(write_options_, key, gen.Generate());
        if (!s.ok()) {
          fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
          exit(1);
        }
        num_merges++;
        thread->stats.FinishedOps(nullptr, db, 1, kMerge);
      } else {
        Status s = db->Get(read_options_, key, &value);
        if (value.length() > max_length) max_length = value.length();

        if (!s.ok() && !s.IsNotFound()) {
          fprintf(stderr, "get error: %s\n", s.ToString().c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          num_hits++;
        }
        num_gets++;
        thread->stats.FinishedOps(nullptr, db, 1, kRead);
      }
    }

    char msg[100];
    snprintf(msg, sizeof(msg),
             "(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64
             " hits:%" PRIu64 " maxlength:%" ROCKSDB_PRIszt ")",
             num_gets, num_merges, readwrites_, num_hits, max_length);
    thread->stats.AddMessage(msg);
  }
7665 | ||
  // Loads FLAGS_num keys sequentially, then walks them with an iterator:
  // Seek to each key, step through FLAGS_seek_nexts neighbors (Next, or Prev
  // when FLAGS_reverse_iterator is set), then re-Seek to the last position.
  // Iterator correctness is checked with asserts (debug builds only).
  void WriteSeqSeekSeq(ThreadState* thread) {
    writes_ = FLAGS_num;
    DoWrite(thread, SEQUENTIAL);
    // exclude writes from the ops/sec calculation
    thread->stats.Start(thread->tid);

    DB* db = SelectDB(thread);
    ReadOptions read_opts = read_options_;
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
      read_opts.timestamp = &ts;
    }
    std::unique_ptr<Iterator> iter(db->NewIterator(read_opts));

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    for (int64_t i = 0; i < FLAGS_num; ++i) {
      GenerateKeyFromInt(i, FLAGS_num, &key);
      iter->Seek(key);
      assert(iter->Valid() && iter->key() == key);
      thread->stats.FinishedOps(nullptr, db, 1, kSeek);

      // Step through up to FLAGS_seek_nexts adjacent keys; `i` advances in
      // lockstep so the outer loop does not revisit them.
      for (int j = 0; j < FLAGS_seek_nexts && i + 1 < FLAGS_num; ++j) {
        if (!FLAGS_reverse_iterator) {
          iter->Next();
        } else {
          iter->Prev();
        }
        GenerateKeyFromInt(++i, FLAGS_num, &key);
        assert(iter->Valid() && iter->key() == key);
        thread->stats.FinishedOps(nullptr, db, 1, kSeek);
      }

      iter->Seek(key);
      assert(iter->Valid() && iter->key() == key);
      thread->stats.FinishedOps(nullptr, db, 1, kSeek);
    }
  }
7707 | ||
f67539c2 TL |
7708 | bool binary_search(std::vector<int>& data, int start, int end, int key) { |
7709 | if (data.empty()) return false; | |
7710 | if (start > end) return false; | |
7711 | int mid = start + (end - start) / 2; | |
7712 | if (mid > static_cast<int>(data.size()) - 1) return false; | |
7713 | if (data[mid] == key) { | |
7714 | return true; | |
7715 | } else if (data[mid] > key) { | |
7716 | return binary_search(data, start, mid - 1, key); | |
7717 | } else { | |
7718 | return binary_search(data, mid + 1, end, key); | |
7719 | } | |
7720 | } | |
7721 | ||
  // Does a bunch of merge operations for a key(key1) where the merge operand
  // is a sorted list. Next performance comparison is done between doing a Get
  // for key1 followed by searching for another key(key2) in the large sorted
  // list vs calling GetMergeOperands for key1 and then searching for the key2
  // in all the sorted sub-lists. Later case is expected to be a lot faster.
  void GetMergeOperands(ThreadState* thread) {
    DB* db = SelectDB(thread);
    const int kTotalValues = 100000;
    const int kListSize = 100;
    std::string key = "my_key";
    std::string value;

    // Build ~kTotalValues/kListSize merge operands, each a comma-separated
    // run of kListSize-1 ascending integers.
    // NOTE(review): the Merge() status is ignored here — a failed merge
    // would silently skew the comparison below.
    for (int i = 1; i < kTotalValues; i++) {
      if (i % kListSize == 0) {
        // Remove trailing ','
        value.pop_back();
        db->Merge(WriteOptions(), key, value);
        value.clear();
      } else {
        value.append(std::to_string(i)).append(",");
      }
    }

    SortList s;
    std::vector<int> data;
    // This value can be experimented with and it will demonstrate the
    // perf difference between doing a Get and searching for lookup_key in the
    // resultant large sorted list vs doing GetMergeOperands and searching
    // for lookup_key within this resultant sorted sub-lists.
    int lookup_key = 1;

    // Get API call: one fully-merged value, searched as a single list.
    std::cout << "--- Get API call --- \n";
    PinnableSlice p_slice;
    uint64_t st = FLAGS_env->NowNanos();
    db->Get(ReadOptions(), db->DefaultColumnFamily(), key, &p_slice);
    s.MakeVector(data, p_slice);
    bool found =
        binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
    std::cout << "Found key? " << std::to_string(found) << "\n";
    uint64_t sp = FLAGS_env->NowNanos();
    std::cout << "Get: " << (sp - st) / 1000000000.0 << " seconds\n";
    std::string* dat_ = p_slice.GetSelf();
    std::cout << "Sample data from Get API call: " << dat_->substr(0, 10)
              << "\n";
    data.clear();

    // GetMergeOperands API call: unmerged operands, each searched
    // independently until the key is found.
    std::cout << "--- GetMergeOperands API --- \n";
    std::vector<PinnableSlice> a_slice((kTotalValues / kListSize) + 1);
    st = FLAGS_env->NowNanos();
    int number_of_operands = 0;
    GetMergeOperandsOptions get_merge_operands_options;
    get_merge_operands_options.expected_max_number_of_operands =
        (kTotalValues / 100) + 1;
    db->GetMergeOperands(ReadOptions(), db->DefaultColumnFamily(), key,
                         a_slice.data(), &get_merge_operands_options,
                         &number_of_operands);
    for (PinnableSlice& psl : a_slice) {
      s.MakeVector(data, psl);
      found =
          binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
      data.clear();
      if (found) break;
    }
    std::cout << "Found key? " << std::to_string(found) << "\n";
    sp = FLAGS_env->NowNanos();
    std::cout << "Get Merge operands: " << (sp - st) / 1000000000.0
              << " seconds \n";
    int to_print = 0;
    std::cout << "Sample data from GetMergeOperands API call: ";
    for (PinnableSlice& psl : a_slice) {
      std::cout << "List: " << to_print << " : " << *psl.GetSelf() << "\n";
      if (to_print++ > 2) break;
    }
  }
7798 | ||
7c673cae | 7799 | #ifndef ROCKSDB_LITE |
1e59de90 TL |
7800 | void VerifyChecksum(ThreadState* thread) { |
7801 | DB* db = SelectDB(thread); | |
7802 | ReadOptions ro; | |
7803 | ro.adaptive_readahead = FLAGS_adaptive_readahead; | |
7804 | ro.async_io = FLAGS_async_io; | |
7805 | ro.rate_limiter_priority = | |
7806 | FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL; | |
7807 | ro.readahead_size = FLAGS_readahead_size; | |
7808 | Status s = db->VerifyChecksum(ro); | |
7809 | if (!s.ok()) { | |
7810 | fprintf(stderr, "VerifyChecksum() failed: %s\n", s.ToString().c_str()); | |
7811 | exit(1); | |
7812 | } | |
7813 | } | |
7814 | ||
7815 | void VerifyFileChecksums(ThreadState* thread) { | |
7816 | DB* db = SelectDB(thread); | |
7817 | ReadOptions ro; | |
7818 | ro.adaptive_readahead = FLAGS_adaptive_readahead; | |
7819 | ro.async_io = FLAGS_async_io; | |
7820 | ro.rate_limiter_priority = | |
7821 | FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL; | |
7822 | ro.readahead_size = FLAGS_readahead_size; | |
7823 | Status s = db->VerifyFileChecksums(ro); | |
7824 | if (!s.ok()) { | |
7825 | fprintf(stderr, "VerifyFileChecksums() failed: %s\n", | |
7826 | s.ToString().c_str()); | |
7827 | exit(1); | |
7828 | } | |
7829 | } | |
7830 | ||
7c673cae FG |
  // This benchmark stress tests Transactions. For a given --duration (or
  // total number of --writes, a Transaction will perform a read-modify-write
  // to increment the value of a key in each of N(--transaction-sets) sets of
  // keys (where each set has --num keys). If --threads is set, this will be
  // done in parallel.
  //
  // To test transactions, use --transaction_db=true. Not setting this
  // parameter
  // will run the same benchmark without transactions.
  //
  // RandomTransactionVerify() will then validate the correctness of the results
  // by checking if the sum of all keys in each set is the same.
  void RandomTransaction(ThreadState* thread) {
    Duration duration(FLAGS_duration, readwrites_);
    uint16_t num_prefix_ranges = static_cast<uint16_t>(FLAGS_transaction_sets);
    uint64_t transactions_done = 0;

    // Prefixes are encoded in 4 characters, hence the 9999 upper bound.
    if (num_prefix_ranges == 0 || num_prefix_ranges > 9999) {
      fprintf(stderr, "invalid value for transaction_sets\n");
      abort();
    }

    TransactionOptions txn_options;
    txn_options.lock_timeout = FLAGS_transaction_lock_timeout;
    txn_options.set_snapshot = FLAGS_transaction_set_snapshot;

    RandomTransactionInserter inserter(&thread->rand, write_options_,
                                       read_options_, FLAGS_num,
                                       num_prefix_ranges);

    if (FLAGS_num_multi_db > 1) {
      fprintf(stderr,
              "Cannot run RandomTransaction benchmark with "
              "FLAGS_multi_db > 1.");
      abort();
    }

    while (!duration.Done(1)) {
      bool success;

      // RandomTransactionInserter will attempt to insert a key for each
      // # of FLAGS_transaction_sets
      if (FLAGS_optimistic_transaction_db) {
        success = inserter.OptimisticTransactionDBInsert(db_.opt_txn_db);
      } else if (FLAGS_transaction_db) {
        TransactionDB* txn_db = reinterpret_cast<TransactionDB*>(db_.db);
        success = inserter.TransactionDBInsert(txn_db, txn_options);
      } else {
        // Baseline: same workload as a plain WriteBatch, no transactions.
        success = inserter.DBInsert(db_.db);
      }

      if (!success) {
        fprintf(stderr, "Unexpected error: %s\n",
                inserter.GetLastStatus().ToString().c_str());
        abort();
      }

      thread->stats.FinishedOps(nullptr, db_.db, 1, kOthers);
      transactions_done++;
    }

    char msg[100];
    if (FLAGS_optimistic_transaction_db || FLAGS_transaction_db) {
      // Aborts (e.g. lock timeouts, validation conflicts) are expected under
      // contention and are reported alongside the commit count.
      snprintf(msg, sizeof(msg),
               "( transactions:%" PRIu64 " aborts:%" PRIu64 ")",
               transactions_done, inserter.GetFailureCount());
    } else {
      snprintf(msg, sizeof(msg), "( batches:%" PRIu64 " )", transactions_done);
    }
    thread->stats.AddMessage(msg);
    thread->stats.AddBytes(static_cast<int64_t>(inserter.GetBytesInserted()));
  }
7903 | ||
7904 | // Verifies consistency of data after RandomTransaction() has been run. | |
7905 | // Since each iteration of RandomTransaction() incremented a key in each set | |
7906 | // by the same value, the sum of the keys in each set should be the same. | |
7907 | void RandomTransactionVerify() { | |
7908 | if (!FLAGS_transaction_db && !FLAGS_optimistic_transaction_db) { | |
7909 | // transactions not used, nothing to verify. | |
7910 | return; | |
7911 | } | |
7912 | ||
1e59de90 TL |
7913 | Status s = RandomTransactionInserter::Verify( |
7914 | db_.db, static_cast<uint16_t>(FLAGS_transaction_sets)); | |
7c673cae FG |
7915 | |
7916 | if (s.ok()) { | |
7917 | fprintf(stdout, "RandomTransactionVerify Success.\n"); | |
7918 | } else { | |
7919 | fprintf(stdout, "RandomTransactionVerify FAILED!!\n"); | |
7920 | } | |
7921 | } | |
7922 | #endif // ROCKSDB_LITE | |
7923 | ||
7924 | // Writes and deletes random keys without overwriting keys. | |
7925 | // | |
7926 | // This benchmark is intended to partially replicate the behavior of MyRocks | |
7927 | // secondary indices: All data is stored in keys and updates happen by | |
7928 | // deleting the old version of the key and inserting the new version. | |
7929 | void RandomReplaceKeys(ThreadState* thread) { | |
7930 | std::unique_ptr<const char[]> key_guard; | |
7931 | Slice key = AllocateKey(&key_guard); | |
20effc67 TL |
7932 | std::unique_ptr<char[]> ts_guard; |
7933 | if (user_timestamp_size_ > 0) { | |
7934 | ts_guard.reset(new char[user_timestamp_size_]); | |
7935 | } | |
7c673cae FG |
7936 | std::vector<uint32_t> counters(FLAGS_numdistinct, 0); |
7937 | size_t max_counter = 50; | |
7938 | RandomGenerator gen; | |
7939 | ||
7940 | Status s; | |
7941 | DB* db = SelectDB(thread); | |
7942 | for (int64_t i = 0; i < FLAGS_numdistinct; i++) { | |
7943 | GenerateKeyFromInt(i * max_counter, FLAGS_num, &key); | |
20effc67 | 7944 | if (user_timestamp_size_ > 0) { |
1e59de90 TL |
7945 | Slice ts = mock_app_clock_->Allocate(ts_guard.get()); |
7946 | s = db->Put(write_options_, key, ts, gen.Generate()); | |
7947 | } else { | |
7948 | s = db->Put(write_options_, key, gen.Generate()); | |
20effc67 | 7949 | } |
7c673cae FG |
7950 | if (!s.ok()) { |
7951 | fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str()); | |
7952 | exit(1); | |
7953 | } | |
7954 | } | |
7955 | ||
7956 | db->GetSnapshot(); | |
7957 | ||
7958 | std::default_random_engine generator; | |
7959 | std::normal_distribution<double> distribution(FLAGS_numdistinct / 2.0, | |
7960 | FLAGS_stddev); | |
7961 | Duration duration(FLAGS_duration, FLAGS_num); | |
7962 | while (!duration.Done(1)) { | |
7963 | int64_t rnd_id = static_cast<int64_t>(distribution(generator)); | |
7964 | int64_t key_id = std::max(std::min(FLAGS_numdistinct - 1, rnd_id), | |
7965 | static_cast<int64_t>(0)); | |
7966 | GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num, | |
7967 | &key); | |
20effc67 | 7968 | if (user_timestamp_size_ > 0) { |
1e59de90 TL |
7969 | Slice ts = mock_app_clock_->Allocate(ts_guard.get()); |
7970 | s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key, ts) | |
7971 | : db->Delete(write_options_, key, ts); | |
7972 | } else { | |
7973 | s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key) | |
7974 | : db->Delete(write_options_, key); | |
20effc67 | 7975 | } |
7c673cae FG |
7976 | if (s.ok()) { |
7977 | counters[key_id] = (counters[key_id] + 1) % max_counter; | |
7978 | GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num, | |
7979 | &key); | |
20effc67 | 7980 | if (user_timestamp_size_ > 0) { |
1e59de90 TL |
7981 | Slice ts = mock_app_clock_->Allocate(ts_guard.get()); |
7982 | s = db->Put(write_options_, key, ts, Slice()); | |
7983 | } else { | |
7984 | s = db->Put(write_options_, key, Slice()); | |
20effc67 | 7985 | } |
7c673cae FG |
7986 | } |
7987 | ||
7988 | if (!s.ok()) { | |
7989 | fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str()); | |
7990 | exit(1); | |
7991 | } | |
7992 | ||
7993 | thread->stats.FinishedOps(nullptr, db, 1, kOthers); | |
7994 | } | |
7995 | ||
7996 | char msg[200]; | |
7997 | snprintf(msg, sizeof(msg), | |
7998 | "use single deletes: %d, " | |
7999 | "standard deviation: %lf\n", | |
8000 | FLAGS_use_single_deletes, FLAGS_stddev); | |
8001 | thread->stats.AddMessage(msg); | |
8002 | } | |
8003 | ||
8004 | void TimeSeriesReadOrDelete(ThreadState* thread, bool do_deletion) { | |
7c673cae FG |
8005 | int64_t read = 0; |
8006 | int64_t found = 0; | |
8007 | int64_t bytes = 0; | |
8008 | ||
8009 | Iterator* iter = nullptr; | |
8010 | // Only work on single database | |
8011 | assert(db_.db != nullptr); | |
1e59de90 | 8012 | iter = db_.db->NewIterator(read_options_); |
7c673cae FG |
8013 | |
8014 | std::unique_ptr<const char[]> key_guard; | |
8015 | Slice key = AllocateKey(&key_guard); | |
8016 | ||
8017 | char value_buffer[256]; | |
8018 | while (true) { | |
8019 | { | |
8020 | MutexLock l(&thread->shared->mu); | |
8021 | if (thread->shared->num_done >= 1) { | |
8022 | // Write thread have finished | |
8023 | break; | |
8024 | } | |
8025 | } | |
8026 | if (!FLAGS_use_tailing_iterator) { | |
8027 | delete iter; | |
1e59de90 | 8028 | iter = db_.db->NewIterator(read_options_); |
7c673cae FG |
8029 | } |
8030 | // Pick a Iterator to use | |
8031 | ||
8032 | int64_t key_id = thread->rand.Next() % FLAGS_key_id_range; | |
8033 | GenerateKeyFromInt(key_id, FLAGS_num, &key); | |
8034 | // Reset last 8 bytes to 0 | |
8035 | char* start = const_cast<char*>(key.data()); | |
8036 | start += key.size() - 8; | |
8037 | memset(start, 0, 8); | |
8038 | ++read; | |
8039 | ||
8040 | bool key_found = false; | |
8041 | // Seek the prefix | |
8042 | for (iter->Seek(key); iter->Valid() && iter->key().starts_with(key); | |
8043 | iter->Next()) { | |
8044 | key_found = true; | |
8045 | // Copy out iterator's value to make sure we read them. | |
8046 | if (do_deletion) { | |
8047 | bytes += iter->key().size(); | |
8048 | if (KeyExpired(timestamp_emulator_.get(), iter->key())) { | |
8049 | thread->stats.FinishedOps(&db_, db_.db, 1, kDelete); | |
8050 | db_.db->Delete(write_options_, iter->key()); | |
8051 | } else { | |
8052 | break; | |
8053 | } | |
8054 | } else { | |
8055 | bytes += iter->key().size() + iter->value().size(); | |
8056 | thread->stats.FinishedOps(&db_, db_.db, 1, kRead); | |
8057 | Slice value = iter->value(); | |
8058 | memcpy(value_buffer, value.data(), | |
8059 | std::min(value.size(), sizeof(value_buffer))); | |
8060 | ||
8061 | assert(iter->status().ok()); | |
8062 | } | |
8063 | } | |
8064 | found += key_found; | |
8065 | ||
8066 | if (thread->shared->read_rate_limiter.get() != nullptr) { | |
11fdf7f2 TL |
8067 | thread->shared->read_rate_limiter->Request( |
8068 | 1, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead); | |
7c673cae FG |
8069 | } |
8070 | } | |
8071 | delete iter; | |
8072 | ||
8073 | char msg[100]; | |
8074 | snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found, | |
8075 | read); | |
8076 | thread->stats.AddBytes(bytes); | |
8077 | thread->stats.AddMessage(msg); | |
7c673cae FG |
8078 | } |
8079 | ||
8080 | void TimeSeriesWrite(ThreadState* thread) { | |
8081 | // Special thread that keeps writing until other threads are done. | |
8082 | RandomGenerator gen; | |
8083 | int64_t bytes = 0; | |
8084 | ||
8085 | // Don't merge stats from this thread with the readers. | |
8086 | thread->stats.SetExcludeFromMerge(); | |
8087 | ||
8088 | std::unique_ptr<RateLimiter> write_rate_limiter; | |
8089 | if (FLAGS_benchmark_write_rate_limit > 0) { | |
8090 | write_rate_limiter.reset( | |
8091 | NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit)); | |
8092 | } | |
8093 | ||
8094 | std::unique_ptr<const char[]> key_guard; | |
8095 | Slice key = AllocateKey(&key_guard); | |
8096 | ||
8097 | Duration duration(FLAGS_duration, writes_); | |
8098 | while (!duration.Done(1)) { | |
8099 | DB* db = SelectDB(thread); | |
8100 | ||
8101 | uint64_t key_id = thread->rand.Next() % FLAGS_key_id_range; | |
8102 | // Write key id | |
8103 | GenerateKeyFromInt(key_id, FLAGS_num, &key); | |
8104 | // Write timestamp | |
8105 | ||
8106 | char* start = const_cast<char*>(key.data()); | |
8107 | char* pos = start + 8; | |
8108 | int bytes_to_fill = | |
8109 | std::min(key_size_ - static_cast<int>(pos - start), 8); | |
8110 | uint64_t timestamp_value = timestamp_emulator_->Get(); | |
8111 | if (port::kLittleEndian) { | |
8112 | for (int i = 0; i < bytes_to_fill; ++i) { | |
8113 | pos[i] = (timestamp_value >> ((bytes_to_fill - i - 1) << 3)) & 0xFF; | |
8114 | } | |
8115 | } else { | |
8116 | memcpy(pos, static_cast<void*>(×tamp_value), bytes_to_fill); | |
8117 | } | |
8118 | ||
8119 | timestamp_emulator_->Inc(); | |
8120 | ||
8121 | Status s; | |
f67539c2 TL |
8122 | Slice val = gen.Generate(); |
8123 | s = db->Put(write_options_, key, val); | |
7c673cae FG |
8124 | |
8125 | if (!s.ok()) { | |
8126 | fprintf(stderr, "put error: %s\n", s.ToString().c_str()); | |
20effc67 | 8127 | ErrorExit(); |
7c673cae | 8128 | } |
f67539c2 | 8129 | bytes = key.size() + val.size(); |
7c673cae FG |
8130 | thread->stats.FinishedOps(&db_, db_.db, 1, kWrite); |
8131 | thread->stats.AddBytes(bytes); | |
8132 | ||
8133 | if (FLAGS_benchmark_write_rate_limit > 0) { | |
1e59de90 TL |
8134 | write_rate_limiter->Request(key.size() + val.size(), Env::IO_HIGH, |
8135 | nullptr /* stats */, | |
8136 | RateLimiter::OpType::kWrite); | |
7c673cae FG |
8137 | } |
8138 | } | |
8139 | } | |
8140 | ||
8141 | void TimeSeries(ThreadState* thread) { | |
8142 | if (thread->tid > 0) { | |
8143 | bool do_deletion = FLAGS_expire_style == "delete" && | |
8144 | thread->tid <= FLAGS_num_deletion_threads; | |
8145 | TimeSeriesReadOrDelete(thread, do_deletion); | |
8146 | } else { | |
8147 | TimeSeriesWrite(thread); | |
8148 | thread->stats.Stop(); | |
8149 | thread->stats.Report("timeseries write"); | |
8150 | } | |
8151 | } | |
8152 | ||
8153 | void Compact(ThreadState* thread) { | |
8154 | DB* db = SelectDB(thread); | |
11fdf7f2 | 8155 | CompactRangeOptions cro; |
f67539c2 TL |
8156 | cro.bottommost_level_compaction = |
8157 | BottommostLevelCompaction::kForceOptimized; | |
11fdf7f2 TL |
8158 | db->CompactRange(cro, nullptr, nullptr); |
8159 | } | |
8160 | ||
8161 | void CompactAll() { | |
8162 | if (db_.db != nullptr) { | |
8163 | db_.db->CompactRange(CompactRangeOptions(), nullptr, nullptr); | |
8164 | } | |
8165 | for (const auto& db_with_cfh : multi_dbs_) { | |
8166 | db_with_cfh.db->CompactRange(CompactRangeOptions(), nullptr, nullptr); | |
8167 | } | |
7c673cae FG |
8168 | } |
8169 | ||
1e59de90 TL |
8170 | #ifndef ROCKSDB_LITE |
8171 | void WaitForCompactionHelper(DBWithColumnFamilies& db) { | |
8172 | // This is an imperfect way of waiting for compaction. The loop and sleep | |
8173 | // is done because a thread that finishes a compaction job should get a | |
8174 | // chance to pickup a new compaction job. | |
8175 | ||
8176 | std::vector<std::string> keys = {DB::Properties::kMemTableFlushPending, | |
8177 | DB::Properties::kNumRunningFlushes, | |
8178 | DB::Properties::kCompactionPending, | |
8179 | DB::Properties::kNumRunningCompactions}; | |
8180 | ||
8181 | fprintf(stdout, "waitforcompaction(%s): started\n", | |
8182 | db.db->GetName().c_str()); | |
8183 | ||
8184 | while (true) { | |
8185 | bool retry = false; | |
8186 | ||
8187 | for (const auto& k : keys) { | |
8188 | uint64_t v; | |
8189 | if (!db.db->GetIntProperty(k, &v)) { | |
8190 | fprintf(stderr, "waitforcompaction(%s): GetIntProperty(%s) failed\n", | |
8191 | db.db->GetName().c_str(), k.c_str()); | |
8192 | exit(1); | |
8193 | } else if (v > 0) { | |
8194 | fprintf(stdout, | |
8195 | "waitforcompaction(%s): active(%s). Sleep 10 seconds\n", | |
8196 | db.db->GetName().c_str(), k.c_str()); | |
8197 | FLAGS_env->SleepForMicroseconds(10 * 1000000); | |
8198 | retry = true; | |
8199 | break; | |
8200 | } | |
8201 | } | |
8202 | ||
8203 | if (!retry) { | |
8204 | fprintf(stdout, "waitforcompaction(%s): finished\n", | |
8205 | db.db->GetName().c_str()); | |
8206 | return; | |
8207 | } | |
8208 | } | |
8209 | } | |
8210 | ||
8211 | void WaitForCompaction() { | |
8212 | // Give background threads a chance to wake | |
8213 | FLAGS_env->SleepForMicroseconds(5 * 1000000); | |
8214 | ||
8215 | // I am skeptical that this check race free. I hope that checking twice | |
8216 | // reduces the chance. | |
8217 | if (db_.db != nullptr) { | |
8218 | WaitForCompactionHelper(db_); | |
8219 | WaitForCompactionHelper(db_); | |
8220 | } else { | |
8221 | for (auto& db_with_cfh : multi_dbs_) { | |
8222 | WaitForCompactionHelper(db_with_cfh); | |
8223 | WaitForCompactionHelper(db_with_cfh); | |
8224 | } | |
8225 | } | |
8226 | } | |
8227 | ||
8228 | bool CompactLevelHelper(DBWithColumnFamilies& db_with_cfh, int from_level) { | |
8229 | std::vector<LiveFileMetaData> files; | |
8230 | db_with_cfh.db->GetLiveFilesMetaData(&files); | |
8231 | ||
8232 | assert(from_level == 0 || from_level == 1); | |
8233 | ||
8234 | int real_from_level = from_level; | |
8235 | if (real_from_level > 0) { | |
8236 | // With dynamic leveled compaction the first level with data beyond L0 | |
8237 | // might not be L1. | |
8238 | real_from_level = std::numeric_limits<int>::max(); | |
8239 | ||
8240 | for (auto& f : files) { | |
8241 | if (f.level > 0 && f.level < real_from_level) real_from_level = f.level; | |
8242 | } | |
8243 | ||
8244 | if (real_from_level == std::numeric_limits<int>::max()) { | |
8245 | fprintf(stdout, "compact%d found 0 files to compact\n", from_level); | |
8246 | return true; | |
8247 | } | |
8248 | } | |
8249 | ||
8250 | // The goal is to compact from from_level to the level that follows it, | |
8251 | // and with dynamic leveled compaction the next level might not be | |
8252 | // real_from_level+1 | |
8253 | int next_level = std::numeric_limits<int>::max(); | |
8254 | ||
8255 | std::vector<std::string> files_to_compact; | |
8256 | for (auto& f : files) { | |
8257 | if (f.level == real_from_level) | |
8258 | files_to_compact.push_back(f.name); | |
8259 | else if (f.level > real_from_level && f.level < next_level) | |
8260 | next_level = f.level; | |
8261 | } | |
8262 | ||
8263 | if (files_to_compact.empty()) { | |
8264 | fprintf(stdout, "compact%d found 0 files to compact\n", from_level); | |
8265 | return true; | |
8266 | } else if (next_level == std::numeric_limits<int>::max()) { | |
8267 | // There is no data beyond real_from_level. So we are done. | |
8268 | fprintf(stdout, "compact%d found no data beyond L%d\n", from_level, | |
8269 | real_from_level); | |
8270 | return true; | |
8271 | } | |
8272 | ||
8273 | fprintf(stdout, "compact%d found %d files to compact from L%d to L%d\n", | |
8274 | from_level, static_cast<int>(files_to_compact.size()), | |
8275 | real_from_level, next_level); | |
8276 | ||
8277 | ROCKSDB_NAMESPACE::CompactionOptions options; | |
8278 | // Lets RocksDB use the configured compression for this level | |
8279 | options.compression = ROCKSDB_NAMESPACE::kDisableCompressionOption; | |
8280 | ||
8281 | ROCKSDB_NAMESPACE::ColumnFamilyDescriptor cfDesc; | |
8282 | db_with_cfh.db->DefaultColumnFamily()->GetDescriptor(&cfDesc); | |
8283 | options.output_file_size_limit = cfDesc.options.target_file_size_base; | |
8284 | ||
8285 | Status status = | |
8286 | db_with_cfh.db->CompactFiles(options, files_to_compact, next_level); | |
8287 | if (!status.ok()) { | |
8288 | // This can fail for valid reasons including the operation was aborted | |
8289 | // or a filename is invalid because background compaction removed it. | |
8290 | // Having read the current cases for which an error is raised I prefer | |
8291 | // not to figure out whether an exception should be thrown here. | |
8292 | fprintf(stderr, "compact%d CompactFiles failed: %s\n", from_level, | |
8293 | status.ToString().c_str()); | |
8294 | return false; | |
8295 | } | |
8296 | return true; | |
8297 | } | |
8298 | ||
8299 | void CompactLevel(int from_level) { | |
8300 | if (db_.db != nullptr) { | |
8301 | while (!CompactLevelHelper(db_, from_level)) WaitForCompaction(); | |
8302 | } | |
8303 | for (auto& db_with_cfh : multi_dbs_) { | |
8304 | while (!CompactLevelHelper(db_with_cfh, from_level)) WaitForCompaction(); | |
8305 | } | |
8306 | } | |
8307 | #endif | |
8308 | ||
8309 | void Flush() { | |
8310 | FlushOptions flush_opt; | |
8311 | flush_opt.wait = true; | |
8312 | ||
8313 | if (db_.db != nullptr) { | |
8314 | Status s; | |
8315 | if (FLAGS_num_column_families > 1) { | |
8316 | s = db_.db->Flush(flush_opt, db_.cfh); | |
8317 | } else { | |
8318 | s = db_.db->Flush(flush_opt, db_.db->DefaultColumnFamily()); | |
8319 | } | |
8320 | ||
8321 | if (!s.ok()) { | |
8322 | fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str()); | |
8323 | exit(1); | |
8324 | } | |
8325 | } else { | |
8326 | for (const auto& db_with_cfh : multi_dbs_) { | |
8327 | Status s; | |
8328 | if (FLAGS_num_column_families > 1) { | |
8329 | s = db_with_cfh.db->Flush(flush_opt, db_with_cfh.cfh); | |
8330 | } else { | |
8331 | s = db_with_cfh.db->Flush(flush_opt, | |
8332 | db_with_cfh.db->DefaultColumnFamily()); | |
8333 | } | |
8334 | ||
8335 | if (!s.ok()) { | |
8336 | fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str()); | |
8337 | exit(1); | |
8338 | } | |
8339 | } | |
8340 | } | |
8341 | fprintf(stdout, "flush memtable\n"); | |
8342 | } | |
8343 | ||
7c673cae FG |
8344 | void ResetStats() { |
8345 | if (db_.db != nullptr) { | |
8346 | db_.db->ResetStats(); | |
8347 | } | |
8348 | for (const auto& db_with_cfh : multi_dbs_) { | |
8349 | db_with_cfh.db->ResetStats(); | |
8350 | } | |
8351 | } | |
8352 | ||
f67539c2 TL |
8353 | void PrintStatsHistory() { |
8354 | if (db_.db != nullptr) { | |
8355 | PrintStatsHistoryImpl(db_.db, false); | |
8356 | } | |
8357 | for (const auto& db_with_cfh : multi_dbs_) { | |
8358 | PrintStatsHistoryImpl(db_with_cfh.db, true); | |
8359 | } | |
8360 | } | |
8361 | ||
8362 | void PrintStatsHistoryImpl(DB* db, bool print_header) { | |
8363 | if (print_header) { | |
8364 | fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str()); | |
8365 | } | |
8366 | ||
8367 | std::unique_ptr<StatsHistoryIterator> shi; | |
1e59de90 TL |
8368 | Status s = |
8369 | db->GetStatsHistory(0, std::numeric_limits<uint64_t>::max(), &shi); | |
f67539c2 TL |
8370 | if (!s.ok()) { |
8371 | fprintf(stdout, "%s\n", s.ToString().c_str()); | |
8372 | return; | |
8373 | } | |
8374 | assert(shi); | |
8375 | while (shi->Valid()) { | |
8376 | uint64_t stats_time = shi->GetStatsTime(); | |
8377 | fprintf(stdout, "------ %s ------\n", | |
8378 | TimeToHumanString(static_cast<int>(stats_time)).c_str()); | |
8379 | for (auto& entry : shi->GetStatsMap()) { | |
8380 | fprintf(stdout, " %" PRIu64 " %s %" PRIu64 "\n", stats_time, | |
8381 | entry.first.c_str(), entry.second); | |
8382 | } | |
8383 | shi->Next(); | |
8384 | } | |
8385 | } | |
8386 | ||
7c673cae FG |
8387 | void PrintStats(const char* key) { |
8388 | if (db_.db != nullptr) { | |
8389 | PrintStats(db_.db, key, false); | |
8390 | } | |
8391 | for (const auto& db_with_cfh : multi_dbs_) { | |
8392 | PrintStats(db_with_cfh.db, key, true); | |
8393 | } | |
8394 | } | |
8395 | ||
8396 | void PrintStats(DB* db, const char* key, bool print_header = false) { | |
8397 | if (print_header) { | |
8398 | fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str()); | |
8399 | } | |
8400 | std::string stats; | |
8401 | if (!db->GetProperty(key, &stats)) { | |
8402 | stats = "(failed)"; | |
8403 | } | |
8404 | fprintf(stdout, "\n%s\n", stats.c_str()); | |
8405 | } | |
11fdf7f2 | 8406 | |
1e59de90 TL |
8407 | void PrintStats(const std::vector<std::string>& keys) { |
8408 | if (db_.db != nullptr) { | |
8409 | PrintStats(db_.db, keys); | |
8410 | } | |
8411 | for (const auto& db_with_cfh : multi_dbs_) { | |
8412 | PrintStats(db_with_cfh.db, keys, true); | |
8413 | } | |
8414 | } | |
8415 | ||
8416 | void PrintStats(DB* db, const std::vector<std::string>& keys, | |
8417 | bool print_header = false) { | |
8418 | if (print_header) { | |
8419 | fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str()); | |
8420 | } | |
8421 | ||
8422 | for (const auto& key : keys) { | |
8423 | std::string stats; | |
8424 | if (!db->GetProperty(key, &stats)) { | |
8425 | stats = "(failed)"; | |
8426 | } | |
8427 | fprintf(stdout, "%s: %s\n", key.c_str(), stats.c_str()); | |
8428 | } | |
8429 | } | |
8430 | ||
8431 | #ifndef ROCKSDB_LITE | |
8432 | ||
11fdf7f2 TL |
8433 | void Replay(ThreadState* thread) { |
8434 | if (db_.db != nullptr) { | |
8435 | Replay(thread, &db_); | |
8436 | } | |
8437 | } | |
8438 | ||
8439 | void Replay(ThreadState* /*thread*/, DBWithColumnFamilies* db_with_cfh) { | |
8440 | Status s; | |
494da23a | 8441 | std::unique_ptr<TraceReader> trace_reader; |
11fdf7f2 TL |
8442 | s = NewFileTraceReader(FLAGS_env, EnvOptions(), FLAGS_trace_file, |
8443 | &trace_reader); | |
8444 | if (!s.ok()) { | |
8445 | fprintf( | |
8446 | stderr, | |
8447 | "Encountered an error creating a TraceReader from the trace file. " | |
8448 | "Error: %s\n", | |
8449 | s.ToString().c_str()); | |
8450 | exit(1); | |
8451 | } | |
1e59de90 TL |
8452 | std::unique_ptr<Replayer> replayer; |
8453 | s = db_with_cfh->db->NewDefaultReplayer(db_with_cfh->cfh, | |
8454 | std::move(trace_reader), &replayer); | |
8455 | if (!s.ok()) { | |
8456 | fprintf(stderr, | |
8457 | "Encountered an error creating a default Replayer. " | |
8458 | "Error: %s\n", | |
8459 | s.ToString().c_str()); | |
8460 | exit(1); | |
8461 | } | |
8462 | s = replayer->Prepare(); | |
8463 | if (!s.ok()) { | |
8464 | fprintf(stderr, "Prepare for replay failed. Error: %s\n", | |
8465 | s.ToString().c_str()); | |
8466 | } | |
8467 | s = replayer->Replay( | |
8468 | ReplayOptions(static_cast<uint32_t>(FLAGS_trace_replay_threads), | |
8469 | FLAGS_trace_replay_fast_forward), | |
8470 | nullptr); | |
8471 | replayer.reset(); | |
11fdf7f2 | 8472 | if (s.ok()) { |
1e59de90 | 8473 | fprintf(stdout, "Replay completed from trace_file: %s\n", |
11fdf7f2 TL |
8474 | FLAGS_trace_file.c_str()); |
8475 | } else { | |
1e59de90 | 8476 | fprintf(stderr, "Replay failed. Error: %s\n", s.ToString().c_str()); |
11fdf7f2 TL |
8477 | } |
8478 | } | |
1e59de90 TL |
8479 | |
8480 | void Backup(ThreadState* thread) { | |
8481 | DB* db = SelectDB(thread); | |
8482 | std::unique_ptr<BackupEngineOptions> engine_options( | |
8483 | new BackupEngineOptions(FLAGS_backup_dir)); | |
8484 | Status s; | |
8485 | BackupEngine* backup_engine; | |
8486 | if (FLAGS_backup_rate_limit > 0) { | |
8487 | engine_options->backup_rate_limiter.reset(NewGenericRateLimiter( | |
8488 | FLAGS_backup_rate_limit, 100000 /* refill_period_us */, | |
8489 | 10 /* fairness */, RateLimiter::Mode::kAllIo)); | |
8490 | } | |
8491 | // Build new backup of the entire DB | |
8492 | engine_options->destroy_old_data = true; | |
8493 | s = BackupEngine::Open(FLAGS_env, *engine_options, &backup_engine); | |
8494 | assert(s.ok()); | |
8495 | s = backup_engine->CreateNewBackup(db); | |
8496 | assert(s.ok()); | |
8497 | std::vector<BackupInfo> backup_info; | |
8498 | backup_engine->GetBackupInfo(&backup_info); | |
8499 | // Verify that a new backup is created | |
8500 | assert(backup_info.size() == 1); | |
8501 | } | |
8502 | ||
8503 | void Restore(ThreadState* /* thread */) { | |
8504 | std::unique_ptr<BackupEngineOptions> engine_options( | |
8505 | new BackupEngineOptions(FLAGS_backup_dir)); | |
8506 | if (FLAGS_restore_rate_limit > 0) { | |
8507 | engine_options->restore_rate_limiter.reset(NewGenericRateLimiter( | |
8508 | FLAGS_restore_rate_limit, 100000 /* refill_period_us */, | |
8509 | 10 /* fairness */, RateLimiter::Mode::kAllIo)); | |
8510 | } | |
8511 | BackupEngineReadOnly* backup_engine; | |
8512 | Status s = | |
8513 | BackupEngineReadOnly::Open(FLAGS_env, *engine_options, &backup_engine); | |
8514 | assert(s.ok()); | |
8515 | s = backup_engine->RestoreDBFromLatestBackup(FLAGS_restore_dir, | |
8516 | FLAGS_restore_dir); | |
8517 | assert(s.ok()); | |
8518 | delete backup_engine; | |
8519 | } | |
8520 | ||
8521 | #endif // ROCKSDB_LITE | |
7c673cae FG |
8522 | }; |
8523 | ||
8524 | int db_bench_tool(int argc, char** argv) { | |
f67539c2 | 8525 | ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); |
1e59de90 | 8526 | ConfigOptions config_options; |
7c673cae FG |
8527 | static bool initialized = false; |
8528 | if (!initialized) { | |
8529 | SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + | |
8530 | " [OPTIONS]..."); | |
1e59de90 | 8531 | SetVersionString(GetRocksVersionAsString(true)); |
7c673cae FG |
8532 | initialized = true; |
8533 | } | |
8534 | ParseCommandLineFlags(&argc, &argv, true); | |
f67539c2 TL |
8535 | FLAGS_compaction_style_e = |
8536 | (ROCKSDB_NAMESPACE::CompactionStyle)FLAGS_compaction_style; | |
7c673cae FG |
8537 | #ifndef ROCKSDB_LITE |
8538 | if (FLAGS_statistics && !FLAGS_statistics_string.empty()) { | |
8539 | fprintf(stderr, | |
8540 | "Cannot provide both --statistics and --statistics_string.\n"); | |
8541 | exit(1); | |
8542 | } | |
8543 | if (!FLAGS_statistics_string.empty()) { | |
1e59de90 TL |
8544 | Status s = Statistics::CreateFromString(config_options, |
8545 | FLAGS_statistics_string, &dbstats); | |
7c673cae | 8546 | if (dbstats == nullptr) { |
f67539c2 TL |
8547 | fprintf(stderr, |
8548 | "No Statistics registered matching string: %s status=%s\n", | |
8549 | FLAGS_statistics_string.c_str(), s.ToString().c_str()); | |
7c673cae FG |
8550 | exit(1); |
8551 | } | |
8552 | } | |
8553 | #endif // ROCKSDB_LITE | |
8554 | if (FLAGS_statistics) { | |
f67539c2 | 8555 | dbstats = ROCKSDB_NAMESPACE::CreateDBStatistics(); |
7c673cae | 8556 | } |
494da23a TL |
8557 | if (dbstats) { |
8558 | dbstats->set_stats_level(static_cast<StatsLevel>(FLAGS_stats_level)); | |
8559 | } | |
f67539c2 TL |
8560 | FLAGS_compaction_pri_e = |
8561 | (ROCKSDB_NAMESPACE::CompactionPri)FLAGS_compaction_pri; | |
7c673cae | 8562 | |
f67539c2 | 8563 | std::vector<std::string> fanout = ROCKSDB_NAMESPACE::StringSplit( |
7c673cae FG |
8564 | FLAGS_max_bytes_for_level_multiplier_additional, ','); |
8565 | for (size_t j = 0; j < fanout.size(); j++) { | |
8566 | FLAGS_max_bytes_for_level_multiplier_additional_v.push_back( | |
8567 | #ifndef CYGWIN | |
8568 | std::stoi(fanout[j])); | |
8569 | #else | |
8570 | stoi(fanout[j])); | |
8571 | #endif | |
8572 | } | |
8573 | ||
8574 | FLAGS_compression_type_e = | |
1e59de90 TL |
8575 | StringToCompressionType(FLAGS_compression_type.c_str()); |
8576 | ||
8577 | FLAGS_wal_compression_e = | |
8578 | StringToCompressionType(FLAGS_wal_compression.c_str()); | |
8579 | ||
8580 | FLAGS_compressed_secondary_cache_compression_type_e = StringToCompressionType( | |
8581 | FLAGS_compressed_secondary_cache_compression_type.c_str()); | |
7c673cae FG |
8582 | |
8583 | #ifndef ROCKSDB_LITE | |
1e59de90 | 8584 | // Stacked BlobDB |
f67539c2 | 8585 | FLAGS_blob_db_compression_type_e = |
1e59de90 | 8586 | StringToCompressionType(FLAGS_blob_db_compression_type.c_str()); |
f67539c2 | 8587 | |
1e59de90 | 8588 | int env_opts = !FLAGS_env_uri.empty() + !FLAGS_fs_uri.empty(); |
20effc67 | 8589 | if (env_opts > 1) { |
1e59de90 | 8590 | fprintf(stderr, "Error: --env_uri and --fs_uri are mutually exclusive\n"); |
7c673cae | 8591 | exit(1); |
20effc67 TL |
8592 | } |
8593 | ||
1e59de90 TL |
8594 | if (env_opts == 1) { |
8595 | Status s = Env::CreateFromUri(config_options, FLAGS_env_uri, FLAGS_fs_uri, | |
8596 | &FLAGS_env, &env_guard); | |
8597 | if (!s.ok()) { | |
8598 | fprintf(stderr, "Failed creating env: %s\n", s.ToString().c_str()); | |
20effc67 TL |
8599 | exit(1); |
8600 | } | |
1e59de90 TL |
8601 | } else if (FLAGS_simulate_hdd || FLAGS_simulate_hybrid_fs_file != "") { |
8602 | //**TODO: Make the simulate fs something that can be loaded | |
8603 | // from the ObjectRegistry... | |
8604 | static std::shared_ptr<ROCKSDB_NAMESPACE::Env> composite_env = | |
8605 | NewCompositeEnv(std::make_shared<SimulatedHybridFileSystem>( | |
8606 | FileSystem::Default(), FLAGS_simulate_hybrid_fs_file, | |
8607 | /*throughput_multiplier=*/ | |
8608 | int{FLAGS_simulate_hybrid_hdd_multipliers}, | |
8609 | /*is_full_fs_warm=*/FLAGS_simulate_hdd)); | |
8610 | FLAGS_env = composite_env.get(); | |
7c673cae | 8611 | } |
1e59de90 TL |
8612 | |
8613 | // Let -readonly imply -use_existing_db | |
8614 | FLAGS_use_existing_db |= FLAGS_readonly; | |
7c673cae | 8615 | #endif // ROCKSDB_LITE |
1e59de90 TL |
8616 | |
8617 | if (FLAGS_build_info) { | |
8618 | std::string build_info; | |
8619 | std::cout << GetRocksBuildInfoAsString(build_info, true) << std::endl; | |
8620 | // Similar to --version, nothing else will be done when this flag is set | |
8621 | exit(0); | |
8622 | } | |
8623 | ||
8624 | if (!FLAGS_seed) { | |
8625 | uint64_t now = FLAGS_env->GetSystemClock()->NowMicros(); | |
8626 | seed_base = static_cast<int64_t>(now); | |
8627 | fprintf(stdout, "Set seed to %" PRIu64 " because --seed was 0\n", | |
8628 | seed_base); | |
8629 | } else { | |
8630 | seed_base = FLAGS_seed; | |
8631 | } | |
8632 | ||
494da23a TL |
8633 | if (FLAGS_use_existing_keys && !FLAGS_use_existing_db) { |
8634 | fprintf(stderr, | |
8635 | "`-use_existing_db` must be true for `-use_existing_keys` to be " | |
8636 | "settable\n"); | |
8637 | exit(1); | |
8638 | } | |
8639 | ||
7c673cae | 8640 | if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NONE")) |
f67539c2 | 8641 | FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NONE; |
7c673cae | 8642 | else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NORMAL")) |
f67539c2 | 8643 | FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NORMAL; |
7c673cae | 8644 | else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "SEQUENTIAL")) |
f67539c2 | 8645 | FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::SEQUENTIAL; |
7c673cae | 8646 | else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "WILLNEED")) |
f67539c2 | 8647 | FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::WILLNEED; |
7c673cae FG |
8648 | else { |
8649 | fprintf(stdout, "Unknown compaction fadvice:%s\n", | |
8650 | FLAGS_compaction_fadvice.c_str()); | |
1e59de90 | 8651 | exit(1); |
7c673cae FG |
8652 | } |
8653 | ||
f67539c2 | 8654 | FLAGS_value_size_distribution_type_e = |
1e59de90 | 8655 | StringToDistributionType(FLAGS_value_size_distribution_type.c_str()); |
7c673cae | 8656 | |
11fdf7f2 TL |
8657 | // Note options sanitization may increase thread pool sizes according to |
8658 | // max_background_flushes/max_background_compactions/max_background_jobs | |
8659 | FLAGS_env->SetBackgroundThreads(FLAGS_num_high_pri_threads, | |
f67539c2 | 8660 | ROCKSDB_NAMESPACE::Env::Priority::HIGH); |
11fdf7f2 | 8661 | FLAGS_env->SetBackgroundThreads(FLAGS_num_bottom_pri_threads, |
f67539c2 | 8662 | ROCKSDB_NAMESPACE::Env::Priority::BOTTOM); |
11fdf7f2 | 8663 | FLAGS_env->SetBackgroundThreads(FLAGS_num_low_pri_threads, |
f67539c2 | 8664 | ROCKSDB_NAMESPACE::Env::Priority::LOW); |
7c673cae FG |
8665 | |
8666 | // Choose a location for the test database if none given with --db=<path> | |
8667 | if (FLAGS_db.empty()) { | |
8668 | std::string default_db_path; | |
f67539c2 | 8669 | FLAGS_env->GetTestDirectory(&default_db_path); |
7c673cae FG |
8670 | default_db_path += "/dbbench"; |
8671 | FLAGS_db = default_db_path; | |
8672 | } | |
8673 | ||
1e59de90 TL |
8674 | if (FLAGS_backup_dir.empty()) { |
8675 | FLAGS_backup_dir = FLAGS_db + "/backup"; | |
8676 | } | |
8677 | ||
8678 | if (FLAGS_restore_dir.empty()) { | |
8679 | FLAGS_restore_dir = FLAGS_db + "/restore"; | |
8680 | } | |
8681 | ||
7c673cae FG |
8682 | if (FLAGS_stats_interval_seconds > 0) { |
8683 | // When both are set then FLAGS_stats_interval determines the frequency | |
8684 | // at which the timer is checked for FLAGS_stats_interval_seconds | |
8685 | FLAGS_stats_interval = 1000; | |
8686 | } | |
8687 | ||
f67539c2 TL |
8688 | if (FLAGS_seek_missing_prefix && FLAGS_prefix_size <= 8) { |
8689 | fprintf(stderr, "prefix_size > 8 required by --seek_missing_prefix\n"); | |
8690 | exit(1); | |
8691 | } | |
8692 | ||
8693 | ROCKSDB_NAMESPACE::Benchmark benchmark; | |
7c673cae | 8694 | benchmark.Run(); |
494da23a TL |
8695 | |
8696 | #ifndef ROCKSDB_LITE | |
8697 | if (FLAGS_print_malloc_stats) { | |
8698 | std::string stats_string; | |
f67539c2 | 8699 | ROCKSDB_NAMESPACE::DumpMallocStats(&stats_string); |
494da23a TL |
8700 | fprintf(stdout, "Malloc stats:\n%s\n", stats_string.c_str()); |
8701 | } | |
8702 | #endif // ROCKSDB_LITE | |
8703 | ||
7c673cae FG |
8704 | return 0; |
8705 | } | |
f67539c2 | 8706 | } // namespace ROCKSDB_NAMESPACE |
7c673cae | 8707 | #endif |