]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | #! /usr/bin/env python |
2 | import os | |
7c673cae FG |
3 | import sys |
4 | import time | |
5 | import random | |
7c673cae FG |
6 | import tempfile |
7 | import subprocess | |
8 | import shutil | |
9 | import argparse | |
10 | ||
11 | # params overwrite priority: | |
12 | # for default: | |
11fdf7f2 | 13 | # default_params < {blackbox,whitebox}_default_params < args |
7c673cae | 14 | # for simple: |
11fdf7f2 TL |
15 | # default_params < {blackbox,whitebox}_default_params < |
16 | # simple_default_params < | |
17 | # {blackbox,whitebox}_simple_default_params < args | |
494da23a TL |
18 | # for enable_atomic_flush: |
19 | # default_params < {blackbox,whitebox}_default_params < | |
20 | # atomic_flush_params < args | |
11fdf7f2 TL |
21 | |
# Scratch file whose path is passed to db_stress as --expected_values_path.
# NamedTemporaryFile removes the file once the handle is closed, so the handle
# is held at module scope.
expected_values_file = tempfile.NamedTemporaryFile()

# Baseline db_stress options, applied before any test-type specific overrides.
# A callable value is invoked by finalize_and_sanitize() every time a command
# line is built, so such options are re-drawn for each db_stress run.
default_params = {
    "acquire_snapshot_one_in": 10000,
    "block_size": 16 * 1024,
    "cache_size": 1024 * 1024,
    "checkpoint_one_in": 1000000,
    # compression
    "compression_type": "snappy",
    "compression_max_dict_bytes": lambda: 16 * 1024 * random.randint(0, 1),
    "compression_zstd_max_train_bytes":
        lambda: 64 * 1024 * random.randint(0, 1),
    "clear_column_family_one_in": 0,
    "compact_files_one_in": 1000000,
    "compact_range_one_in": 1000000,
    "delpercent": 4,
    "delrangepercent": 1,
    "destroy_db_initially": 0,
    "enable_pipelined_write": lambda: random.randint(0, 1),
    "expected_values_path": expected_values_file.name,
    "flush_one_in": 1000000,
    "max_background_compactions": 20,
    "max_bytes_for_level_base": 10 * 1024 * 1024,
    "max_key": 100000000,
    "max_write_buffer_number": 3,
    "mmap_read": lambda: random.randint(0, 1),
    "nooverwritepercent": 1,
    "open_files": 500000,
    "prefixpercent": 5,
    "progress_reports": 0,
    "readpercent": 45,
    "recycle_log_file_num": lambda: random.randint(0, 1),
    "reopen": 20,
    "snapshot_hold_ops": 100000,
    "subcompactions": lambda: random.randint(1, 4),
    "target_file_size_base": 2 * 1024 * 1024,
    "target_file_size_multiplier": 2,
    # direct I/O may be switched off again by finalize_and_sanitize()
    "use_direct_reads": lambda: random.randint(0, 1),
    "use_direct_io_for_flush_and_compaction": lambda: random.randint(0, 1),
    "use_full_merge_v1": lambda: random.randint(0, 1),
    "use_merge": lambda: random.randint(0, 1),
    "verify_checksum": 1,
    "write_buffer_size": 4 * 1024 * 1024,
    "writepercent": 35,
    "format_version": lambda: random.randint(2, 4),
    "index_block_restart_interval": lambda: random.choice(range(1, 16)),
}
67 | ||
11fdf7f2 TL |
# Environment variable naming the parent directory for the test databases.
_TEST_DIR_ENV_VAR = 'TEST_TMPDIR'


def get_dbname(test_name):
    """Create an empty database directory for `test_name` and return its path.

    When the TEST_TMPDIR environment variable is unset or empty, a uniquely
    named directory is created with tempfile.mkdtemp(). Otherwise the fixed
    path <TEST_TMPDIR>/rocksdb_crashtest_<test_name> is used; anything already
    there is removed first so the run starts from an empty directory.
    """
    parent_dir = os.environ.get(_TEST_DIR_ENV_VAR)
    if not parent_dir:
        return tempfile.mkdtemp(prefix='rocksdb_crashtest_' + test_name)

    db_dir = parent_dir + "/rocksdb_crashtest_" + test_name
    # ignore_errors=True: the directory may simply not exist yet.
    shutil.rmtree(db_dir, True)
    os.mkdir(db_dir)
    return db_dir
80 | ||
11fdf7f2 TL |
81 | |
def is_direct_io_supported(dbname):
    """Return True if a file inside directory `dbname` opens with O_DIRECT.

    A temporary file is created in `dbname` and reopened with os.O_DIRECT.
    The probe descriptor is closed again before returning, and the temporary
    file is removed when the `with` block exits.

    Errors from creating the temporary file itself (for example `dbname` does
    not exist) are not caught and propagate to the caller.
    """
    with tempfile.NamedTemporaryFile(dir=dbname) as probe:
        try:
            fd = os.open(probe.name, os.O_DIRECT)
        except (AttributeError, OSError):
            # AttributeError: this platform's os module has no O_DIRECT.
            # OSError: the open itself was rejected.
            return False
        # Close the probe descriptor; otherwise one fd leaks per call.
        os.close(fd)
        return True
89 | ||
90 | ||
7c673cae FG |
# Overrides layered on top of default_params for the blackbox test.
blackbox_default_params = {
    "duration": 6000,  # total time for this script to test db_stress
    "interval": 120,  # time for one db_stress instance to run
    # since we will be killing anyway, use large value for ops_per_thread
    "ops_per_thread": 100000000,
    "set_options_one_in": 10000,
    "test_batches_snapshots": 1,
}

# Overrides layered on top of default_params for the whitebox test.
whitebox_default_params = {
    "duration": 10000,
    "log2_keys_per_lock": 10,
    "ops_per_thread": 200000,
    "random_kill_odd": 888887,
    "test_batches_snapshots": lambda: random.randint(0, 1),
}

# Overrides applied when --simple is given, for either test type.
simple_default_params = {
    "allow_concurrent_memtable_write": lambda: random.randint(0, 1),
    "column_families": 1,
    "max_background_compactions": 1,
    "max_bytes_for_level_base": 64 * 1024 * 1024,
    "memtablerep": "skip_list",
    "prefixpercent": 25,
    "readpercent": 25,
    "target_file_size_base": 16 * 1024 * 1024,
    "target_file_size_multiplier": 1,
    "test_batches_snapshots": 0,
    "write_buffer_size": 32 * 1024 * 1024,
}

# Further overrides for --simple, specific to each test type.
blackbox_simple_default_params = {
    "open_files": -1,
    "set_options_one_in": 0,
}

whitebox_simple_default_params = {}

# Overrides applied when --enable_atomic_flush is given.
atomic_flush_params = {
    "disable_wal": 1,
    "reopen": 0,
    "test_atomic_flush": 1,
    # use small value for write_buffer_size so that RocksDB triggers flush
    # more frequently
    "write_buffer_size": 1024 * 1024,
}
139 | ||
7c673cae FG |
140 | |
def finalize_and_sanitize(src_params):
    """Resolve callable option values and adjust dependent options.

    Returns a new dict; `src_params` itself is not modified. Every callable
    value is replaced by the result of calling it. The following adjustments
    are then applied:

    * compression_zstd_max_train_bytes is set to 0 unless compression_type is
      "zstd" and compression_max_dict_bytes is non-zero.
    * memtablerep is set to "skip_list" when allow_concurrent_memtable_write
      is 1 (also when that key is absent).
    * both direct I/O options are set to 0 when mmap_read is 1 or the "db"
      directory fails the is_direct_io_supported() probe.
    * when test_batches_snapshots is 1, delrangepercent is added to delpercent
      and then set to 0.

    Requires the "mmap_read" key, and the "db" key whenever mmap_read != 1.
    """
    resolved = {}
    for name, value in src_params.items():
        resolved[name] = value() if callable(value) else value

    dict_training_disabled = resolved.get("compression_max_dict_bytes") == 0
    if resolved.get("compression_type") != "zstd" or dict_training_disabled:
        resolved["compression_zstd_max_train_bytes"] = 0

    if resolved.get("allow_concurrent_memtable_write", 1) == 1:
        resolved["memtablerep"] = "skip_list"

    # The filesystem probe only runs when mmap_read does not already rule
    # direct I/O out.
    direct_io_usable = (resolved["mmap_read"] != 1
                        and is_direct_io_supported(resolved["db"]))
    if not direct_io_usable:
        resolved["use_direct_io_for_flush_and_compaction"] = 0
        resolved["use_direct_reads"] = 0

    if resolved.get("test_batches_snapshots") == 1:
        resolved["delpercent"] += resolved["delrangepercent"]
        resolved["delrangepercent"] = 0

    return resolved
157 | ||
158 | ||
def gen_cmd_params(args):
    """Merge the parameter tables and command-line overrides into one dict.

    Layers are applied from lowest to highest priority:
    default_params, the table for args.test_type, the --simple tables (only
    with args.simple), atomic_flush_params (only with
    args.enable_atomic_flush), and finally every attribute of `args` whose
    value is not None. Callable values are left unresolved.
    """
    is_blackbox = args.test_type == 'blackbox'
    is_whitebox = args.test_type == 'whitebox'

    layers = [default_params]
    if is_blackbox:
        layers.append(blackbox_default_params)
    if is_whitebox:
        layers.append(whitebox_default_params)
    if args.simple:
        layers.append(simple_default_params)
        if is_blackbox:
            layers.append(blackbox_simple_default_params)
        if is_whitebox:
            layers.append(whitebox_simple_default_params)
    if args.enable_atomic_flush:
        layers.append(atomic_flush_params)

    # Options given explicitly on the command line win over every table.
    layers.append({name: value
                   for name, value in vars(args).items()
                   if value is not None})

    merged = {}
    for layer in layers:
        merged.update(layer)
    return merged
180 | ||
181 | ||
11fdf7f2 TL |
def gen_cmd(params, unknown_params):
    """Build the db_stress argv list from `params`.

    `params` is passed through finalize_and_sanitize(), then every remaining
    option is rendered as --name=value. Options that only steer this script
    and options whose value is None are left out. `unknown_params` (a list)
    is appended unchanged at the end.
    """
    # Keys consumed by this script rather than forwarded to db_stress.
    script_only_keys = set(['test_type', 'simple', 'duration', 'interval',
                            'random_kill_odd', 'enable_atomic_flush'])

    flags = []
    for name, value in finalize_and_sanitize(params).items():
        if name in script_only_keys or value is None:
            continue
        flags.append('--{0}={1}'.format(name, value))

    return ['./db_stress'] + flags + unknown_params
190 | ||
191 | ||
# This script runs and kills db_stress multiple times. It checks consistency
# in case of unsafe crashes in RocksDB.
def blackbox_crash_main(args, unknown_args):
    """Run db_stress in a loop, killing each instance after a fixed interval.

    Each iteration starts db_stress on the same database directory, lets it
    run for cmd_params['interval'] seconds (or until it exits by itself),
    kills it if it is still alive, and then scans its stderr. Any non-blank
    stderr line that does not start with 'WARNING' is reported and the script
    exits with status 2. The loop ends after cmd_params['duration'] seconds,
    after which the database directory is removed.

    args: parsed argparse namespace from main().
    unknown_args: list of extra arguments forwarded verbatim to db_stress.
    """
    cmd_params = gen_cmd_params(args)
    dbname = get_dbname('blackbox')
    exit_time = time.time() + cmd_params['duration']

    print("Running blackbox-crash-test with \n"
          + "interval_between_crash=" + str(cmd_params['interval']) + "\n"
          + "total-duration=" + str(cmd_params['duration']) + "\n")

    while time.time() < exit_time:
        run_had_errors = False
        killtime = time.time() + cmd_params['interval']

        # Copy-and-update instead of concatenating items(): dict views cannot
        # be added together on Python 3.
        run_params = dict(cmd_params)
        run_params['db'] = dbname
        cmd = gen_cmd(run_params, unknown_args)

        child = subprocess.Popen(cmd, stderr=subprocess.PIPE)
        print("Running db_stress with pid=%d: %s\n\n"
              % (child.pid, ' '.join(cmd)))

        stop_early = False
        while time.time() < killtime:
            if child.poll() is not None:
                print("WARNING: db_stress ended before kill: exitcode=%d\n"
                      % child.returncode)
                stop_early = True
                break
            time.sleep(1)

        if not stop_early:
            if child.poll() is not None:
                print("WARNING: db_stress ended before kill: exitcode=%d\n"
                      % child.returncode)
            else:
                child.kill()
                print("KILLED %d\n" % child.pid)
                time.sleep(1)  # time to stabilize after a kill

        # Read stderr up to real EOF (readline() returning an empty bytes
        # object). A blank line in the middle of the output is skipped
        # instead of ending the scan, so errors printed after it are seen.
        for raw_line in iter(child.stderr.readline, b''):
            line = raw_line.strip()
            if not isinstance(line, str):
                # Python 3 pipes yield bytes; convert before comparing
                # against str literals.
                line = line.decode('utf-8', 'replace')
            if not line:
                continue
            if not line.startswith('WARNING'):
                run_had_errors = True
                print('stderr has error message:')
                print('***' + line + '***')

        if run_had_errors:
            sys.exit(2)

        time.sleep(1)  # time to stabilize before the next run

    # we need to clean up after ourselves -- only do this on test success
    shutil.rmtree(dbname, True)
249 | ||
250 | ||
# This python script runs db_stress multiple times. Some runs with
# kill_random_test that causes rocksdb to crash at various points in code.
def whitebox_crash_main(args, unknown_args):
    """Run db_stress repeatedly, alternating kill runs and normal runs.

    check_mode 0 runs db_stress with --kill_random_test (rotating through
    three kill modes) and expects a negative exit code. check_modes 1-3 are
    normal runs (universal compaction, FIFO compaction, default) and expect
    exit code 0. check_mode only starts advancing once half of
    cmd_params['duration'] has elapsed; from then on the database directory
    is also recreated after every run.

    Exits with status 1 when the exit code does not match the expectation and
    with status 2 when the captured output contains 'error' (other than
    'got errors 0 times') or 'fail'.

    args: parsed argparse namespace from main().
    unknown_args: list of extra arguments forwarded verbatim to db_stress.
    """
    cmd_params = gen_cmd_params(args)
    dbname = get_dbname('whitebox')

    cur_time = time.time()
    exit_time = cur_time + cmd_params['duration']
    half_time = cur_time + cmd_params['duration'] / 2

    print("Running whitebox-crash-test with \n"
          + "total-duration=" + str(cmd_params['duration']) + "\n")

    total_check_mode = 4
    check_mode = 0
    kill_random_test = cmd_params['random_kill_odd']
    kill_mode = 0

    while time.time() < exit_time:
        if check_mode == 0:
            additional_opts = {
                # use large ops per thread since we will kill it anyway
                "ops_per_thread": 100 * cmd_params['ops_per_thread'],
            }
            # run with kill_random_test, with three modes.
            # Mode 0 covers all kill points. Mode 1 covers less kill points but
            # increases chance of triggering them. Mode 2 covers even less
            # frequent kill points and further increases triggering chance.
            # Floor division keeps the values integral on Python 3 as well.
            if kill_mode == 0:
                additional_opts.update({
                    "kill_random_test": kill_random_test,
                })
            elif kill_mode == 1:
                additional_opts.update({
                    "kill_random_test": (kill_random_test // 10 + 1),
                    "kill_prefix_blacklist": "WritableFileWriter::Append,"
                    + "WritableFileWriter::WriteBuffered",
                })
            elif kill_mode == 2:
                # TODO: May need to adjust random odds if kill_random_test
                # is too small.
                additional_opts.update({
                    "kill_random_test": (kill_random_test // 5000 + 1),
                    "kill_prefix_blacklist": "WritableFileWriter::Append,"
                    "WritableFileWriter::WriteBuffered,"
                    "PosixMmapFile::Allocate,WritableFileWriter::Flush",
                })
            # Run kill mode 0, 1 and 2 by turn.
            kill_mode = (kill_mode + 1) % 3
        elif check_mode == 1:
            # normal run with universal compaction mode
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'],
                "compaction_style": 1,
            }
        elif check_mode == 2:
            # normal run with FIFO compaction mode
            # ops_per_thread is divided by 5 because FIFO compaction
            # style is quite a bit slower on reads with lot of files
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'] // 5,
                "compaction_style": 2,
            }
        else:
            # normal run
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'],
            }

        # Copy-and-update instead of concatenating items(): dict views cannot
        # be added together on Python 3. Later layers win, as before.
        run_params = dict(cmd_params)
        run_params.update(additional_opts)
        run_params['db'] = dbname
        cmd = gen_cmd(run_params, unknown_args)

        print("Running:" + ' '.join(cmd) + "\n")

        popen = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
        # stderr is merged into stdout, so the second element is always None.
        stdoutdata, _ = popen.communicate()
        if not isinstance(stdoutdata, str):
            # Python 3 pipes yield bytes; convert once so the str searches
            # below work.
            stdoutdata = stdoutdata.decode('utf-8', 'replace')
        retncode = popen.returncode
        msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
               check_mode, additional_opts['kill_random_test'], retncode))
        print(msg)
        print(stdoutdata)

        expected = False
        if additional_opts['kill_random_test'] is None and (retncode == 0):
            # we expect zero retncode if no kill option
            expected = True
        elif additional_opts['kill_random_test'] is not None and retncode < 0:
            # we expect negative retncode if kill option was given
            expected = True

        if not expected:
            print("TEST FAILED. See kill option and exit code above!!!\n")
            sys.exit(1)

        stdoutdata = stdoutdata.lower()
        errorcount = (stdoutdata.count('error') -
                      stdoutdata.count('got errors 0 times'))
        print("#times error occurred in output is " + str(errorcount) + "\n")

        if errorcount > 0:
            print("TEST FAILED. Output has 'error'!!!\n")
            sys.exit(2)
        if stdoutdata.find('fail') >= 0:
            print("TEST FAILED. Output has 'fail'!!!\n")
            sys.exit(2)

        # First half of the duration, keep doing kill test. For the next half,
        # try different modes.
        if time.time() > half_time:
            # we need to clean up after ourselves -- only do this on test
            # success
            shutil.rmtree(dbname, True)
            os.mkdir(dbname)
            # Stop passing --expected_values_path once the DB has been
            # recreated.
            cmd_params.pop('expected_values_path', None)
            check_mode = (check_mode + 1) % total_check_mode

        time.sleep(1)  # time to stabilize after a kill
372 | ||
373 | ||
def main():
    """Parse the command line and dispatch to the selected crash test.

    Every key of the default parameter tables becomes an optional
    --<key> argument whose type is taken from the table's value (callable
    values are invoked once to obtain a sample). Arguments the parser does
    not know are forwarded unchanged to db_stress.

    Exits with status 1 when the TEST_TMPDIR environment variable is set but
    does not name an existing directory.
    """
    parser = argparse.ArgumentParser(
        description="This script runs and kills db_stress multiple times")
    parser.add_argument("test_type", choices=["blackbox", "whitebox"])
    parser.add_argument("--simple", action="store_true")
    parser.add_argument("--enable_atomic_flush", action='store_true')

    # Merge with update() instead of concatenating items(): dict views cannot
    # be added together on Python 3. Later tables win, as before.
    all_params = {}
    for param_table in (default_params,
                        blackbox_default_params,
                        whitebox_default_params,
                        simple_default_params,
                        blackbox_simple_default_params,
                        whitebox_simple_default_params):
        all_params.update(param_table)

    for k, v in all_params.items():
        parser.add_argument("--" + k, type=type(v() if callable(v) else v))
    # unknown_args are passed directly to db_stress
    args, unknown_args = parser.parse_known_args()

    test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
    if test_tmpdir is not None and not os.path.isdir(test_tmpdir):
        print('%s env var is set to a non-existent directory: %s' %
              (_TEST_DIR_ENV_VAR, test_tmpdir))
        sys.exit(1)

    if args.test_type == 'blackbox':
        blackbox_crash_main(args, unknown_args)
    if args.test_type == 'whitebox':
        whitebox_crash_main(args, unknown_args)


if __name__ == '__main__':
    main()