]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/tools/db_crashtest.py
import 14.2.4 nautilus point release
[ceph.git] / ceph / src / rocksdb / tools / db_crashtest.py
1 #! /usr/bin/env python
2 import os
3 import sys
4 import time
5 import random
6 import tempfile
7 import subprocess
8 import shutil
9 import argparse
10
# params overwrite priority:
#   for default:
#       default_params < {blackbox,whitebox}_default_params < args
#   for simple:
#       default_params < {blackbox,whitebox}_default_params <
#       simple_default_params <
#       {blackbox,whitebox}_simple_default_params < args
#   for enable_atomic_flush:
#       default_params < {blackbox,whitebox}_default_params <
#       atomic_flush_params < args

# Scratch file handed to db_stress via --expected_values_path so the expected
# key/value state survives across crashed/killed db_stress runs.  Keeping the
# NamedTemporaryFile object alive at module scope keeps the file alive for the
# whole lifetime of this script.
expected_values_file = tempfile.NamedTemporaryFile()
# Baseline db_stress options shared by every test mode.  A value may be a
# plain constant or a zero-argument callable; callables are re-evaluated into
# a concrete (randomized) value for each db_stress invocation by
# finalize_and_sanitize().
default_params = {
    "acquire_snapshot_one_in": 10000,
    "block_size": 16384,
    "cache_size": 1048576,
    "checkpoint_one_in": 1000000,
    "compression_type": "snappy",
    # Randomly toggle compression dictionary / zstd training on and off.
    "compression_max_dict_bytes": lambda: 16384 * random.randint(0, 1),
    "compression_zstd_max_train_bytes": lambda: 65536 * random.randint(0, 1),
    "clear_column_family_one_in": 0,
    "compact_files_one_in": 1000000,
    "compact_range_one_in": 1000000,
    "delpercent": 4,
    "delrangepercent": 1,
    "destroy_db_initially": 0,
    "enable_pipelined_write": lambda: random.randint(0, 1),
    # Shared with db_stress so expectations persist across crashes (see
    # expected_values_file above).
    "expected_values_path": expected_values_file.name,
    "flush_one_in": 1000000,
    "max_background_compactions": 20,
    "max_bytes_for_level_base": 10485760,
    "max_key": 100000000,
    "max_write_buffer_number": 3,
    "mmap_read": lambda: random.randint(0, 1),
    "nooverwritepercent": 1,
    "open_files": 500000,
    "prefixpercent": 5,
    "progress_reports": 0,
    "readpercent": 45,
    "recycle_log_file_num": lambda: random.randint(0, 1),
    "reopen": 20,
    "snapshot_hold_ops": 100000,
    "subcompactions": lambda: random.randint(1, 4),
    "target_file_size_base": 2097152,
    "target_file_size_multiplier": 2,
    # Direct I/O options may be forced back to 0 by finalize_and_sanitize()
    # when mmap_read is on or the filesystem lacks O_DIRECT support.
    "use_direct_reads": lambda: random.randint(0, 1),
    "use_direct_io_for_flush_and_compaction": lambda: random.randint(0, 1),
    "use_full_merge_v1": lambda: random.randint(0, 1),
    "use_merge": lambda: random.randint(0, 1),
    "verify_checksum": 1,
    "write_buffer_size": 4 * 1024 * 1024,
    "writepercent": 35,
    "format_version": lambda: random.randint(2, 4),
    "index_block_restart_interval": lambda: random.choice(range(1, 16)),
}
67
# Environment variable that, when set, pins the test databases to a fixed
# parent directory instead of a random temp directory.
_TEST_DIR_ENV_VAR = 'TEST_TMPDIR'


def get_dbname(test_name):
    """Return a fresh, empty database directory for *test_name*.

    With TEST_TMPDIR unset (or empty) a brand-new mkdtemp directory is used;
    otherwise a fixed path under TEST_TMPDIR is wiped and recreated so reruns
    always start clean.
    """
    root = os.environ.get(_TEST_DIR_ENV_VAR)
    if root:
        dbname = root + "/rocksdb_crashtest_" + test_name
        shutil.rmtree(dbname, True)
        os.mkdir(dbname)
    else:
        dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest_' + test_name)
    return dbname
80
81
def is_direct_io_supported(dbname):
    """Return True iff a file inside *dbname* can be opened with O_DIRECT.

    Used to decide whether the use_direct_* stress options are viable on the
    filesystem hosting the test database.
    """
    with tempfile.NamedTemporaryFile(dir=dbname) as f:
        try:
            fd = os.open(f.name, os.O_DIRECT)
        except (AttributeError, OSError):
            # AttributeError: platform has no os.O_DIRECT at all (e.g. macOS);
            # OSError: the filesystem rejected the O_DIRECT open (e.g. tmpfs).
            # (The original used a bare `except:`, hiding unrelated bugs.)
            return False
        # Fix: the original leaked this file descriptor on every probe.
        os.close(fd)
    return True
89
90
# Overrides applied on top of default_params for blackbox runs.
blackbox_default_params = {
    # total time for this script to test db_stress
    "duration": 6000,
    # time for one db_stress instance to run
    "interval": 120,
    # since we will be killing anyway, use large value for ops_per_thread
    "ops_per_thread": 100000000,
    "set_options_one_in": 10000,
    "test_batches_snapshots": 1,
}

# Overrides applied on top of default_params for whitebox runs.
whitebox_default_params = {
    "duration": 10000,
    "log2_keys_per_lock": 10,
    "ops_per_thread": 200000,
    # Odds used for kill_random_test in the kill modes (see
    # whitebox_crash_main); consumed by this script, not passed to db_stress.
    "random_kill_odd": 888887,
    "test_batches_snapshots": lambda: random.randint(0, 1),
}

# Overrides for --simple runs: a single column family, skip_list memtable,
# larger files and no batched snapshots.
simple_default_params = {
    "allow_concurrent_memtable_write": lambda: random.randint(0, 1),
    "column_families": 1,
    "max_background_compactions": 1,
    "max_bytes_for_level_base": 67108864,
    "memtablerep": "skip_list",
    "prefixpercent": 25,
    "readpercent": 25,
    "target_file_size_base": 16777216,
    "target_file_size_multiplier": 1,
    "test_batches_snapshots": 0,
    "write_buffer_size": 32 * 1024 * 1024,
}

# Extra overrides for simple + blackbox runs.
blackbox_simple_default_params = {
    "open_files": -1,
    "set_options_one_in": 0,
}

# No extra overrides are needed for simple + whitebox runs.
whitebox_simple_default_params = {}

# Overrides for --enable_atomic_flush runs.
atomic_flush_params = {
    "disable_wal": 1,
    "reopen": 0,
    "test_atomic_flush": 1,
    # use small value for write_buffer_size so that RocksDB triggers flush
    # more frequently
    "write_buffer_size": 1024 * 1024,
}
139
140
def finalize_and_sanitize(src_params):
    """Resolve callable params to concrete values and enforce dependencies.

    Returns a new dict; src_params is not modified.
    """
    # Materialize every lambda so the result holds plain values only.
    dest_params = {key: (val() if callable(val) else val)
                   for key, val in src_params.items()}
    # zstd dictionary training only makes sense with zstd compression and a
    # non-zero dictionary size.
    zstd_training_viable = (
        dest_params.get("compression_type") == "zstd"
        and dest_params.get("compression_max_dict_bytes") != 0)
    if not zstd_training_viable:
        dest_params["compression_zstd_max_train_bytes"] = 0
    # Concurrent memtable writes require the skip_list memtable.
    if dest_params.get("allow_concurrent_memtable_write", 1) == 1:
        dest_params["memtablerep"] = "skip_list"
    # Direct I/O is incompatible with mmap reads and needs filesystem
    # support (the probe is skipped entirely when mmap_read is on).
    if dest_params["mmap_read"] == 1 \
            or not is_direct_io_supported(dest_params["db"]):
        dest_params["use_direct_io_for_flush_and_compaction"] = 0
        dest_params["use_direct_reads"] = 0
    # Batched-snapshot mode does not exercise range deletions; fold the
    # range-delete share into the plain delete percentage.
    if dest_params.get("test_batches_snapshots") == 1:
        dest_params["delpercent"] += dest_params["delrangepercent"]
        dest_params["delrangepercent"] = 0
    return dest_params
157
158
def gen_cmd_params(args):
    """Build the effective db_stress parameter dict for this run.

    Layering, lowest to highest priority: default_params, the
    blackbox/whitebox defaults, the --simple overrides, the
    --enable_atomic_flush overrides, then any explicitly supplied
    command-line arguments.
    """
    params = dict(default_params)
    if args.test_type == 'blackbox':
        params.update(blackbox_default_params)
    elif args.test_type == 'whitebox':
        params.update(whitebox_default_params)
    if args.simple:
        params.update(simple_default_params)
        if args.test_type == 'blackbox':
            params.update(blackbox_simple_default_params)
        elif args.test_type == 'whitebox':
            params.update(whitebox_simple_default_params)
    if args.enable_atomic_flush:
        params.update(atomic_flush_params)

    # Explicit command-line values win over every default above.
    for name, value in vars(args).items():
        if value is not None:
            params[name] = value
    return params
180
181
def gen_cmd(params, unknown_params):
    """Turn a parameter dict into a db_stress argv list.

    Options consumed by this script itself are stripped, as are None
    values; unknown_params are forwarded to db_stress untouched.
    """
    script_only = set(['test_type', 'simple', 'duration', 'interval',
                       'random_kill_odd', 'enable_atomic_flush'])
    cmd = ['./db_stress']
    for name, value in finalize_and_sanitize(params).items():
        if name in script_only or value is None:
            continue
        cmd.append('--{0}={1}'.format(name, value))
    return cmd + unknown_params
190
191
# This script runs and kills db_stress multiple times. It checks consistency
# in case of unsafe crashes in RocksDB.
def blackbox_crash_main(args, unknown_args):
    """Run db_stress in a loop, killing it every `interval` seconds.

    Each iteration reuses the same database directory, so a surviving run
    verifies the data written before the previous kill.  Exits with status 2
    as soon as a run emits a non-WARNING line on stderr; on overall success
    the database directory is removed.
    """
    cmd_params = gen_cmd_params(args)
    dbname = get_dbname('blackbox')
    exit_time = time.time() + cmd_params['duration']

    print("Running blackbox-crash-test with \n"
          + "interval_between_crash=" + str(cmd_params['interval']) + "\n"
          + "total-duration=" + str(cmd_params['duration']) + "\n")

    while time.time() < exit_time:
        run_had_errors = False
        killtime = time.time() + cmd_params['interval']

        # NOTE: concatenating .items() lists is Python 2 only (dict views in
        # Python 3 do not support +).
        cmd = gen_cmd(dict(
            cmd_params.items() +
            {'db': dbname}.items()), unknown_args)

        child = subprocess.Popen(cmd, stderr=subprocess.PIPE)
        print("Running db_stress with pid=%d: %s\n\n"
              % (child.pid, ' '.join(cmd)))

        # Poll once a second until the kill deadline; the child may also exit
        # on its own (e.g. because of its internal `reopen` cycling).
        stop_early = False
        while time.time() < killtime:
            if child.poll() is not None:
                print("WARNING: db_stress ended before kill: exitcode=%d\n"
                      % child.returncode)
                stop_early = True
                break
            time.sleep(1)

        if not stop_early:
            # Re-check: the child may have exited between the last poll and
            # the deadline.
            if child.poll() is not None:
                print("WARNING: db_stress ended before kill: exitcode=%d\n"
                      % child.returncode)
            else:
                child.kill()
                print("KILLED %d\n" % child.pid)
                time.sleep(1)  # time to stabilize after a kill

        # Any non-WARNING stderr output from db_stress is treated as a
        # consistency failure.
        while True:
            line = child.stderr.readline().strip()
            if line == '':
                break
            elif not line.startswith('WARNING'):
                run_had_errors = True
                print('stderr has error message:')
                print('***' + line + '***')

        if run_had_errors:
            sys.exit(2)

        time.sleep(1)  # time to stabilize before the next run

    # we need to clean up after ourselves -- only do this on test success
    shutil.rmtree(dbname, True)
249
250
# This python script runs db_stress multiple times. Some runs with
# kill_random_test that causes rocksdb to crash at various points in code.
def whitebox_crash_main(args, unknown_args):
    """Repeatedly run db_stress, alternating crash-injection and clean modes.

    For the first half of the total duration only check_mode 0 runs
    (kill-based crash testing, cycling through three kill_random_test
    settings); after half_time the four check modes rotate:
    0 = kill testing, 1 = universal compaction, 2 = FIFO compaction,
    3 = plain run.  Exits non-zero on an unexpected exit code or on
    error/fail text in the db_stress output.
    """
    cmd_params = gen_cmd_params(args)
    dbname = get_dbname('whitebox')

    cur_time = time.time()
    exit_time = cur_time + cmd_params['duration']
    half_time = cur_time + cmd_params['duration'] / 2

    print("Running whitebox-crash-test with \n"
          + "total-duration=" + str(cmd_params['duration']) + "\n")

    total_check_mode = 4
    check_mode = 0
    kill_random_test = cmd_params['random_kill_odd']
    kill_mode = 0

    while time.time() < exit_time:
        if check_mode == 0:
            additional_opts = {
                # use large ops per thread since we will kill it anyway
                "ops_per_thread": 100 * cmd_params['ops_per_thread'],
            }
            # run with kill_random_test, with three modes.
            # Mode 0 covers all kill points. Mode 1 covers less kill points
            # but increases chance of triggering them. Mode 2 covers even less
            # frequent kill points and further increases triggering chance.
            if kill_mode == 0:
                additional_opts.update({
                    "kill_random_test": kill_random_test,
                })
            elif kill_mode == 1:
                # Lower odds -> kills fire more often; blacklist the most
                # frequent write-path kill points so rarer ones get hit.
                additional_opts.update({
                    "kill_random_test": (kill_random_test / 10 + 1),
                    "kill_prefix_blacklist": "WritableFileWriter::Append,"
                    + "WritableFileWriter::WriteBuffered",
                })
            elif kill_mode == 2:
                # TODO: May need to adjust random odds if kill_random_test
                # is too small.
                additional_opts.update({
                    "kill_random_test": (kill_random_test / 5000 + 1),
                    "kill_prefix_blacklist": "WritableFileWriter::Append,"
                    "WritableFileWriter::WriteBuffered,"
                    "PosixMmapFile::Allocate,WritableFileWriter::Flush",
                })
            # Run kill mode 0, 1 and 2 by turn.
            kill_mode = (kill_mode + 1) % 3
        elif check_mode == 1:
            # normal run with universal compaction mode
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'],
                "compaction_style": 1,
            }
        elif check_mode == 2:
            # normal run with FIFO compaction mode
            # ops_per_thread is divided by 5 because FIFO compaction
            # style is quite a bit slower on reads with lot of files
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'] / 5,
                "compaction_style": 2,
            }
        else:
            # normal run
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'],
            }

        # NOTE: concatenating .items() lists is Python 2 only.
        cmd = gen_cmd(dict(cmd_params.items() + additional_opts.items()
                           + {'db': dbname}.items()), unknown_args)

        print "Running:" + ' '.join(cmd) + "\n"  # noqa: E999 T25377293 Grandfathered in

        popen = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
        stdoutdata, stderrdata = popen.communicate()
        retncode = popen.returncode
        msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
            check_mode, additional_opts['kill_random_test'], retncode))
        print msg
        print stdoutdata

        expected = False
        if additional_opts['kill_random_test'] is None and (retncode == 0):
            # we expect zero retncode if no kill option
            expected = True
        elif additional_opts['kill_random_test'] is not None and retncode < 0:
            # we expect negative retncode if kill option was given
            expected = True

        if not expected:
            print "TEST FAILED. See kill option and exit code above!!!\n"
            sys.exit(1)

        # Scan the combined stdout/stderr for error indications; the
        # "got errors 0 times" summary line contains "error" but is benign.
        stdoutdata = stdoutdata.lower()
        errorcount = (stdoutdata.count('error') -
                      stdoutdata.count('got errors 0 times'))
        print "#times error occurred in output is " + str(errorcount) + "\n"

        if (errorcount > 0):
            print "TEST FAILED. Output has 'error'!!!\n"
            sys.exit(2)
        if (stdoutdata.find('fail') >= 0):
            print "TEST FAILED. Output has 'fail'!!!\n"
            sys.exit(2)

        # First half of the duration, keep doing kill test. For the next half,
        # try different modes.
        if time.time() > half_time:
            # we need to clean up after ourselves -- only do this on test
            # success
            shutil.rmtree(dbname, True)
            os.mkdir(dbname)
            # The old expected-values file no longer matches the wiped DB.
            cmd_params.pop('expected_values_path', None)
            check_mode = (check_mode + 1) % total_check_mode

        time.sleep(1)  # time to stabilize after a kill
372
373
def main():
    """Parse arguments and dispatch to the blackbox or whitebox driver.

    Every key in the parameter dicts is also exposed as an optional
    command-line flag (typed from its default value); anything argparse does
    not recognize is forwarded verbatim to db_stress.
    """
    parser = argparse.ArgumentParser(description="This script runs and kills \
        db_stress multiple times")
    parser.add_argument("test_type", choices=["blackbox", "whitebox"])
    parser.add_argument("--simple", action="store_true")
    parser.add_argument("--enable_atomic_flush", action='store_true')

    # NOTE: concatenating .items() lists is Python 2 only.
    all_params = dict(default_params.items()
                      + blackbox_default_params.items()
                      + whitebox_default_params.items()
                      + simple_default_params.items()
                      + blackbox_simple_default_params.items()
                      + whitebox_simple_default_params.items())

    # Callables are invoked once here just to derive the flag's type.
    for k, v in all_params.items():
        parser.add_argument("--" + k, type=type(v() if callable(v) else v))
    # unknown_args are passed directly to db_stress
    args, unknown_args = parser.parse_known_args()

    # Fail fast on a bogus TEST_TMPDIR rather than inside get_dbname().
    test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
    if test_tmpdir is not None and not os.path.isdir(test_tmpdir):
        print('%s env var is set to a non-existent directory: %s' %
              (_TEST_DIR_ENV_VAR, test_tmpdir))
        sys.exit(1)

    if args.test_type == 'blackbox':
        blackbox_crash_main(args, unknown_args)
    if args.test_type == 'whitebox':
        whitebox_crash_main(args, unknown_args)

if __name__ == '__main__':
    main()