# params overwrite priority:
#   for default:
#       default_params < {blackbox,whitebox}_default_params < args
#   for simple:
#       default_params < {blackbox,whitebox}_default_params <
#       simple_default_params <
#       {blackbox,whitebox}_simple_default_params < args
#   for enable_atomic_flush:
#       default_params < {blackbox,whitebox}_default_params <
#       atomic_flush_params < args
# Scratch file that db_stress uses to persist the expected DB state
# across kills/restarts; lives for the duration of this script.
expected_values_file = tempfile.NamedTemporaryFile()
25 "acquire_snapshot_one_in": 10000,
27 "cache_size": 1048576,
28 "checkpoint_one_in": 1000000,
29 "compression_type": "snappy",
30 "compression_max_dict_bytes": lambda: 16384 * random
.randint(0, 1),
31 "compression_zstd_max_train_bytes": lambda: 65536 * random
.randint(0, 1),
32 "clear_column_family_one_in": 0,
33 "compact_files_one_in": 1000000,
34 "compact_range_one_in": 1000000,
37 "destroy_db_initially": 0,
38 "enable_pipelined_write": lambda: random
.randint(0, 1),
39 "expected_values_path": expected_values_file
.name
,
40 "flush_one_in": 1000000,
41 "max_background_compactions": 20,
42 "max_bytes_for_level_base": 10485760,
44 "max_write_buffer_number": 3,
45 "mmap_read": lambda: random
.randint(0, 1),
46 "nooverwritepercent": 1,
49 "progress_reports": 0,
51 "recycle_log_file_num": lambda: random
.randint(0, 1),
53 "snapshot_hold_ops": 100000,
54 "subcompactions": lambda: random
.randint(1, 4),
55 "target_file_size_base": 2097152,
56 "target_file_size_multiplier": 2,
57 "use_direct_reads": lambda: random
.randint(0, 1),
58 "use_direct_io_for_flush_and_compaction": lambda: random
.randint(0, 1),
59 "use_full_merge_v1": lambda: random
.randint(0, 1),
60 "use_merge": lambda: random
.randint(0, 1),
62 "write_buffer_size": 4 * 1024 * 1024,
64 "format_version": lambda: random
.randint(2, 4),
65 "index_block_restart_interval": lambda: random
.choice(range(1, 16)),
68 _TEST_DIR_ENV_VAR
= 'TEST_TMPDIR'
def get_dbname(test_name):
    """Return a fresh database directory path for this test run.

    When $TEST_TMPDIR is set, the path is deterministic under that root
    and any leftovers from a previous run are removed first; otherwise a
    brand-new mkdtemp() directory is used.

    NOTE(review): the paste lost the `else:` line and the return of this
    function; reconstructed from the surrounding structure (callers do
    `dbname = get_dbname(...)`) — verify against upstream.
    """
    test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
    if test_tmpdir is None or test_tmpdir == "":
        dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest_' + test_name)
    else:
        dbname = test_tmpdir + "/rocksdb_crashtest_" + test_name
        # Best-effort removal of a stale directory (ignore_errors=True).
        shutil.rmtree(dbname, True)
    return dbname
def is_direct_io_supported(dbname):
    """Probe whether files under *dbname* can be opened with O_DIRECT.

    Returns False on platforms lacking ``os.O_DIRECT`` (the attribute
    lookup raises inside the try) or on filesystems that reject direct
    I/O; True otherwise.

    NOTE(review): the try/except/return lines were lost in the paste;
    reconstructed — verify against upstream. Also closes the probe fd,
    which would otherwise leak.
    """
    with tempfile.NamedTemporaryFile(dir=dbname) as f:
        try:
            fd = os.open(f.name, os.O_DIRECT)
        except BaseException:
            return False
        os.close(fd)  # don't leak the probe descriptor
        return True
# Overrides applied on top of default_params for blackbox runs.
# NOTE(review): the "duration" (total script time) and "interval" (time
# per db_stress instance) entries the comments refer to were lost in
# this paste — restore their values from upstream.
blackbox_default_params = {
    # total time for this script to test db_stress
    # time for one db_stress instance to run
    # since we will be killing anyway, use large value for ops_per_thread
    "ops_per_thread": 100000000,
    "set_options_one_in": 10000,
    "test_batches_snapshots": 1,
}
# Overrides applied on top of default_params for whitebox runs.
# NOTE(review): one entry (original line 103, likely "duration") was
# lost in this paste — restore it from upstream.
whitebox_default_params = {
    "log2_keys_per_lock": 10,
    "ops_per_thread": 200000,
    "random_kill_odd": 888887,
    "test_batches_snapshots": lambda: random.randint(0, 1),
}
# Overrides used when --simple is given: single column family, reduced
# background work, larger files/buffers.
# NOTE(review): entries from original lines 116-117 were lost in this
# paste — verify the full list against upstream.
simple_default_params = {
    "allow_concurrent_memtable_write": lambda: random.randint(0, 1),
    "column_families": 1,
    "max_background_compactions": 1,
    "max_bytes_for_level_base": 67108864,
    "memtablerep": "skip_list",
    "target_file_size_base": 16777216,
    "target_file_size_multiplier": 1,
    "test_batches_snapshots": 0,
    "write_buffer_size": 32 * 1024 * 1024,
}
# Extra overrides for --simple blackbox runs.
# NOTE(review): an entry (original line 125) was lost in this paste —
# verify against upstream.
blackbox_simple_default_params = {
    "set_options_one_in": 0,
}
# Simple whitebox runs need no overrides beyond simple_default_params.
whitebox_simple_default_params = {}
# Overrides used when --enable_atomic_flush is given.
# NOTE(review): entries from original lines 132-133 were lost in this
# paste — verify against upstream.
atomic_flush_params = {
    "test_atomic_flush": 1,
    # use small value for write_buffer_size so that RocksDB triggers flush
    # more frequently
    "write_buffer_size": 1024 * 1024,
}
def finalize_and_sanitize(src_params):
    """Materialize lambda-valued params and reconcile conflicting options.

    Returns a new dict; *src_params* is not modified. Callable values
    are invoked so each run gets a fresh random sample.
    """
    dest_params = {k: v() if callable(v) else v
                   for k, v in src_params.items()}
    # zstd dictionary training is meaningless without zstd + a dictionary.
    if dest_params.get("compression_type") != "zstd" or \
            dest_params.get("compression_max_dict_bytes") == 0:
        dest_params["compression_zstd_max_train_bytes"] = 0
    # Only skip_list supports concurrent memtable writes.
    if dest_params.get("allow_concurrent_memtable_write", 1) == 1:
        dest_params["memtablerep"] = "skip_list"
    # Direct I/O cannot be combined with mmap reads, and is disabled when
    # the target filesystem rejects O_DIRECT.
    # NOTE(review): the argument line of is_direct_io_supported() was lost
    # in the paste; upstream passes dest_params["db"] — verify.
    if dest_params["mmap_read"] == 1 or not is_direct_io_supported(
            dest_params["db"]):
        dest_params["use_direct_io_for_flush_and_compaction"] = 0
        dest_params["use_direct_reads"] = 0
    # Batched snapshot tests do not support range deletions; fold the
    # range-delete share into plain deletes.
    if dest_params.get("test_batches_snapshots") == 1:
        dest_params["delpercent"] += dest_params["delrangepercent"]
        dest_params["delrangepercent"] = 0
    return dest_params
def gen_cmd_params(args):
    """Build the merged parameter dict for one run.

    Later update() calls override earlier ones, implementing the priority
    chain documented at the top of the file; explicit command-line values
    (non-None attributes of *args*) win over everything.

    NOTE(review): the paste lost `params = {}`, the `if args.simple:`
    guard around the simple_* updates, the args-override loop body, and
    the return — reconstructed; verify against upstream.
    """
    params = {}

    params.update(default_params)
    if args.test_type == 'blackbox':
        params.update(blackbox_default_params)
    if args.test_type == 'whitebox':
        params.update(whitebox_default_params)
    if args.simple:
        params.update(simple_default_params)
        if args.test_type == 'blackbox':
            params.update(blackbox_simple_default_params)
        if args.test_type == 'whitebox':
            params.update(whitebox_simple_default_params)
    if args.enable_atomic_flush:
        params.update(atomic_flush_params)

    # Explicit command-line arguments take the highest priority.
    for k, v in vars(args).items():
        if v is not None:
            params[k] = v
    return params
def gen_cmd(params, unknown_params):
    """Build the db_stress argv list from finalized *params*.

    Options consumed only by this script are filtered out, None values
    are dropped, and *unknown_params* are passed through verbatim.

    NOTE(review): the paste lost the return statement — reconstructed.
    """
    # Hoisted out of the comprehension so the set is built once, not per key.
    script_only = set(['test_type', 'simple', 'duration', 'interval',
                       'random_kill_odd', 'enable_atomic_flush'])
    cmd = ['./db_stress'] + [
        '--{0}={1}'.format(k, v)
        for k, v in finalize_and_sanitize(params).items()
        if k not in script_only and v is not None] + unknown_params
    return cmd
# This script runs and kills db_stress multiple times. It checks consistency
# in case of unsafe crashes in RocksDB.
def blackbox_crash_main(args, unknown_args):
    """Repeatedly run db_stress, kill it mid-run, and scan stderr for errors.

    Each iteration launches db_stress against the same db directory,
    lets it run for cmd_params['interval'] seconds, kills it, then fails
    the test (exit code 2) if stderr carries any non-WARNING line.

    NOTE(review): many lines of this function were lost in the paste;
    the control flow below is reconstructed — verify against the
    upstream tools/db_crashtest.py before relying on it.
    """
    cmd_params = gen_cmd_params(args)
    dbname = get_dbname('blackbox')
    exit_time = time.time() + cmd_params['duration']

    print("Running blackbox-crash-test with \n"
          + "interval_between_crash=" + str(cmd_params['interval']) + "\n"
          + "total-duration=" + str(cmd_params['duration']) + "\n")

    while time.time() < exit_time:
        run_had_errors = False
        killtime = time.time() + cmd_params['interval']

        cmd = gen_cmd(dict(list(cmd_params.items())
                           + list({'db': dbname}.items())), unknown_args)

        child = subprocess.Popen(cmd, stderr=subprocess.PIPE)
        print("Running db_stress with pid=%d: %s\n\n"
              % (child.pid, ' '.join(cmd)))

        # Let it run until the kill deadline, unless it exits on its own.
        stop_early = False
        while time.time() < killtime:
            if child.poll() is not None:
                print("WARNING: db_stress ended before kill: exitcode=%d\n"
                      % child.returncode)
                stop_early = True
                break
            time.sleep(1)

        if not stop_early:
            if child.poll() is not None:
                print("WARNING: db_stress ended before kill: exitcode=%d\n"
                      % child.returncode)
            else:
                child.kill()
                print("KILLED %d\n" % child.pid)
                time.sleep(1)  # time to stabilize after a kill

        # Any stderr output other than WARNING lines is a failure.
        while True:
            line = child.stderr.readline().strip()
            if line == '':
                break
            elif not line.startswith('WARNING'):
                run_had_errors = True
                print('stderr has error message:')
                print('***' + line + '***')

        if run_had_errors:
            sys.exit(2)

        time.sleep(1)  # time to stabilize before the next run

    # we need to clean up after ourselves -- only do this on test success
    shutil.rmtree(dbname, True)
# This python script runs db_stress multiple times. Some runs with
# kill_random_test that causes rocksdb to crash at various points in code.
def whitebox_crash_main(args, unknown_args):
    """Cycle db_stress through kill-testing and several compaction modes.

    The first half of the total duration stays in check_mode 0 (kill
    tests, rotating kill_mode 0/1/2 with progressively fewer but more
    likely kill points); the second half cycles through normal runs with
    universal, FIFO, and default compaction.

    NOTE(review): portions of this function were lost in the paste; the
    control flow is reconstructed — verify against upstream. Python-2
    `print` statements were converted to the function form used by the
    rest of the file; note `popen.communicate()` output handling still
    assumes Python-2 str semantics.
    """
    cmd_params = gen_cmd_params(args)
    dbname = get_dbname('whitebox')

    cur_time = time.time()
    exit_time = cur_time + cmd_params['duration']
    half_time = cur_time + cmd_params['duration'] / 2

    print("Running whitebox-crash-test with \n"
          + "total-duration=" + str(cmd_params['duration']) + "\n")

    total_check_mode = 4
    check_mode = 0
    kill_random_test = cmd_params['random_kill_odd']
    kill_mode = 0

    while time.time() < exit_time:
        if check_mode == 0:
            additional_opts = {
                # use large ops per thread since we will kill it anyway
                "ops_per_thread": 100 * cmd_params['ops_per_thread'],
            }
            # run with kill_random_test, with three modes.
            # Mode 0 covers all kill points. Mode 1 covers less kill points but
            # increases change of triggering them. Mode 2 covers even less
            # frequent kill points and further increases triggering change.
            if kill_mode == 0:
                additional_opts.update({
                    "kill_random_test": kill_random_test,
                })
            elif kill_mode == 1:
                additional_opts.update({
                    # // keeps the odds an int under Python 3 as well
                    "kill_random_test": (kill_random_test // 10 + 1),
                    "kill_prefix_blacklist": "WritableFileWriter::Append,"
                    + "WritableFileWriter::WriteBuffered",
                })
            elif kill_mode == 2:
                # TODO: May need to adjust random odds if kill_random_test
                # is too small.
                additional_opts.update({
                    "kill_random_test": (kill_random_test // 5000 + 1),
                    "kill_prefix_blacklist": "WritableFileWriter::Append,"
                    "WritableFileWriter::WriteBuffered,"
                    "PosixMmapFile::Allocate,WritableFileWriter::Flush",
                })
            # Run kill mode 0, 1 and 2 by turn.
            kill_mode = (kill_mode + 1) % 3
        elif check_mode == 1:
            # normal run with universal compaction mode
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'],
                "compaction_style": 1,
            }
        elif check_mode == 2:
            # normal run with FIFO compaction mode
            # ops_per_thread is divided by 5 because FIFO compaction
            # style is quite a bit slower on reads with lot of files
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'] // 5,
                "compaction_style": 2,
            }
        else:
            # normal run
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'],
            }

        cmd = gen_cmd(dict(list(cmd_params.items())
                           + list(additional_opts.items())
                           + list({'db': dbname}.items())), unknown_args)

        print("Running:" + ' '.join(cmd) + "\n")

        popen = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
        stdoutdata, stderrdata = popen.communicate()
        retncode = popen.returncode
        msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
            check_mode, additional_opts['kill_random_test'], retncode))
        print(msg)
        print(stdoutdata)

        expected = False
        if additional_opts['kill_random_test'] is None and (retncode == 0):
            # we expect zero retncode if no kill option
            expected = True
        elif additional_opts['kill_random_test'] is not None and retncode < 0:
            # we expect negative retncode if kill option was given
            expected = True

        if not expected:
            print("TEST FAILED. See kill option and exit code above!!!\n")
            sys.exit(1)

        stdoutdata = stdoutdata.lower()
        errorcount = (stdoutdata.count('error') -
                      stdoutdata.count('got errors 0 times'))
        print("#times error occurred in output is " + str(errorcount) + "\n")

        if errorcount > 0:
            print("TEST FAILED. Output has 'error'!!!\n")
            sys.exit(2)
        if stdoutdata.find('fail') >= 0:
            print("TEST FAILED. Output has 'fail'!!!\n")
            sys.exit(2)

        # First half of the duration, keep doing kill test. For the next half,
        # try different modes.
        if time.time() > half_time:
            # we need to clean up after ourselves -- only do this on test
            # success
            shutil.rmtree(dbname, True)
            cmd_params.pop('expected_values_path', None)
            check_mode = (check_mode + 1) % total_check_mode

        time.sleep(1)  # time to stabilize after a kill
def main():
    """Parse arguments and dispatch to the blackbox or whitebox driver.

    NOTE(review): the `def main():` header and a few lines (notably the
    sys.exit after the bad-TEST_TMPDIR message) were lost in this paste;
    reconstructed — verify against upstream.
    """
    parser = argparse.ArgumentParser(
        description="This script runs and kills db_stress multiple times")
    parser.add_argument("test_type", choices=["blackbox", "whitebox"])
    parser.add_argument("--simple", action="store_true")
    parser.add_argument("--enable_atomic_flush", action='store_true')

    # Expose every known stress-test parameter as a typed CLI flag; the
    # flag's type is inferred from the (sampled) default value.
    all_params = dict(list(default_params.items())
                      + list(blackbox_default_params.items())
                      + list(whitebox_default_params.items())
                      + list(simple_default_params.items())
                      + list(blackbox_simple_default_params.items())
                      + list(whitebox_simple_default_params.items()))

    for k, v in all_params.items():
        parser.add_argument("--" + k, type=type(v() if callable(v) else v))
    # unknown_args are passed directly to db_stress
    args, unknown_args = parser.parse_known_args()

    test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
    if test_tmpdir is not None and not os.path.isdir(test_tmpdir):
        print('%s env var is set to a non-existent directory: %s' %
              (_TEST_DIR_ENV_VAR, test_tmpdir))
        sys.exit(1)

    if args.test_type == 'blackbox':
        blackbox_crash_main(args, unknown_args)
    if args.test_type == 'whitebox':
        whitebox_crash_main(args, unknown_args)
# Script entry point. NOTE(review): the paste lost the main() call under
# this guard — reconstructed.
if __name__ == '__main__':
    main()