"""
Thrash -- Simulate random osd failures.
"""
import contextlib
import logging

from tasks import ceph_manager
from teuthology import misc as teuthology


log = logging.getLogger(__name__)
@contextlib.contextmanager
def task(ctx, config):
    """
    "Thrash" the OSDs by randomly marking them out/down (and then back
    in) until the task is ended. This loops, and every op_delay
    seconds it randomly chooses to add or remove an OSD (even odds)
    unless there are fewer than min_out OSDs out of the cluster, or
    more than min_in OSDs in the cluster.

    All commands are run on mon0 and it stops when __exit__ is called.

    The config is optional, and is a dict containing some or all of:

    cluster: (default 'ceph') the name of the cluster to thrash

    min_in: (default 4) the minimum number of OSDs to keep in the
       cluster

    min_out: (default 0) the minimum number of OSDs to keep out of the
       cluster

    op_delay: (5) the length of time to sleep between changing an
       OSD's status

    min_dead: (0) minimum number of osds to leave down/dead.

    max_dead: (0) maximum number of osds to leave down/dead before waiting
       for clean.  This should probably be num_replicas - 1.

    clean_interval: (60) the approximate length of time to loop before
       waiting until the cluster goes clean. (In reality this is used
       to probabilistically choose when to wait, and the method used
       makes it closer to -- but not identical to -- the half-life.)

    scrub_interval: (-1) the approximate length of time to loop before
       waiting until a scrub is performed while cleaning. (In reality
       this is used to probabilistically choose when to wait, and it
       only applies to the cases where cleaning is being performed).
       -1 is used to indicate that no scrubbing will be done.

    chance_down: (0.4) the probability that the thrasher will mark an
       OSD down rather than marking it out. (The thrasher will not
       consider that OSD out of the cluster, since presently an OSD
       wrongly marked down will mark itself back up again.) This value
       can be either an integer (eg, 75) or a float probability (eg
       0.75).

    chance_test_min_size: (0) chance to run test_pool_min_size,
       which:
       - kills all but one osd
       - waits
       - kills that osd
       - revives all other osds
       - verifies that the osds fully recover

    timeout: (360) the number of seconds to wait for the cluster
       to become clean after each cluster change. If this doesn't
       happen within the timeout, an exception will be raised.

    revive_timeout: (150) number of seconds to wait for an osd asok to
       appear after attempting to revive the osd

    thrash_primary_affinity: (true) randomly adjust primary-affinity

    chance_pgnum_grow: (0) chance to increase a pool's size
    chance_pgpnum_fix: (0) chance to adjust pgpnum to pg for a pool
    pool_grow_by: (10) amount to increase pgnum by
    chance_pgnum_shrink: (0) chance to decrease a pool's size
    pool_shrink_by: (10) amount to decrease pgnum by
    max_pgs_per_pool_osd: (1200) don't expand pools past this size per osd

    pause_short: (3) duration of short pause
    pause_long: (80) duration of long pause
    pause_check_after: (50) assert osd down after this long
    chance_inject_pause_short: (1) chance of injecting short stall
    chance_inject_pause_long: (0) chance of injecting long stall

    clean_wait: (0) duration to wait before resuming thrashing once clean

    sighup_delay: (0.1) duration to delay between sending signal.SIGHUP to a
       random live osd

    powercycle: (false) whether to power cycle the node instead
       of just the osd process. Note that this assumes that a single
       osd is the only important process on the node.

    bdev_inject_crash: (0) seconds to delay while inducing a synthetic crash.
       the delay lets the BlockDevice "accept" more aio operations but blocks
       any flush, and then eventually crashes (losing some or all ios).  If 0,
       no bdev failure injection is enabled.

    bdev_inject_crash_probability: (.5) probability of doing a bdev failure
       injection crash vs a normal OSD kill.

    chance_test_backfill_full: (0) chance to simulate full disks stopping
       backfill

    chance_test_map_discontinuity: (0) chance to test map discontinuity
    map_discontinuity_sleep_time: (40) time to wait for map trims

    ceph_objectstore_tool: (true) whether to export/import a pg while an osd is down
    chance_move_pg: (1.0) chance of moving a pg if more than 1 osd is down (default 100%)

    optrack_toggle_delay: (2.0) duration to delay between toggling op tracker
       enablement to all osds

    dump_ops_enable: (true) continuously dump ops on all live osds

    noscrub_toggle_delay: (2.0) duration to delay between toggling noscrub

    disable_objectstore_tool_tests: (false) disable ceph_objectstore_tool based
       tests

    chance_thrash_cluster_full: .05

    chance_thrash_pg_upmap: 1.0
    chance_thrash_pg_upmap_items: 1.0

    aggressive_pg_num_changes: (true) whether we should bypass the careful throttling of pg_num and pgp_num changes in mgr's adjust_pgs() controller

    example:

    tasks:
    - ceph:
    - thrashosds:
        cluster: ceph
        chance_down: 10
        op_delay: 3
        min_in: 1
        timeout: 600
    - interactive:
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'thrashosds task only accepts a dict for configuration'
    # add default value for sighup_delay
    config['sighup_delay'] = config.get('sighup_delay', 0.1)
    # add default value for optrack_toggle_delay
    config['optrack_toggle_delay'] = config.get('optrack_toggle_delay', 2.0)
    # add default value for dump_ops_enable
    config['dump_ops_enable'] = config.get('dump_ops_enable', "true")
    # add default value for noscrub_toggle_delay
    config['noscrub_toggle_delay'] = config.get('noscrub_toggle_delay', 2.0)
    # add default value for random_eio
    config['random_eio'] = config.get('random_eio', 0.0)

    log.info("config is {config}".format(config=str(config)))

    overrides = ctx.config.get('overrides', {})
    log.info("overrides is {overrides}".format(overrides=str(overrides)))
    teuthology.deep_merge(config, overrides.get('thrashosds', {}))
    cluster = config.get('cluster', 'ceph')
    # NOTE: read aggro only after deep_merge so that an
    # overrides: thrashosds: aggressive_pg_num_changes setting is honored;
    # previously it was read before the merge and overrides were ignored.
    aggro = config.get('aggressive_pg_num_changes', True)

    log.info("config is {config}".format(config=str(config)))

    if 'powercycle' in config:

        # sync everyone first to avoid collateral damage to / etc.
        log.info('Doing preliminary sync to avoid collateral damage...')
        ctx.cluster.run(args=['sync'])

        if 'ipmi_user' in ctx.teuthology_config:
            for remote in ctx.cluster.remotes.keys():
                log.debug('checking console status of %s' % remote.shortname)
                if not remote.console.check_status():
                    log.warning('Failed to get console status for %s',
                                remote.shortname)

            # check that all osd remotes have a valid console
            osds = ctx.cluster.only(teuthology.is_type('osd', cluster))
            for remote in osds.remotes.keys():
                if not remote.console.has_ipmi_credentials:
                    raise Exception(
                        'IPMI console required for powercycling, '
                        'but not available on osd role: {r}'.format(
                            r=remote.name))

    cluster_manager = ctx.managers[cluster]
    # propagate selected knobs into the manager so the thrasher sees them
    for f in ['powercycle', 'bdev_inject_crash']:
        if config.get(f):
            cluster_manager.config[f] = config.get(f)

    if aggro:
        # bypass mgr's careful pg_num/pgp_num throttling for the test run
        cluster_manager.raw_cluster_cmd(
            'config', 'set', 'mgr',
            'mgr_debug_aggressive_pg_num_changes',
            'true')

    log.info('Beginning thrashosds...')
    thrash_proc = ceph_manager.OSDThrasher(
        cluster_manager,
        config,
        "OSDThrasher",
        logger=log.getChild('thrasher')
        )
    ctx.ceph[cluster].thrashers.append(thrash_proc)
    try:
        yield
    finally:
        # stop thrashing, then wait for the cluster to settle before exiting
        log.info('joining thrashosds')
        thrash_proc.do_join()
        cluster_manager.wait_for_all_osds_up()
        cluster_manager.flush_all_pg_stats()
        cluster_manager.wait_for_recovery(config.get('timeout', 360))
        if aggro:
            # restore the mgr debug option we set above
            cluster_manager.raw_cluster_cmd(
                'config', 'rm', 'mgr',
                'mgr_debug_aggressive_pg_num_changes')