1 | """ |
2 | Dump_stuck command | |
3 | """ | |
4 | import logging | |
7c673cae FG |
5 | import time |
6 | ||
e306af50 | 7 | from tasks import ceph_manager |
7c673cae FG |
8 | from teuthology import misc as teuthology |
9 | ||
10 | ||
11 | log = logging.getLogger(__name__) | |

def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
    """
    Do checks. Make sure get_stuck_pgs returns the right amount of information, then
    extract health information from the raw_cluster_cmd and compare the results with
    the values passed in. This passes if all asserts pass.

    :param manager: Ceph manager
    :param num_inactive: number of inactive PGs that are stuck
    :param num_unclean: number of unclean PGs that are stuck
    :param num_stale: number of stale PGs that are stuck
    :param timeout: timeout value for get_stuck_pgs calls
    """
    inactive = manager.get_stuck_pgs('inactive', timeout)
    unclean = manager.get_stuck_pgs('unclean', timeout)
    stale = manager.get_stuck_pgs('stale', timeout)
    log.info('inactive %d / %d, unclean %d / %d, stale %d / %d',
             len(inactive), num_inactive,
             len(unclean), num_unclean,
             len(stale), num_stale)
    assert len(inactive) == num_inactive
    assert len(unclean) == num_unclean
    assert len(stale) == num_stale

def task(ctx, config):
    """
    Test the dump_stuck command.

    :param ctx: Context
    :param config: Configuration
    """
    assert config is None, \
        'dump_stuck requires no configuration'
    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
        'dump_stuck requires exactly 2 osds'
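    # Exactly two OSDs are needed: the test walks the cluster through each
    # stuck state by taking those two OSDs down one at a time.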

    timeout = 60
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )
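    # CephManager (from tasks/ceph_manager.py) drives ceph commands against
    # the cluster via the mon remote; the get_stuck_pgs and raw_cluster_cmd
    # calls below go through it.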

    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    manager.raw_cluster_cmd('tell', 'mon.a', 'injectargs', '--',
                            # '--mon-osd-report-timeout 90',
                            '--mon-pg-stuck-threshold 10')
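    # Assumption: mon_pg_stuck_threshold is in seconds; lowering it to 10
    # makes the mon flag stuck PGs quickly, so the check_stuck() polls below
    # do not have to wait out the (much longer) default.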

    # all active+clean
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
    num_pgs = manager.get_num_pgs()
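    # Baseline PG count while healthy; the unclean/stale checks below expect
    # every one of these PGs to be reported stuck.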

    manager.mark_out_osd(0)
    time.sleep(timeout)
    manager.flush_pg_stats([1])
    manager.wait_for_recovery(timeout)
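    # osd.0 is out but still up: PGs remap to osd.1 and, once recovery
    # completes, return to active+clean (possibly remapped), so nothing
    # should be stuck.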

    # all active+clean+remapped
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )

    manager.mark_in_osd(0)
    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    # all active+clean
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )

    log.info('stopping first osd')
    manager.kill_osd(0)
    manager.mark_down_osd(0)
    manager.wait_for_active(timeout)
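    # With osd.0 down (one of only two OSDs), PGs stay active but can no
    # longer be active+clean, so every PG should eventually show up as
    # stuck unclean.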

    log.info('waiting for all to be unclean')
    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=num_pgs,
                num_stale=0,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become unclean
            if time.time() - starttime > 900:
                raise

    log.info('stopping second osd')
    manager.kill_osd(1)
    manager.mark_down_osd(1)

    log.info('waiting for all to be stale')
    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=num_pgs,
                num_stale=num_pgs,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become stale
            if time.time() - starttime > 900:
                raise

    log.info('reviving')
    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.revive_osd(id_)
        manager.mark_in_osd(id_)
    while True:
        try:
            manager.flush_pg_stats([0, 1])
            break
        except Exception:
            log.exception('osds must not be started yet, waiting...')
            time.sleep(1)
    manager.wait_for_clean(timeout)
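    # Fully recovered: no PG should be stuck in any of the three categories.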

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )