1 | """ |
2 | Test if we can recover the leveldb from OSD after where all leveldbs are | |
3 | corrupted | |
4 | """ | |

import logging
import os.path
import shutil
import tempfile

import ceph_manager
from teuthology import misc as teuthology

log = logging.getLogger(__name__)


def _push_directory(path, remote, remote_dir):
    """
    local_temp_path=`mktemp`
    tar czf $local_temp_path $path
    ssh remote mkdir -p $remote_dir
    remote_temp_path=`mktemp`
    scp $local_temp_path $remote_temp_path
    rm $local_temp_path
    tar xzf $remote_temp_path -C $remote_dir
    ssh remote rm $remote_temp_path
    """
    fd, local_temp_path = tempfile.mkstemp(suffix='.tgz',
                                           prefix='rebuild_mondb-')
    os.close(fd)
    cmd = ' '.join(['tar', 'cz',
                    '-f', local_temp_path,
                    '-C', path,
                    '--', '.'])
    teuthology.sh(cmd)
    _, fname = os.path.split(local_temp_path)
    fd, remote_temp_path = tempfile.mkstemp(suffix='.tgz',
                                            prefix='rebuild_mondb-')
    os.close(fd)
    remote.put_file(local_temp_path, remote_temp_path)
    os.remove(local_temp_path)
    remote.run(args=['sudo',
                     'tar', 'xz',
                     '-C', remote_dir,
                     '-f', remote_temp_path])
    remote.run(args=['sudo', 'rm', '-fr', remote_temp_path])


def _nuke_mons(manager, mons, mon_id):
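    """
    Kill every monitor and wipe its data directory.  For the monitor named
    `mon_id` only the store.db subdirectory is removed, so its directory
    layout can be reused when the rebuilt store is pushed back later.
    """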
    assert mons
    is_mon = teuthology.is_type('mon')
    for remote, roles in mons.remotes.iteritems():
        for role in roles:
            if not is_mon(role):
                continue
            cluster, _, m = teuthology.split_role(role)
            log.info('killing {cluster}:mon.{mon}'.format(
                cluster=cluster,
                mon=m))
            manager.kill_mon(m)
            mon_data = os.path.join('/var/lib/ceph/mon/',
                                    '{0}-{1}'.format(cluster, m))
            if m == mon_id:
                # only remove store.db for the first mon: recreating just the
                # store is easier than running mkfs on it and then replacing
                # its store.db with the recovered one
                store_dir = os.path.join(mon_data, 'store.db')
                remote.run(args=['sudo', 'rm', '-r', store_dir])
            else:
                remote.run(args=['sudo', 'rm', '-r', mon_data])


def _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path):
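    """
    Rebuild the store of monitor `mon_id` from the OSDs: walk every OSD,
    accumulating the cluster maps into a mon store with
    'ceph-objectstore-tool --op update-mon-db', then push the result into
    the monitor's data directory and run 'ceph-monstore-tool ... rebuild'
    with the given keyring.
    """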
    local_mstore = tempfile.mkdtemp()

    # collect the maps from all OSDs
    is_osd = teuthology.is_type('osd')
    osds = ctx.cluster.only(is_osd)
    assert osds
    for osd, roles in osds.remotes.iteritems():
        for role in roles:
            if not is_osd(role):
                continue
            cluster, _, osd_id = teuthology.split_role(role)
            assert cluster_name == cluster
            log.info('collecting maps from {cluster}:osd.{osd}'.format(
                cluster=cluster,
                osd=osd_id))
            # push leveldb to OSD
            osd_mstore = os.path.join(teuthology.get_testdir(ctx), 'mon-store')
            osd.run(args=['sudo', 'mkdir', '-m', 'o+x', '-p', osd_mstore])

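            # the store built so far is pushed to the OSD's host, updated in
            # place with this OSD's copy of the maps, then pulled back below,
            # so each iteration folds one more OSD into the rebuilt store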
            _push_directory(local_mstore, osd, osd_mstore)
            log.info('rm -rf {0}'.format(local_mstore))
            shutil.rmtree(local_mstore)
            # update leveldb with OSD data
            options = '--op update-mon-db --mon-store-path {0}'
            log.info('cot {0}'.format(osd_mstore))
            manager.objectstore_tool(pool=None,
                                     options=options.format(osd_mstore),
                                     args='',
                                     osd=osd_id,
                                     do_revive=False)
            # pull the updated mon db
            log.info('pull dir {0} -> {1}'.format(osd_mstore, local_mstore))
            local_mstore = tempfile.mkdtemp()
            teuthology.pull_directory(osd, osd_mstore, local_mstore)
            log.info('rm -rf osd:{0}'.format(osd_mstore))
            osd.run(args=['sudo', 'rm', '-fr', osd_mstore])

    # recover the first_mon with the re-built mon db: push the recovered
    # store from the test node into the mon's data directory
    mon_store_dir = os.path.join('/var/lib/ceph/mon',
                                 '{0}-{1}'.format(cluster_name, mon_id))
    _push_directory(local_mstore, mon, mon_store_dir)
    mon.run(args=['sudo', 'chown', '-R', 'ceph:ceph', mon_store_dir])
    shutil.rmtree(local_mstore)

    # fill up the caps in the keyring file
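    # (the rebuild step below reads this keyring, so giving mon. and
    # client.admin full caps here keeps the recovered cluster usable)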
    mon.run(args=['sudo',
                  'ceph-authtool', keyring_path,
                  '-n', 'mon.',
                  '--cap', 'mon', 'allow *'])
    mon.run(args=['sudo',
                  'ceph-authtool', keyring_path,
                  '-n', 'client.admin',
                  '--cap', 'mon', 'allow *',
                  '--cap', 'osd', 'allow *',
                  '--cap', 'mds', 'allow *',
                  '--cap', 'mgr', 'allow *'])
    mon.run(args=['sudo', '-u', 'ceph',
                  'ceph-monstore-tool', mon_store_dir,
                  'rebuild', '--', '--keyring',
                  keyring_path])


def _revive_mons(manager, mons, recovered, keyring_path):
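    """
    Run mkfs on every monitor whose data directory was removed (all but the
    recovered one), then revive them all and wait for them to form a quorum.
    """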
    # revive monitors
    # the initial monmap is in the ceph.conf, so we are good.
    n_mons = 0
    is_mon = teuthology.is_type('mon')
    for remote, roles in mons.remotes.iteritems():
        for role in roles:
            if not is_mon(role):
                continue
            cluster, _, m = teuthology.split_role(role)
            if recovered != m:
                log.info('running mkfs on {cluster}:mon.{mon}'.format(
                    cluster=cluster,
                    mon=m))
                remote.run(
                    args=[
                        'sudo',
                        'ceph-mon',
                        '--cluster', cluster,
                        '--mkfs',
                        '-i', m,
                        '--keyring', keyring_path])
            log.info('reviving mon.{0}'.format(m))
            manager.revive_mon(m)
            n_mons += 1
    manager.wait_for_mon_quorum_size(n_mons, timeout=30)


def _revive_mgrs(ctx, manager):
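    """Revive every mgr daemon in the cluster."""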
    is_mgr = teuthology.is_type('mgr')
    mgrs = ctx.cluster.only(is_mgr)
    for _, roles in mgrs.remotes.iteritems():
        for role in roles:
            if not is_mgr(role):
                continue
            _, _, mgr_id = teuthology.split_role(role)
            log.info('reviving mgr.{0}'.format(mgr_id))
            manager.revive_mgr(mgr_id)


def _revive_osds(ctx, manager):
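    """Revive every OSD in the cluster."""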
    is_osd = teuthology.is_type('osd')
    osds = ctx.cluster.only(is_osd)
    for _, roles in osds.remotes.iteritems():
        for role in roles:
            if not is_osd(role):
                continue
            _, _, osd_id = teuthology.split_role(role)
            log.info('reviving osd.{0}'.format(osd_id))
            manager.revive_osd(osd_id)


def task(ctx, config):
    """
    Test monitor recovery from OSD
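
    Illustrative snippet (assumes the task is registered as rebuild_mondb;
    it reads an optional ``keyring_path`` setting, defaulting to
    /etc/ceph/{cluster}.keyring)::

        tasks:
        - rebuild_mondb:
            keyring_path: /etc/ceph/ceph.keyring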
192 | """ | |
193 | if config is None: | |
194 | config = {} | |
195 | assert isinstance(config, dict), \ | |
196 | 'task only accepts a dict for configuration' | |
197 | ||
198 | first_mon = teuthology.get_first_mon(ctx, config) | |
199 | (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() | |
200 | manager = ceph_manager.CephManager( | |
201 | mon, | |
202 | ctx=ctx, | |
203 | logger=log.getChild('ceph_manager')) | |
204 | ||
205 | mons = ctx.cluster.only(teuthology.is_type('mon')) | |
206 | # note down the first cluster_name and mon_id | |
207 | # we will recover it later on | |
208 | cluster_name, _, mon_id = teuthology.split_role(first_mon) | |
209 | _nuke_mons(manager, mons, mon_id) | |
210 | default_keyring = '/etc/ceph/{cluster}.keyring'.format( | |
211 | cluster=cluster_name) | |
212 | keyring_path = config.get('keyring_path', default_keyring) | |
213 | _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path) | |
214 | _revive_mons(manager, mons, mon_id, keyring_path) | |
215 | _revive_mgrs(ctx, manager) | |
216 | _revive_osds(ctx, manager) |