"""
Test if we can recover the monitors' leveldb stores from the OSDs after all
of them have been corrupted
"""

import logging
import os.path
import shutil
import tempfile

import ceph_manager
from teuthology import misc as teuthology

log = logging.getLogger(__name__)


def push_directory(path, remote, remote_dir):
    """
    local_temp_path=`mktemp`
    tar czf $local_temp_path $path
    ssh remote mkdir -p remote_dir
    remote_temp_path=`mktemp`
    scp $local_temp_path $remote_temp_path
    rm $local_temp_path
    tar xzf $remote_temp_path -C $remote_dir
    ssh remote rm $remote_temp_path
    """
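    # pack the contents of the local directory into a tarball; "-C path -- ."
    # keeps the archived paths relative to the directory itself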
    fd, local_temp_path = tempfile.mkstemp(suffix='.tgz',
                                           prefix='rebuild_mondb-')
    os.close(fd)
    cmd = ' '.join(['tar', 'cz',
                    '-f', local_temp_path,
                    '-C', path,
                    '--', '.'])
    teuthology.sh(cmd)
    _, fname = os.path.split(local_temp_path)
    fd, remote_temp_path = tempfile.mkstemp(suffix='.tgz',
                                            prefix='rebuild_mondb-')
    os.close(fd)
    remote.put_file(local_temp_path, remote_temp_path)
    os.remove(local_temp_path)
    remote.run(args=['sudo',
                     'tar', 'xz',
                     '-C', remote_dir,
                     '-f', remote_temp_path])
    remote.run(args=['sudo', 'rm', '-fr', remote_temp_path])


def task(ctx, config):
    """
    Test monitor recovery from OSD
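
    The task accepts an optional ``keyring_path``, the keyring handed to
    ceph-monstore-tool when rebuilding the store; it defaults to
    /etc/ceph/{cluster}.keyring. A hypothetical suite snippet (the
    surrounding tasks are only an illustration):

        tasks:
        - install:
        - ceph:
        - rebuild_mondb:
            keyring_path: /etc/ceph/ceph.keyring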
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'))

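    # kill every monitor and wipe its store; the first monitor's data dir is
    # kept so that only its store.db needs to be replaced with the one
    # rebuilt from the OSDs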
    mons = ctx.cluster.only(teuthology.is_type('mon'))
    assert mons
    # note down the first cluster_name and mon_id
    # we will recover it later on
    cluster_name = None
    mon_id = None
    for remote, roles in mons.remotes.iteritems():
        is_mon = teuthology.is_type('mon')
        for role in roles:
            if not is_mon(role):
                continue
            cluster, _, m = teuthology.split_role(role)
            if cluster_name is None:
                cluster_name = cluster
                mon_id = m
            assert cluster_name == cluster
            log.info('killing {cluster}:mon.{mon}'.format(
                cluster=cluster,
                mon=m))
            manager.kill_mon(m)
            mon_data = os.path.join('/var/lib/ceph/mon/',
                                    '{0}-{1}'.format(cluster_name, m))
            if m == mon_id:
                # we only need to recreate the store.db for the first mon;
                # that is easier than running mkfs on it and then replacing
                # its store.db with the recovered one
                store_dir = os.path.join(mon_data, 'store.db')
                remote.run(args=['sudo', 'rm', '-r', store_dir])
            else:
                remote.run(args=['sudo', 'rm', '-r', mon_data])

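    # the rebuilt mon store is accumulated locally: it is pushed to each OSD
    # host in turn, updated there with that OSD's maps, and pulled back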
    local_mstore = tempfile.mkdtemp()

    # collect the maps from all OSDs
    osds = ctx.cluster.only(teuthology.is_type('osd'))
    assert osds
    for osd, roles in osds.remotes.iteritems():
        is_osd = teuthology.is_type('osd')
        for role in roles:
            if not is_osd(role):
                continue
            cluster, _, osd_id = teuthology.split_role(role)
            assert cluster_name == cluster
            log.info('collecting maps from {cluster}:osd.{osd}'.format(
                cluster=cluster,
                osd=osd_id))
            # push leveldb to OSD
            osd_mstore = os.path.join(teuthology.get_testdir(ctx), 'mon-store')
            osd.run(args=['sudo', 'mkdir', '-m', 'o+x', '-p', osd_mstore])

            push_directory(local_mstore, osd, osd_mstore)
            log.info('rm -rf {0}'.format(local_mstore))
            shutil.rmtree(local_mstore)
            # update leveldb with OSD data
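            # ceph-objectstore-tool --op update-mon-db reads the osdmaps kept
            # on this OSD and merges them into the store at osd_mstore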
            options = '--op update-mon-db --mon-store-path {0}'
            log.info('cot {0}'.format(osd_mstore))
            manager.objectstore_tool(pool=None,
                                     options=options.format(osd_mstore),
                                     args='',
                                     osd=osd_id,
                                     do_revive=False)
            # pull the updated mon db
            local_mstore = tempfile.mkdtemp()
            log.info('pull dir {0} -> {1}'.format(osd_mstore, local_mstore))
            teuthology.pull_directory(osd, osd_mstore, local_mstore)
            log.info('rm -rf osd:{0}'.format(osd_mstore))
            osd.run(args=['sudo', 'rm', '-fr', osd_mstore])

    # recover the first_mon with the re-built mon db:
    # push the recovered leveldb from the test node to the first mon
    mon_store_dir = os.path.join('/var/lib/ceph/mon',
                                 '{0}-{1}'.format(cluster_name, mon_id))
    push_directory(local_mstore, mon, mon_store_dir)
    mon.run(args=['sudo', 'chown', '-R', 'ceph:ceph', mon_store_dir])
    shutil.rmtree(local_mstore)
    default_keyring = '/etc/ceph/{cluster}.keyring'.format(
        cluster=cluster_name)
    keyring_path = config.get('keyring_path', default_keyring)
    # fill up the caps in the keyring file
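    # auth entries cannot be recovered from the OSDs, so the caps of mon. and
    # client.admin are re-asserted in the keyring that ceph-monstore-tool
    # will import into the rebuilt store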
    mon.run(args=['sudo',
                  'ceph-authtool', keyring_path,
                  '-n', 'mon.',
                  '--cap', 'mon', 'allow *'])
    mon.run(args=['sudo',
                  'ceph-authtool', keyring_path,
                  '-n', 'client.admin',
                  '--cap', 'mon', 'allow *',
                  '--cap', 'osd', 'allow *',
                  '--cap', 'mds', 'allow *'])
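    # rebuild the monitor store from the collected osdmaps, seeding the auth
    # database from the keyring prepared above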
    mon.run(args=['sudo', '-u', 'ceph',
                  'ceph-monstore-tool', mon_store_dir,
                  'rebuild', '--', '--keyring',
                  keyring_path])

    # revive monitors
    # the initial monmap is in the ceph.conf, so we are good.
    n_mons = 0
    for remote, roles in mons.remotes.iteritems():
        is_mon = teuthology.is_type('mon')
        for role in roles:
            if not is_mon(role):
                continue
            cluster, _, m = teuthology.split_role(role)
            assert cluster_name == cluster
            if mon_id != m:
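                # every monitor other than the recovered one is recreated
                # from scratch; it will sync the rebuilt store from the
                # recovered monitor once it joins the quorum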
                log.info('running mkfs on {cluster}:mon.{mon}'.format(
                    cluster=cluster,
                    mon=m))
                remote.run(
                    args=[
                        'sudo',
                        'ceph-mon',
                        '--cluster', cluster,
                        '--mkfs',
                        '-i', m,
                        '--keyring', keyring_path])
            manager.revive_mon(m)
            n_mons += 1

    manager.wait_for_mon_quorum_size(n_mons, timeout=30)
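    # with the monitors back in quorum, bring the OSDs back up as well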
    for osd, roles in osds.remotes.iteritems():
        is_osd = teuthology.is_type('osd')
        for role in roles:
            if not is_osd(role):
                continue
            _, _, osd_id = teuthology.split_role(role)
            log.info('reviving osd.{0}'.format(osd_id))
            manager.revive_osd(osd_id)