ceph/qa/tasks/dump_stuck.py

   1 """
   2 Dump_stuck command
   3 """
   4 import logging
   5 import time
   6
   7 import ceph_manager
   8 from teuthology import misc as teuthology
   9
  10
  11 log = logging.getLogger(__name__)
  12
  13 def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
  14     """
  15     Do checks.  Make sure get_stuck_pgs return the right amount of information, then
  16     extract health information from the raw_cluster_cmd and compare the results with
  17     values passed in.  This passes if all asserts pass.
  18
  19     :param num_manager: Ceph manager
  20     :param num_inactive: number of inaactive pages that are stuck
  21     :param num_unclean: number of unclean pages that are stuck
  22     :param num_stale: number of stale pages that are stuck
  23     :param timeout: timeout value for get_stuck_pgs calls
  24     """
  25     inactive = manager.get_stuck_pgs('inactive', timeout)
  26     unclean = manager.get_stuck_pgs('unclean', timeout)
  27     stale = manager.get_stuck_pgs('stale', timeout)
  28     log.info('inactive %s / %d,  unclean %s / %d,  stale %s / %d',
  29              len(inactive), num_inactive,
  30              len(unclean), num_unclean,
  31              len(stale), num_stale)
  32     assert len(inactive) == num_inactive
  33     assert len(unclean) == num_unclean
  34     assert len(stale) == num_stale
  35
  36 def task(ctx, config):
  37     """
  38     Test the dump_stuck command.
  39
  40     :param ctx: Context
  41     :param config: Configuration
  42     """
  43     assert config is None, \
  44         'dump_stuck requires no configuration'
  45     assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
  46         'dump_stuck requires exactly 2 osds'
  47
  48     timeout = 60
  49     first_mon = teuthology.get_first_mon(ctx, config)
  50     (mon,) = ctx.cluster.only(first_mon).remotes.keys()
  51
  52     manager = ceph_manager.CephManager(
  53         mon,
  54         ctx=ctx,
  55         logger=log.getChild('ceph_manager'),
  56         )
  57
  58     manager.flush_pg_stats([0, 1])
  59     manager.wait_for_clean(timeout)
  60
  61     manager.raw_cluster_cmd('tell', 'mon.a', 'injectargs', '--',
  62 #                            '--mon-osd-report-timeout 90',
  63                             '--mon-pg-stuck-threshold 10')
  64
  65     # all active+clean
  66     check_stuck(
  67         manager,
  68         num_inactive=0,
  69         num_unclean=0,
  70         num_stale=0,
  71         )
  72     num_pgs = manager.get_num_pgs()
  73
  74     manager.mark_out_osd(0)
  75     time.sleep(timeout)
  76     manager.flush_pg_stats([1])
  77     manager.wait_for_recovery(timeout)
  78
  79     # all active+clean+remapped
  80     check_stuck(
  81         manager,
  82         num_inactive=0,
  83         num_unclean=0,
  84         num_stale=0,
  85         )
  86
  87     manager.mark_in_osd(0)
  88     manager.flush_pg_stats([0, 1])
  89     manager.wait_for_clean(timeout)
  90
  91     # all active+clean
  92     check_stuck(
  93         manager,
  94         num_inactive=0,
  95         num_unclean=0,
  96         num_stale=0,
  97         )
  98
  99     log.info('stopping first osd')
 100     manager.kill_osd(0)
 101     manager.mark_down_osd(0)
 102     manager.wait_for_active(timeout)
 103
 104     log.info('waiting for all to be unclean')
 105     starttime = time.time()
 106     done = False
 107     while not done:
 108         try:
 109             check_stuck(
 110                 manager,
 111                 num_inactive=0,
 112                 num_unclean=num_pgs,
 113                 num_stale=0,
 114                 )
 115             done = True
 116         except AssertionError:
 117             # wait up to 15 minutes to become stale
 118             if time.time() - starttime > 900:
 119                 raise
 120
 121
 122     log.info('stopping second osd')
 123     manager.kill_osd(1)
 124     manager.mark_down_osd(1)
 125
 126     log.info('waiting for all to be stale')
 127     starttime = time.time()
 128     done = False
 129     while not done:
 130         try:
 131             check_stuck(
 132                 manager,
 133                 num_inactive=0,
 134                 num_unclean=num_pgs,
 135                 num_stale=num_pgs,
 136                 )
 137             done = True
 138         except AssertionError:
 139             # wait up to 15 minutes to become stale
 140             if time.time() - starttime > 900:
 141                 raise
 142
 143     log.info('reviving')
 144     for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
 145         manager.revive_osd(id_)
 146         manager.mark_in_osd(id_)
 147     while True:
 148         try:
 149             manager.flush_pg_stats([0, 1])
 150             break
 151         except Exception:
 152             log.exception('osds must not be started yet, waiting...')
 153             time.sleep(1)
 154     manager.wait_for_clean(timeout)
 155
 156     check_stuck(
 157         manager,
 158         num_inactive=0,
 159         num_unclean=0,
 160         num_stale=0,
 161         )