1 """
2 Dump_stuck command
3 """
4 import logging
5 import re
6 import time
7
8 import ceph_manager
9 from teuthology import misc as teuthology
10
11
12 log = logging.getLogger(__name__)
13
def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
    """
    Do checks.  Make sure get_stuck_pgs returns the right amount of information, then
    extract health information from the raw_cluster_cmd and compare the results with
    the values passed in.  This passes if all asserts pass.

    :param manager: Ceph manager
    :param num_inactive: number of inactive PGs that are stuck
    :param num_unclean: number of unclean PGs that are stuck
    :param num_stale: number of stale PGs that are stuck
    :param timeout: timeout value for get_stuck_pgs calls
    """
    inactive = manager.get_stuck_pgs('inactive', timeout)
    unclean = manager.get_stuck_pgs('unclean', timeout)
    stale = manager.get_stuck_pgs('stale', timeout)
    log.info('inactive %s / %d, unclean %s / %d, stale %s / %d',
             len(inactive), num_inactive,
             len(unclean), num_unclean,
             len(stale), num_stale)
    assert len(inactive) == num_inactive
    assert len(unclean) == num_unclean
    assert len(stale) == num_stale

def task(ctx, config):
    """
    Test the dump_stuck command.

    :param ctx: Context
    :param config: Configuration
    """
    assert config is None, \
        'dump_stuck requires no configuration'
    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
        'dump_stuck requires exactly 2 osds'

    timeout = 60
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

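    # Flush PG stats from both OSDs so the monitor has current data
    # before we wait for the cluster to report clean.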
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_clean(timeout)

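    # Shorten mon_pg_stuck_threshold so PGs are flagged as stuck after only
    # 10 seconds instead of the default, keeping this test reasonably fast.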
    manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--',
                            # '--mon-osd-report-timeout 90',
                            '--mon-pg-stuck-threshold 10')

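    # Baseline: a healthy cluster should report no stuck PGs in any category.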
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
    num_pgs = manager.get_num_pgs()

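    # Mark osd.0 out.  With only one OSD left in, the PGs can recover onto
    # osd.1 but cannot become clean, so they should end up stuck unclean.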
    manager.mark_out_osd(0)
    time.sleep(timeout)
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_recovery(timeout)

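    # Every PG is expected to be stuck unclean now, but still active
    # (served by osd.1) and not stale.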
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=num_pgs,
        num_stale=0,
        )

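    # Bring osd.0 back in; once the cluster is clean again nothing
    # should be reported as stuck.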
    manager.mark_in_osd(0)
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_clean(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )

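    # Now kill osd.0 outright instead of merely marking it out.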
    log.info('stopping first osd')
    manager.kill_osd(0)
    manager.mark_down_osd(0)

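    # osd.1 is still up and reporting, so the PGs should become stuck
    # unclean but not stale.  Poll until the monitor flags all of them.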
    log.info('waiting for all to be unclean')
    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=num_pgs,
                num_stale=0,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become unclean
            if time.time() - starttime > 900:
                raise

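    # Kill the remaining OSD as well.  With no OSD left to report PG
    # stats, every PG should eventually be reported as stale too.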
    log.info('stopping second osd')
    manager.kill_osd(1)
    manager.mark_down_osd(1)

    log.info('waiting for all to be stale')
    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=num_pgs,
                num_stale=num_pgs,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become stale
            if time.time() - starttime > 900:
                raise

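    # Restart both OSDs and mark them back in.  The flush_pg_stats tells
    # may fail until the daemons are actually up, hence the retry loop.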
    log.info('reviving')
    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.revive_osd(id_)
        manager.mark_in_osd(id_)
    while True:
        try:
            manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
            manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
            break
        except Exception:
            log.exception('osds must not be started yet, waiting...')
            time.sleep(1)
    manager.wait_for_clean(timeout)

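    # After full recovery nothing should be stuck in any category.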
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )