ceph/qa/tasks/dump_stuck.py
1 """
2 Dump_stuck command
3 """
4 import logging
5 import re
6 import time
7
8 import ceph_manager
9 from teuthology import misc as teuthology
10
11
12 log = logging.getLogger(__name__)
13
def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
    """
    Do checks. Make sure get_stuck_pgs returns the expected number of stuck
    PGs for each of the 'inactive', 'unclean', and 'stale' states, comparing
    the counts with the values passed in. This passes if all asserts pass.

    :param manager: Ceph manager
    :param num_inactive: number of inactive PGs that are stuck
    :param num_unclean: number of unclean PGs that are stuck
    :param num_stale: number of stale PGs that are stuck
    :param timeout: timeout value for get_stuck_pgs calls
    """
    inactive = manager.get_stuck_pgs('inactive', timeout)
    unclean = manager.get_stuck_pgs('unclean', timeout)
    stale = manager.get_stuck_pgs('stale', timeout)
    log.info('inactive %s / %d, unclean %s / %d, stale %s / %d',
             len(inactive), num_inactive,
             len(unclean), num_unclean,
             len(stale), num_stale)
    assert len(inactive) == num_inactive
    assert len(unclean) == num_unclean
    assert len(stale) == num_stale

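# The helper above relies on CephManager.get_stuck_pgs(), which wraps the
# `ceph pg dump_stuck <state>` command.  The function below is only an
# illustrative sketch of doing the same count by hand via raw_cluster_cmd;
# its name is made up for this example and the exact JSON layout returned by
# dump_stuck is an assumption that varies between Ceph releases.
def _count_stuck_pgs_by_hand(manager, state):
    import json
    out = manager.raw_cluster_cmd('pg', 'dump_stuck', state, '--format=json')
    stuck = json.loads(out) if out.strip() else []
    # some releases wrap the list in a dict under 'stuck_pg_stats'
    if isinstance(stuck, dict):
        stuck = stuck.get('stuck_pg_stats', [])
    return len(stuck)
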
def task(ctx, config):
    """
    Test the dump_stuck command.

    :param ctx: Context
    :param config: Configuration
    """
    assert config is None, \
        'dump_stuck requires no configuration'
    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
        'dump_stuck requires exactly 2 osds'

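    # The test relies on having exactly two OSDs: taking one out (or killing
    # it) leaves every PG degraded/unclean, and stopping both leaves every PG
    # stale, which is what the checks below count on.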
    timeout = 60
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

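    # Lower the threshold after which a PG counts as "stuck" so the checks
    # below do not have to wait for the (much larger) default before PGs show
    # up in `ceph pg dump_stuck`.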
    manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--',
                            # '--mon-osd-report-timeout 90',
                            '--mon-pg-stuck-threshold 10')

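    # Baseline: the cluster is healthy, so no PG should be stuck in any state.
    # Also record the total number of PGs for the later checks.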
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
    num_pgs = manager.get_num_pgs()

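    # Mark osd.0 out: with only one OSD left holding data, PGs cannot be fully
    # replicated, so after recovery every PG is expected to be stuck unclean
    # (but still active and reporting, hence not inactive or stale).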
    manager.mark_out_osd(0)
    time.sleep(timeout)
    manager.flush_pg_stats([1])
    manager.wait_for_recovery(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=num_pgs,
        num_stale=0,
        )

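    # Bring osd.0 back in; once the cluster is clean again nothing should be
    # reported as stuck.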
    manager.mark_in_osd(0)
    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )

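    # Kill osd.0 outright (without marking it out): PGs become degraded and
    # should eventually all show up as stuck unclean once the lowered stuck
    # threshold has elapsed, hence the retry loop below.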
    log.info('stopping first osd')
    manager.kill_osd(0)
    manager.mark_down_osd(0)

    log.info('waiting for all to be unclean')
    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=num_pgs,
                num_stale=0,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become unclean
            if time.time() - starttime > 900:
                raise


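    # Stop the second OSD as well: with no OSDs reporting, the monitors stop
    # receiving PG stats, so every PG should additionally become stuck stale
    # (while still being counted as unclean).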
    log.info('stopping second osd')
    manager.kill_osd(1)
    manager.mark_down_osd(1)

    log.info('waiting for all to be stale')
    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=num_pgs,
                num_stale=num_pgs,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become stale
            if time.time() - starttime > 900:
                raise

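    # Revive and mark in both OSDs.  flush_pg_stats can fail while the daemons
    # are still starting up, hence the retry loop; once the stats flush, the
    # cluster is given time to become clean again.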
    log.info('reviving')
    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.revive_osd(id_)
        manager.mark_in_osd(id_)
    while True:
        try:
            manager.flush_pg_stats([0, 1])
            break
        except Exception:
            log.exception('osds must not be started yet, waiting...')
        time.sleep(1)
    manager.wait_for_clean(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
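
# For reference, this task takes no configuration (config must be None), so a
# teuthology job would typically list it bare.  An illustrative snippet (the
# surrounding tasks depend on the suite and are assumptions here):
#
#   tasks:
#   - install:
#   - ceph:
#   - dump_stuck: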