ceph/qa/tasks/dump_stuck.py

   1 """
   2 Dump_stuck command
   3 """
   4 import logging
   5 import re
   6 import time
   7
   8 import ceph_manager
   9 from teuthology import misc as teuthology
  10
  11
  12 log = logging.getLogger(__name__)
  13
  14 def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
  15     """
  16     Do checks.  Make sure get_stuck_pgs return the right amout of information, then
  17     extract health information from the raw_cluster_cmd and compare the results with
  18     values passed in.  This passes if all asserts pass.
  19
  20     :param num_manager: Ceph manager
  21     :param num_inactive: number of inaactive pages that are stuck
  22     :param num_unclean: number of unclean pages that are stuck
  23     :paran num_stale: number of stale pages that are stuck
  24     :param timeout: timeout value for get_stuck_pgs calls
  25     """
  26     inactive = manager.get_stuck_pgs('inactive', timeout)
  27     unclean = manager.get_stuck_pgs('unclean', timeout)
  28     stale = manager.get_stuck_pgs('stale', timeout)
  29     log.info('inactive %s / %d,  unclean %s / %d,  stale %s / %d',
  30              len(inactive), num_inactive,
  31              len(unclean), num_unclean,
  32              len(stale), num_stale)
  33     assert len(inactive) == num_inactive
  34     assert len(unclean) == num_unclean
  35     assert len(stale) == num_stale
  36
  37 def task(ctx, config):
  38     """
  39     Test the dump_stuck command.
  40
  41     :param ctx: Context
  42     :param config: Configuration
  43     """
  44     assert config is None, \
  45         'dump_stuck requires no configuration'
  46     assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
  47         'dump_stuck requires exactly 2 osds'
  48
  49     timeout = 60
  50     first_mon = teuthology.get_first_mon(ctx, config)
  51     (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
  52
  53     manager = ceph_manager.CephManager(
  54         mon,
  55         ctx=ctx,
  56         logger=log.getChild('ceph_manager'),
  57         )
  58
  59     manager.flush_pg_stats([0, 1])
  60     manager.wait_for_clean(timeout)
  61
  62     manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--',
  63 #                            '--mon-osd-report-timeout 90',
  64                             '--mon-pg-stuck-threshold 10')
  65
  66     # all active+clean
  67     check_stuck(
  68         manager,
  69         num_inactive=0,
  70         num_unclean=0,
  71         num_stale=0,
  72         )
  73     num_pgs = manager.get_num_pgs()
  74
  75     manager.mark_out_osd(0)
  76     time.sleep(timeout)
  77     manager.flush_pg_stats([1])
  78     manager.wait_for_recovery(timeout)
  79
  80     # all active+clean+remapped
  81     check_stuck(
  82         manager,
  83         num_inactive=0,
  84         num_unclean=0,
  85         num_stale=0,
  86         )
  87
  88     manager.mark_in_osd(0)
  89     manager.flush_pg_stats([0, 1])
  90     manager.wait_for_clean(timeout)
  91
  92     # all active+clean
  93     check_stuck(
  94         manager,
  95         num_inactive=0,
  96         num_unclean=0,
  97         num_stale=0,
  98         )
  99
 100     log.info('stopping first osd')
 101     manager.kill_osd(0)
 102     manager.mark_down_osd(0)
 103
 104     log.info('waiting for all to be unclean')
 105     starttime = time.time()
 106     done = False
 107     while not done:
 108         try:
 109             check_stuck(
 110                 manager,
 111                 num_inactive=0,
 112                 num_unclean=num_pgs,
 113                 num_stale=0,
 114                 )
 115             done = True
 116         except AssertionError:
 117             # wait up to 15 minutes to become stale
 118             if time.time() - starttime > 900:
 119                 raise
 120
 121
 122     log.info('stopping second osd')
 123     manager.kill_osd(1)
 124     manager.mark_down_osd(1)
 125
 126     log.info('waiting for all to be stale')
 127     starttime = time.time()
 128     done = False
 129     while not done:
 130         try:
 131             check_stuck(
 132                 manager,
 133                 num_inactive=0,
 134                 num_unclean=num_pgs,
 135                 num_stale=num_pgs,
 136                 )
 137             done = True
 138         except AssertionError:
 139             # wait up to 15 minutes to become stale
 140             if time.time() - starttime > 900:
 141                 raise
 142
 143     log.info('reviving')
 144     for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
 145         manager.revive_osd(id_)
 146         manager.mark_in_osd(id_)
 147     while True:
 148         try:
 149             manager.flush_pg_stats([0, 1])
 150             break
 151         except Exception:
 152             log.exception('osds must not be started yet, waiting...')
 153             time.sleep(1)
 154     manager.wait_for_clean(timeout)
 155
 156     check_stuck(
 157         manager,
 158         num_inactive=0,
 159         num_unclean=0,
 160         num_stale=0,
 161         )