import logging
import time

from dateutil.parser import parse
from tasks import ceph_manager
from tasks.util.rados import rados
from teuthology import misc as teuthology

log = logging.getLogger(__name__)


def wait_for_deep_scrub_complete(manager, pgid, check_time_now, inconsistent):
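    """
    Poll the stats of the given PG until a deep scrub completed at or after
    check_time_now, then assert that the PG is (or is not) flagged
    inconsistent, as requested via the inconsistent flag.
    """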
    log.debug("waiting for pg %s deep-scrub complete (check_time_now=%s)" %
              (pgid, check_time_now))
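    # poll the PG stats, sleeping between attempts, until the deep-scrub
    # timestamp advances past check_time_now or we run out of retries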
    for i in range(300):
        time.sleep(5)
        manager.flush_pg_stats([0, 1, 2, 3])
        pgs = manager.get_pg_stats()
        pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
        log.debug('pg=%s' % pg)
        assert pg
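        # last_deep_scrub_stamp records when the last deep scrub completed;
        # '%s' formats both stamps as seconds since the epoch for comparison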
        last_deep_scrub_time = parse(pg['last_deep_scrub_stamp']).strftime('%s')
        if last_deep_scrub_time < check_time_now:
            log.debug('not scrubbed')
            continue
        status = pg['state'].split('+')
        if inconsistent:
            assert 'inconsistent' in status
        else:
            assert 'inconsistent' not in status
        return

    assert False, 'not scrubbed'


def wait_for_backfilling_complete(manager, pgid, from_osd, to_osd):
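    """
    Poll the stats of the given PG until it is active again, osd.<to_osd>
    appears in its up set, and backfilling has finished.
    """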
    log.debug("waiting for pg %s backfill from osd.%s to osd.%s complete" %
              (pgid, from_osd, to_osd))
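    # same polling scheme as wait_for_deep_scrub_complete: retry until the
    # PG is active and the backfill target shows up in its up set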
    for i in range(300):
        time.sleep(5)
        manager.flush_pg_stats([0, 1, 2, 3])
        pgs = manager.get_pg_stats()
        pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
        log.info('pg=%s' % pg)
        assert pg
        status = pg['state'].split('+')
        if 'active' not in status:
            log.debug('not active')
            continue
        if 'backfilling' in status:
            assert from_osd in pg['acting'] and to_osd in pg['up']
            log.debug('backfilling')
            continue
        if to_osd not in pg['up']:
            log.debug('backfill not started yet')
            continue
        log.debug('backfilled!')
        return


def task(ctx, config):
    """
    Test handling of objects with inconsistent hash info during backfill
    and deep-scrub.

    A pretty rigid cluster is brought up and tested by this task.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'ec_inconsistent_hinfo task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )
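
    # k=2/m=1 erasure coding: each object is striped into two data shards
    # plus one coding shard, so any single bad shard is recoverable but two
    # bad shards of the same object are not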
    profile = config.get('erasure_code_profile', {
        'k': '2',
        'm': '1',
        'crush-failure-domain': 'osd'
    })
    profile_name = profile.get('name', 'backfill_unfound')
    manager.create_erasure_code_profile(profile_name, profile)
    pool = manager.create_pool_with_unique_name(
        pg_num=1,
        erasure_code_profile_name=profile_name,
        min_size=2)
    manager.raw_cluster_cmd('osd', 'pool', 'set', pool,
                            'pg_autoscale_mode', 'off')

    manager.flush_pg_stats([0, 1, 2, 3])
    manager.wait_for_clean()

    pool_id = manager.get_pool_num(pool)
    pgid = '%d.0' % pool_id
    pgs = manager.get_pg_stats()
    acting = next((pg['acting'] for pg in pgs if pg['pgid'] == pgid), None)
    log.info("acting=%s" % acting)
    assert acting
    primary = acting[0]

    # something that is always there, readable and never empty
    dummyfile = '/etc/group'

    # kludge to make sure the osds get a map
    rados(ctx, mon, ['-p', pool, 'put', 'dummy', dummyfile])

    manager.flush_pg_stats([0, 1])
    manager.wait_for_recovery()

    log.debug("create test object")
    obj = 'test'
    rados(ctx, mon, ['-p', pool, 'put', obj, dummyfile])

    victim = acting[1]

    log.info("remove test object hash info from osd.%s shard and test deep-scrub and repair"
             % victim)
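
    # hinfo_key is the object attribute in which EC pools keep the shard's
    # hash info; stripping it with ceph-objectstore-tool makes the shard
    # appear corrupt to deep-scrub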
    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=victim)
    check_time_now = time.strftime('%s')
    manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
    wait_for_deep_scrub_complete(manager, pgid, check_time_now, True)
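
    # repair should rebuild the missing hash info from the surviving shards
    # and clear the inconsistent flag set by the deep-scrub above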
    check_time_now = time.strftime('%s')
    manager.raw_cluster_cmd('pg', 'repair', pgid)
    wait_for_deep_scrub_complete(manager, pgid, check_time_now, False)

    log.info("remove test object hash info from primary osd.%s shard and test backfill"
             % primary)

    log.debug("write some data")
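    # presumably to give the subsequent backfill real work beyond the
    # single test object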
    rados(ctx, mon, ['-p', pool, 'bench', '30', 'write', '-b', '4096',
                     '--no-cleanup'])

    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=primary)

    # mark the osd out to trigger a rebalance/backfill
    source = acting[1]
    target = [x for x in [0, 1, 2, 3] if x not in acting][0]
    manager.mark_out_osd(source)

    # wait for everything to peer, backfill and recover
    wait_for_backfilling_complete(manager, pgid, source, target)
    manager.wait_for_clean()

    manager.flush_pg_stats([0, 1, 2, 3])
    pgs = manager.get_pg_stats()
    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
    log.debug('pg=%s' % pg)
    assert pg
    assert 'clean' in pg['state'].split('+')
    assert 'inconsistent' not in pg['state'].split('+')
    unfound = manager.get_num_unfound_objects()
    log.debug("there are %d unfound objects" % unfound)
    assert unfound == 0
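
    # swap roles: mark the previously removed osd back in and backfill onto
    # it, this time with the hash info missing from a non-primary shard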
    source, target = target, source
    log.info("remove test object hash info from non-primary osd.%s shard and test backfill"
             % source)

    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=source)

    # mark the osd in to trigger a rebalance/backfill
    manager.mark_in_osd(target)

    # wait for everything to peer, backfill and recover
    wait_for_backfilling_complete(manager, pgid, source, target)
    manager.wait_for_clean()

    manager.flush_pg_stats([0, 1, 2, 3])
    pgs = manager.get_pg_stats()
    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
    log.debug('pg=%s' % pg)
    assert pg
    assert 'clean' in pg['state'].split('+')
    assert 'inconsistent' not in pg['state'].split('+')
    unfound = manager.get_num_unfound_objects()
    log.debug("there are %d unfound objects" % unfound)
    assert unfound == 0

    log.info("remove hash info from two shards and test backfill")
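
    # with only two data shards plus one coding shard, losing the hash info
    # on two shards leaves too few consistent copies to verify or rebuild
    # the object, so backfill should flag it unfound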
    source = acting[2]
    target = [x for x in [0, 1, 2, 3] if x not in acting][0]
    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=primary)
    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=source)

    # mark the osd out to trigger a rebalance/backfill
    manager.mark_out_osd(source)

    # wait for everything to peer, backfill and detect the unfound object
    wait_for_backfilling_complete(manager, pgid, source, target)

    # verify that there is an unfound object
    manager.flush_pg_stats([0, 1, 2, 3])
    pgs = manager.get_pg_stats()
    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
    log.debug('pg=%s' % pg)
    assert pg
    assert 'backfill_unfound' in pg['state'].split('+')
    unfound = manager.get_num_unfound_objects()
    log.debug("there are %d unfound objects" % unfound)
    assert unfound == 1
    m = manager.list_pg_unfound(pgid)
    log.debug('list_pg_unfound=%s' % m)
    assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
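
    # mark stuff lost: give up on the unfound object and delete it so the
    # PG can finish recovering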
    pgs = manager.get_pg_stats()
    manager.raw_cluster_cmd('pg', pgid, 'mark_unfound_lost', 'delete')

    # wait for everything to peer and be happy...
    manager.flush_pg_stats([0, 1, 2, 3])
    manager.wait_for_recovery()