1"""
2Inconsistent_hinfo
3"""
import logging
import time
from dateutil.parser import parse
from tasks import ceph_manager
from tasks.util.rados import rados
from teuthology import misc as teuthology

log = logging.getLogger(__name__)

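# Poll the PG stats (every 5 seconds, up to 300 tries) until a deep-scrub
# newer than check_time_now is reported, then assert that the PG state does
# (or does not, depending on `inconsistent`) contain 'inconsistent'.
# Fails if no fresh deep-scrub is seen within the retry budget.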
def wait_for_deep_scrub_complete(manager, pgid, check_time_now, inconsistent):
    log.debug("waiting for pg %s deep-scrub complete (check_time_now=%s)" %
              (pgid, check_time_now))
    for i in range(300):
        time.sleep(5)
        manager.flush_pg_stats([0, 1, 2, 3])
        pgs = manager.get_pg_stats()
        pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
        log.debug('pg=%s' % pg)
        assert pg

        last_deep_scrub_time = parse(pg['last_deep_scrub_stamp']).strftime('%s')
        if last_deep_scrub_time < check_time_now:
            log.debug('not scrubbed')
            continue

        status = pg['state'].split('+')
        if inconsistent:
            assert 'inconsistent' in status
        else:
            assert 'inconsistent' not in status
        return

    assert False, 'not scrubbed'


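# Poll the PG stats until the PG is active, to_osd has appeared in the up set
# and backfilling has finished. While backfilling, from_osd is expected to
# remain in the acting set and to_osd in the up set. Returns without asserting
# if the retry budget is exhausted.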
def wait_for_backfilling_complete(manager, pgid, from_osd, to_osd):
    log.debug("waiting for pg %s backfill from osd.%s to osd.%s complete" %
              (pgid, from_osd, to_osd))
    for i in range(300):
        time.sleep(5)
        manager.flush_pg_stats([0, 1, 2, 3])
        pgs = manager.get_pg_stats()
        pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
        log.info('pg=%s' % pg)
        assert pg
        status = pg['state'].split('+')
        if 'active' not in status:
            log.debug('not active')
            continue
        if 'backfilling' in status:
            assert from_osd in pg['acting'] and to_osd in pg['up']
            log.debug('backfilling')
            continue
        if to_osd not in pg['up']:
            log.debug('backfill not started yet')
            continue
        log.debug('backfilled!')
        break

def task(ctx, config):
    """
    Test handling of objects with inconsistent hash info during backfill
    and deep-scrub.

    A fairly rigid cluster is brought up and tested by this task: it
    expects OSDs 0-3 and drives a single erasure-coded PG through
    deep-scrub, repair and backfill scenarios.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'ec_inconsistent_hinfo task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    profile = config.get('erasure_code_profile', {
        'k': '2',
        'm': '1',
        'crush-failure-domain': 'osd'
    })
    profile_name = profile.get('name', 'backfill_unfound')
    manager.create_erasure_code_profile(profile_name, profile)
    pool = manager.create_pool_with_unique_name(
        pg_num=1,
        erasure_code_profile_name=profile_name,
        min_size=2)
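    # the whole test operates on a single PG (pg_num=1, pgid '<pool>.0'
    # below), so keep the autoscaler from splitting it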
    manager.raw_cluster_cmd('osd', 'pool', 'set', pool,
                            'pg_autoscale_mode', 'off')

    manager.flush_pg_stats([0, 1, 2, 3])
    manager.wait_for_clean()

    pool_id = manager.get_pool_num(pool)
    pgid = '%d.0' % pool_id
    pgs = manager.get_pg_stats()
    acting = next((pg['acting'] for pg in pgs if pg['pgid'] == pgid), None)
    log.info("acting=%s" % acting)
    assert acting
    primary = acting[0]

    # something that is always there, readable and never empty
    dummyfile = '/etc/group'

    # kludge to make sure they get a map
    rados(ctx, mon, ['-p', pool, 'put', 'dummy', dummyfile])

    manager.flush_pg_stats([0, 1])
    manager.wait_for_recovery()

    log.debug("create test object")
    obj = 'test'
    rados(ctx, mon, ['-p', pool, 'put', obj, dummyfile])

    victim = acting[1]

    log.info("remove test object hash info from osd.%s shard and test deep-scrub and repair"
             % victim)

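    # strip the hinfo_key attribute from one non-primary shard with the
    # objectstore tool: the next deep-scrub is expected to flag the PG as
    # inconsistent, and a subsequent 'pg repair' to clear the inconsistency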
    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=victim)
    check_time_now = time.strftime('%s')
    manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
    wait_for_deep_scrub_complete(manager, pgid, check_time_now, True)

    check_time_now = time.strftime('%s')
    manager.raw_cluster_cmd('pg', 'repair', pgid)
    wait_for_deep_scrub_complete(manager, pgid, check_time_now, False)

    log.info("remove test object hash info from primary osd.%s shard and test backfill"
             % primary)

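    # only one shard has lost its hinfo_key, so the object should still be
    # recoverable: backfill to a fresh OSD is expected to complete and leave
    # the PG clean with no unfound objects (asserted below); the rados bench
    # just seeds some extra objects for backfill to move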
    log.debug("write some data")
    rados(ctx, mon, ['-p', pool, 'bench', '30', 'write', '-b', '4096',
                     '--no-cleanup'])

    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=primary)

    # mark the osd out to trigger a rebalance/backfill
    source = acting[1]
    target = [x for x in [0, 1, 2, 3] if x not in acting][0]
    manager.mark_out_osd(source)

    # wait for everything to peer, backfill and recover
    wait_for_backfilling_complete(manager, pgid, source, target)
    manager.wait_for_clean()

    manager.flush_pg_stats([0, 1, 2, 3])
    pgs = manager.get_pg_stats()
    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
    log.debug('pg=%s' % pg)
    assert pg
    assert 'clean' in pg['state'].split('+')
    assert 'inconsistent' not in pg['state'].split('+')
    unfound = manager.get_num_unfound_objects()
    log.debug("there are %d unfound objects" % unfound)
    assert unfound == 0

    source, target = target, source
    log.info("remove test object hash info from non-primary osd.%s shard and test backfill"
             % source)

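    # same expectation as above, this time with the missing attribute on a
    # non-primary shard and backfill triggered by marking the out OSD back in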
    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=source)

    # mark the osd in to trigger a rebalance/backfill
    manager.mark_in_osd(target)

    # wait for everything to peer, backfill and recover
    wait_for_backfilling_complete(manager, pgid, source, target)
    manager.wait_for_clean()

    manager.flush_pg_stats([0, 1, 2, 3])
    pgs = manager.get_pg_stats()
    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
    log.debug('pg=%s' % pg)
    assert pg
    assert 'clean' in pg['state'].split('+')
    assert 'inconsistent' not in pg['state'].split('+')
    unfound = manager.get_num_unfound_objects()
    log.debug("there are %d unfound objects" % unfound)
    assert unfound == 0

    log.info("remove hash info from two shards and test backfill")

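    # with the default k=2/m=1 profile, removing hinfo_key from two of the
    # three shards leaves the object unreconstructable during backfill, so it
    # is expected to be reported as unfound and the PG to end up in the
    # backfill_unfound state (asserted below)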
    source = acting[2]
    target = [x for x in [0, 1, 2, 3] if x not in acting][0]
    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=primary)
    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=source)

    # mark the osd out to trigger a rebalance/backfill
    manager.mark_out_osd(source)

    # wait for everything to peer, backfill and detect the unfound object
    wait_for_backfilling_complete(manager, pgid, source, target)

    # verify that there is an unfound object
    manager.flush_pg_stats([0, 1, 2, 3])
    pgs = manager.get_pg_stats()
    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
    log.debug('pg=%s' % pg)
    assert pg
    assert 'backfill_unfound' in pg['state'].split('+')
    unfound = manager.get_num_unfound_objects()
    log.debug("there are %d unfound objects" % unfound)
    assert unfound == 1
    m = manager.list_pg_unfound(pgid)
    log.debug('list_pg_unfound=%s' % m)
    assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']

    # mark the unfound object as lost (delete it) so recovery can proceed
    pgs = manager.get_pg_stats()
    manager.raw_cluster_cmd('pg', pgid, 'mark_unfound_lost', 'delete')

    # wait for everything to peer and be happy...
    manager.flush_pg_stats([0, 1, 2, 3])
    manager.wait_for_recovery()