import logging
import time

from dateutil.parser import parse
from tasks import ceph_manager
from tasks.util.rados import rados
from teuthology import misc as teuthology

log = logging.getLogger(__name__)


def wait_for_deep_scrub_complete(manager, pgid, check_time_now, inconsistent):
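    """
    Poll the stats of the given PG until a deep scrub completed at or after
    check_time_now, then assert that the PG is (or is not) flagged
    inconsistent, as requested via the inconsistent flag.
    """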
    log.debug("waiting for pg %s deep-scrub complete (check_time_now=%s)" %
              (pgid, check_time_now))
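    # poll the PG stats, sleeping between attempts, until the deep-scrub
    # timestamp advances past check_time_now or we run out of retries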
    for i in range(300):
        time.sleep(5)
        manager.flush_pg_stats([0, 1, 2, 3])
        pgs = manager.get_pg_stats()
        pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
        log.debug('pg=%s' % pg)
        assert pg
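        # last_deep_scrub_stamp records when the last deep scrub completed;
        # '%s' formats both stamps as seconds since the epoch for comparison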
        last_deep_scrub_time = parse(pg['last_deep_scrub_stamp']).strftime('%s')
        if last_deep_scrub_time < check_time_now:
            log.debug('not scrubbed')
            continue
        status = pg['state'].split('+')
        if inconsistent:
            assert 'inconsistent' in status
        else:
            assert 'inconsistent' not in status
        return

    assert False, 'not scrubbed'


def wait_for_backfilling_complete(manager, pgid, from_osd, to_osd):
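    """
    Poll the stats of the given PG until it is active again, osd.<to_osd>
    appears in its up set, and backfilling has finished.
    """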
    log.debug("waiting for pg %s backfill from osd.%s to osd.%s complete" %
              (pgid, from_osd, to_osd))
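    # same polling scheme as wait_for_deep_scrub_complete: retry until the
    # PG is active and the backfill target shows up in its up set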
    for i in range(300):
        time.sleep(5)
        manager.flush_pg_stats([0, 1, 2, 3])
        pgs = manager.get_pg_stats()
        pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
        log.info('pg=%s' % pg)
        assert pg
        status = pg['state'].split('+')
        if 'active' not in status:
            log.debug('not active')
            continue
        if 'backfilling' in status:
            assert from_osd in pg['acting'] and to_osd in pg['up']
            log.debug('backfilling')
            continue
        if to_osd not in pg['up']:
            log.debug('backfill not started yet')
            continue
        log.debug('backfilled!')
        return


def task(ctx, config):
    """
    Test handling of objects with inconsistent hash info during backfill
    and deep-scrub.

    A pretty rigid cluster is brought up and tested by this task.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'ec_inconsistent_hinfo task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )
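
    # k=2/m=1 erasure coding: each object is striped into two data shards
    # plus one coding shard, so any single bad shard is recoverable but two
    # bad shards of the same object are not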
    profile = config.get('erasure_code_profile', {
        'k': '2',
        'm': '1',
        'crush-failure-domain': 'osd'
    })
    profile_name = profile.get('name', 'backfill_unfound')
    manager.create_erasure_code_profile(profile_name, profile)
    pool = manager.create_pool_with_unique_name(
        pg_num=1,
        erasure_code_profile_name=profile_name,
        min_size=2)
    manager.raw_cluster_cmd('osd', 'pool', 'set', pool,
                            'pg_autoscale_mode', 'off')

    manager.flush_pg_stats([0, 1, 2, 3])
    manager.wait_for_clean()

    pool_id = manager.get_pool_num(pool)
    pgid = '%d.0' % pool_id
    pgs = manager.get_pg_stats()
    acting = next((pg['acting'] for pg in pgs if pg['pgid'] == pgid), None)
    log.info("acting=%s" % acting)
    assert acting
    primary = acting[0]

    # something that is always there, readable and never empty
    dummyfile = '/etc/group'

    # kludge to make sure the osds get a map
    rados(ctx, mon, ['-p', pool, 'put', 'dummy', dummyfile])

    manager.flush_pg_stats([0, 1])
    manager.wait_for_recovery()

    log.debug("create test object")
    obj = 'test'
    rados(ctx, mon, ['-p', pool, 'put', obj, dummyfile])

    victim = acting[1]

    log.info("remove test object hash info from osd.%s shard and test deep-scrub and repair"
             % victim)
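
    # hinfo_key is the object attribute in which EC pools keep the shard's
    # hash info; stripping it with ceph-objectstore-tool makes the shard
    # appear corrupt to deep-scrub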
    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=victim)
    check_time_now = time.strftime('%s')
    manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
    wait_for_deep_scrub_complete(manager, pgid, check_time_now, True)
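
    # repair should rebuild the missing hash info from the surviving shards
    # and clear the inconsistent flag set by the deep-scrub above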
    check_time_now = time.strftime('%s')
    manager.raw_cluster_cmd('pg', 'repair', pgid)
    wait_for_deep_scrub_complete(manager, pgid, check_time_now, False)

    log.info("remove test object hash info from primary osd.%s shard and test backfill"
             % primary)

    log.debug("write some data")
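    # presumably to give the subsequent backfill real work beyond the
    # single test object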
    rados(ctx, mon, ['-p', pool, 'bench', '30', 'write', '-b', '4096',
                     '--no-cleanup'])

    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=primary)

    # mark the osd out to trigger a rebalance/backfill
    source = acting[1]
    target = [x for x in [0, 1, 2, 3] if x not in acting][0]
    manager.mark_out_osd(source)

    # wait for everything to peer, backfill and recover
    wait_for_backfilling_complete(manager, pgid, source, target)
    manager.wait_for_clean()

    manager.flush_pg_stats([0, 1, 2, 3])
    pgs = manager.get_pg_stats()
    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
    log.debug('pg=%s' % pg)
    assert pg
    assert 'clean' in pg['state'].split('+')
    assert 'inconsistent' not in pg['state'].split('+')
    unfound = manager.get_num_unfound_objects()
    log.debug("there are %d unfound objects" % unfound)
    assert unfound == 0
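
    # swap roles: mark the previously removed osd back in and backfill onto
    # it, this time with the hash info missing from a non-primary shard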
    source, target = target, source
    log.info("remove test object hash info from non-primary osd.%s shard and test backfill"
             % source)

    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=source)

    # mark the osd in to trigger a rebalance/backfill
    manager.mark_in_osd(target)

    # wait for everything to peer, backfill and recover
    wait_for_backfilling_complete(manager, pgid, source, target)
    manager.wait_for_clean()

    manager.flush_pg_stats([0, 1, 2, 3])
    pgs = manager.get_pg_stats()
    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
    log.debug('pg=%s' % pg)
    assert pg
    assert 'clean' in pg['state'].split('+')
    assert 'inconsistent' not in pg['state'].split('+')
    unfound = manager.get_num_unfound_objects()
    log.debug("there are %d unfound objects" % unfound)
    assert unfound == 0

    log.info("remove hash info from two shards and test backfill")
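
    # with only two data shards plus one coding shard, losing the hash info
    # on two shards leaves too few consistent copies to verify or rebuild
    # the object, so backfill should flag it unfound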
    source = acting[2]
    target = [x for x in [0, 1, 2, 3] if x not in acting][0]
    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=primary)
    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=source)

    # mark the osd out to trigger a rebalance/backfill
    manager.mark_out_osd(source)

    # wait for everything to peer, backfill and detect the unfound object
    wait_for_backfilling_complete(manager, pgid, source, target)

    # verify that there is an unfound object
    manager.flush_pg_stats([0, 1, 2, 3])
    pgs = manager.get_pg_stats()
    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
    log.debug('pg=%s' % pg)
    assert pg
    assert 'backfill_unfound' in pg['state'].split('+')
    unfound = manager.get_num_unfound_objects()
    log.debug("there are %d unfound objects" % unfound)
    assert unfound == 1
    m = manager.list_pg_unfound(pgid)
    log.debug('list_pg_unfound=%s' % m)
    assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
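
    # mark stuff lost: give up on the unfound object and delete it so the
    # PG can finish recovering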
    pgs = manager.get_pg_stats()
    manager.raw_cluster_cmd('pg', pgid, 'mark_unfound_lost', 'delete')

    # wait for everything to peer and be happy...
    manager.flush_pg_stats([0, 1, 2, 3])
    manager.wait_for_recovery()