]>
git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/scrub_test.py
10 from tasks
import ceph_manager
11 from teuthology
import misc
as teuthology
13 log
= logging
.getLogger(__name__
)
16 def wait_for_victim_pg(manager
):
17 """Return a PG with some data and its acting set"""
18 # wait for some PG to have data that we can mess with
21 stats
= manager
.get_pg_stats()
23 size
= pg
['stat_sum']['num_bytes']
31 def find_victim_object(ctx
, pg
, osd
):
32 """Return a file to be fuzzed"""
33 (osd_remote
,) = ctx
.cluster
.only('osd.%d' % osd
).remotes
.keys()
34 data_path
= os
.path
.join(
36 'ceph-{id}'.format(id=osd
),
38 '{pg}_head'.format(pg
=pg
),
43 ls_out
= osd_remote
.sh('sudo ls %s' % data_path
)
45 # find an object file we can mess with (and not the pg info object)
46 osdfilename
= next(line
for line
in ls_out
.split('\n')
47 if not line
.endswith('::::head#'))
48 assert osdfilename
is not None
50 # Get actual object name from osd stored filename
51 objname
= osdfilename
.split(':')[4]
52 return osd_remote
, os
.path
.join(data_path
, osdfilename
), objname
55 def corrupt_file(osd_remote
, path
):
56 # put a single \0 at the beginning of the file
61 'bs=1', 'count=1', 'conv=notrunc']
71 def deep_scrub(manager
, victim
, pool
):
72 # scrub, verify inconsistent
73 pgnum
= get_pgnum(victim
)
74 manager
.do_pg_scrub(pool
, pgnum
, 'deep-scrub')
76 stats
= manager
.get_single_pg_stats(victim
)
77 inconsistent
= stats
['state'].find('+inconsistent') != -1
def repair(manager, victim, pool):
    """Repair the victim PG and verify that it is no longer inconsistent.

    :param manager: cluster manager object; must provide ``do_pg_scrub()``
                    and ``get_single_pg_stats()``
    :param victim: id of the PG to repair; its PG number is derived with
                   ``get_pgnum()``
    :param pool: name of the pool the PG belongs to
    :raises AssertionError: if the PG state string still contains
                            ``+inconsistent`` after the repair
    """
    pgnum = get_pgnum(victim)
    manager.do_pg_scrub(pool, pgnum, 'repair')

    # the state is a '+'-joined string of flags; after a successful repair
    # the '+inconsistent' flag must be gone
    state = manager.get_single_pg_stats(victim)['state']
    assert '+inconsistent' not in state, \
        "pg %s is still inconsistent after repair: %s" % (victim, state)
def test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, pool):
    """Corrupt an object file, then deep-scrub and repair its PG.

    Calls ``corrupt_file()`` on ``obj_path`` via ``osd_remote``, followed by
    ``deep_scrub()`` and ``repair()`` on ``pg`` in ``pool``.  ``ctx`` is
    accepted for signature parity with the other tests but is not used.
    """
    corrupt_file(osd_remote, obj_path)
    # scrub first so the corruption is detected, then repair it
    for step in (deep_scrub, repair):
        step(manager, pg, pool)
97 def test_repair_bad_omap(ctx
, manager
, pg
, osd
, objname
):
98 # Test deep-scrub with various omap modifications
99 # Modify omap on specific osd
100 log
.info('fuzzing omap of %s' % objname
)
101 manager
.osd_admin_socket(osd
, ['rmomapkey', 'rbd', objname
, 'key'])
102 manager
.osd_admin_socket(osd
, ['setomapval', 'rbd', objname
,
104 manager
.osd_admin_socket(osd
, ['setomapheader', 'rbd', objname
, 'badhdr'])
106 deep_scrub(manager
, pg
, 'rbd')
    # please note, the repair here is erroneous: it rewrites the correct omap
    # digest and data digest on the replicas with the corresponding digests
    # from the primary osd which is hosting the victim object, see
    # find_victim_object().
    # so we need to either put this test at the end of this task or
    # undo the mess-up manually before the "repair()" that just ensures
    # the cleanup is sane, otherwise the succeeding tests will fail if they
    # try to set "badkey" in the hope of getting an "inconsistent" pg with a
    # deep-scrub.
115 manager
.osd_admin_socket(osd
, ['setomapheader', 'rbd', objname
, 'hdr'])
116 manager
.osd_admin_socket(osd
, ['rmomapkey', 'rbd', objname
, 'badkey'])
117 manager
.osd_admin_socket(osd
, ['setomapval', 'rbd', objname
,
119 repair(manager
, pg
, 'rbd')
123 def __init__(self
, manager
, osd_remote
, pool
, osd_id
,
124 obj_name
, obj_path
, omap_key
, omap_val
):
125 self
.manager
= manager
126 self
.osd
= osd_remote
131 self
.omap_key
= omap_key
132 self
.omap_val
= omap_val
134 @contextlib.contextmanager
135 def _test_with_file(self
, messup_cmd
, *checks
):
136 temp
= tempfile
.mktemp()
137 backup_cmd
= ['sudo', 'cp', os
.path
.join(self
.path
, 'data'), temp
]
138 self
.osd
.run(args
=backup_cmd
)
139 self
.osd
.run(args
=messup_cmd
.split())
141 create_cmd
= ['sudo', 'mkdir', self
.path
]
142 self
.osd
.run(args
=create_cmd
, check_status
=False)
143 restore_cmd
= ['sudo', 'cp', temp
, os
.path
.join(self
.path
, 'data')]
144 self
.osd
.run(args
=restore_cmd
)
147 cmd
= 'sudo rmdir {path}'.format(path
=self
.path
)
148 return self
._test
_with
_file
(cmd
, 'missing')
151 cmd
= 'sudo dd if=/dev/zero of={path}/data bs=1 count=1 ' \
152 'conv=notrunc oflag=append'.format(path
=self
.path
)
153 return self
._test
_with
_file
(cmd
,
154 'data_digest_mismatch',
158 cmd
= 'sudo dd if=/dev/null of={path}/data'.format(path
=self
.path
)
159 return self
._test
_with
_file
(cmd
,
160 'data_digest_mismatch',
def change_obj(self):
    """Overwrite the first byte of the object's data file with a zero byte.

    Builds a ``dd`` command that writes one byte from ``/dev/zero`` to
    ``<self.path>/data`` without truncating it, and hands it to
    ``self._test_with_file()`` together with the name of the check that is
    expected to fire ('data_digest_mismatch').

    :returns: whatever ``self._test_with_file()`` returns
    """
    cmd = ('sudo dd if=/dev/zero of={path}/data bs=1 count=1 '
           'conv=notrunc').format(path=self.path)
    return self._test_with_file(cmd, 'data_digest_mismatch')
169 @contextlib.contextmanager
171 cmd
= ['rmomapkey', self
.pool
, self
.obj
, self
.omap_key
]
172 self
.manager
.osd_admin_socket(self
.osd_id
, cmd
)
173 yield ('omap_digest_mismatch',)
174 cmd
= ['setomapval', self
.pool
, self
.obj
,
175 self
.omap_key
, self
.omap_val
]
176 self
.manager
.osd_admin_socket(self
.osd_id
, cmd
)
178 @contextlib.contextmanager
180 cmd
= ['setomapval', self
.pool
, self
.obj
, 'badkey', 'badval']
181 self
.manager
.osd_admin_socket(self
.osd_id
, cmd
)
182 yield ('omap_digest_mismatch',)
183 cmd
= ['rmomapkey', self
.pool
, self
.obj
, 'badkey']
184 self
.manager
.osd_admin_socket(self
.osd_id
, cmd
)
@contextlib.contextmanager
def change_omap(self):
    """Temporarily replace the value of the object's omap key with 'badval'.

    On entry, sets ``self.omap_key`` of ``self.obj`` in ``self.pool`` to
    'badval' through the admin socket of ``self.osd_id``.  Yields the tuple
    of check names expected to fire, ``('omap_digest_mismatch',)``.  On
    exit, sets the key back to ``self.omap_val``.

    The restore runs in a ``finally`` clause so that the original value is
    put back even when the body of the ``with`` statement raises.
    """
    mess_up = ['setomapval', self.pool, self.obj, self.omap_key, 'badval']
    self.manager.osd_admin_socket(self.osd_id, mess_up)
    try:
        yield ('omap_digest_mismatch',)
    finally:
        restore = ['setomapval', self.pool, self.obj,
                   self.omap_key, self.omap_val]
        self.manager.osd_admin_socket(self.osd_id, restore)
195 class InconsistentObjChecker
:
196 """Check the returned inconsistents/inconsistent info"""
198 def __init__(self
, osd
, acting
, obj_name
):
202 assert self
.osd
in self
.acting
def basic_checks(self, inc):
    """Sanity-check one entry of the ``inconsistents`` list.

    :param inc: dict with an ``object`` dict (``name`` and ``snap`` keys)
                and a ``shards`` list
    :raises AssertionError: if the object name differs from ``self.obj``,
                            the snap is not "head", or the number of shards
                            differs from the size of ``self.acting``
    """
    obj = inc['object']
    assert obj['name'] == self.obj, \
        "unexpected object name %r, expected %r" % (obj['name'], self.obj)
    assert obj['snap'] == "head", \
        "unexpected snap %r, expected 'head'" % (obj['snap'],)
    assert len(inc['shards']) == len(self.acting), \
        "the number of returned shard does not match with the acting set"
210 def run(self
, check
, inc
):
211 func
= getattr(self
, check
)
214 def _check_errors(self
, inc
, err_name
):
217 for shard
in inc
['shards']:
218 log
.info('shard = %r' % shard
)
219 log
.info('err = %s' % err_name
)
220 assert 'osd' in shard
222 err
= err_name
in shard
['errors']
224 assert bad_found
is False, \
225 "multiple entries found for the given OSD"
226 assert err
is True, \
227 "Didn't find '{err}' in errors".format(err
=err_name
)
230 assert osd
in self
.acting
, "shard not in acting set"
231 assert err
is False, \
232 "Expected '{err}' in errors".format(err
=err_name
)
234 assert bad_found
is True, \
235 "Shard for osd.{osd} not found".format(osd
=self
.osd
)
236 assert good_found
is True, \
237 "No other acting shards found"
239 def _check_attrs(self
, inc
, attr_name
):
242 for shard
in inc
['shards']:
243 log
.info('shard = %r' % shard
)
244 log
.info('attr = %s' % attr_name
)
245 assert 'osd' in shard
247 attr
= shard
.get(attr_name
, False)
249 assert bad_attr
is None, \
250 "multiple entries found for the given OSD"
253 assert osd
in self
.acting
, "shard not in acting set"
254 assert good_attr
is None or good_attr
== attr
, \
255 "multiple good attrs found"
257 assert bad_attr
is not None, \
258 "bad {attr} not found".format(attr
=attr_name
)
259 assert good_attr
is not None, \
260 "good {attr} not found".format(attr
=attr_name
)
261 assert good_attr
!= bad_attr
, \
262 "bad attr is identical to the good ones: " \
263 "{0} == {1}".format(good_attr
, bad_attr
)
def data_digest_mismatch(self, inc):
    """Check an inconsistency report for a data digest mismatch.

    Requires 'data_digest_mismatch' to be listed in ``inc['errors']``, then
    passes ``inc`` to ``self._check_attrs()`` for the 'data_digest'
    attribute.

    :raises AssertionError: if the error is not listed
    """
    assert 'data_digest_mismatch' in inc['errors'], \
        "'data_digest_mismatch' not found in errors: %r" % (inc['errors'],)
    self._check_attrs(inc, 'data_digest')
def missing(self, inc):
    """Check an inconsistency report for a missing shard.

    Requires 'missing' to be listed in ``inc['union_shard_errors']``, then
    passes ``inc`` to ``self._check_errors()`` for the 'missing' error.

    :raises AssertionError: if the error is not listed
    """
    assert 'missing' in inc['union_shard_errors'], \
        "'missing' not found in union_shard_errors: %r" % (
            inc['union_shard_errors'],)
    self._check_errors(inc, 'missing')
def size_mismatch(self, inc):
    """Check an inconsistency report for a size mismatch.

    Requires 'size_mismatch' to be listed in ``inc['errors']``, then passes
    ``inc`` to ``self._check_attrs()`` for the 'size' attribute.

    :raises AssertionError: if the error is not listed
    """
    assert 'size_mismatch' in inc['errors'], \
        "'size_mismatch' not found in errors: %r" % (inc['errors'],)
    self._check_attrs(inc, 'size')
def omap_digest_mismatch(self, inc):
    """Check an inconsistency report for an omap digest mismatch.

    Requires 'omap_digest_mismatch' to be listed in ``inc['errors']``, then
    passes ``inc`` to ``self._check_attrs()`` for the 'omap_digest'
    attribute.

    :raises AssertionError: if the error is not listed
    """
    assert 'omap_digest_mismatch' in inc['errors'], \
        "'omap_digest_mismatch' not found in errors: %r" % (inc['errors'],)
    self._check_attrs(inc, 'omap_digest')
282 def test_list_inconsistent_obj(ctx
, manager
, osd_remote
, pg
, acting
, osd_id
,
284 mon
= manager
.controller
288 manager
.do_rados(mon
, ['-p', pool
, 'setomapval', obj_name
,
290 # Update missing digests, requires "osd deep scrub update digest min age: 0"
291 pgnum
= get_pgnum(pg
)
292 manager
.do_pg_scrub(pool
, pgnum
, 'deep-scrub')
294 messup
= MessUp(manager
, osd_remote
, pool
, osd_id
, obj_name
, obj_path
,
296 for test
in [messup
.rm_omap
, messup
.add_omap
, messup
.change_omap
,
297 messup
.append
, messup
.truncate
, messup
.change_obj
,
299 with
test() as checks
:
300 deep_scrub(manager
, pg
, pool
)
301 cmd
= 'rados list-inconsistent-pg {pool} ' \
302 '--format=json'.format(pool
=pool
)
303 pgs
= json
.loads(mon
.sh(cmd
))
306 cmd
= 'rados list-inconsistent-obj {pg} ' \
307 '--format=json'.format(pg
=pg
)
308 objs
= json
.loads(mon
.sh(cmd
))
309 assert len(objs
['inconsistents']) == 1
311 checker
= InconsistentObjChecker(osd_id
, acting
, obj_name
)
312 inc_obj
= objs
['inconsistents'][0]
313 log
.info('inc = %r', inc_obj
)
314 checker
.basic_checks(inc_obj
)
316 checker
.run(check
, inc_obj
)
319 def task(ctx
, config
):
331 - deep-scrub 0 missing, 1 inconsistent objects
332 - deep-scrub [0-9]+ errors
333 - repair 0 missing, 1 inconsistent objects
334 - repair [0-9]+ errors, [0-9]+ fixed
335 - shard [0-9]+ .* : missing
336 - deep-scrub 1 missing, 1 inconsistent objects
337 - does not match object info size
338 - attr name mistmatch
339 - deep-scrub 1 missing, 0 inconsistent objects
340 - failed to pick suitable auth object
341 - candidate size [0-9]+ info size [0-9]+ mismatch
344 osd deep scrub update digest min age: 0
349 assert isinstance(config
, dict), \
350 'scrub_test task only accepts a dict for configuration'
351 first_mon
= teuthology
.get_first_mon(ctx
, config
)
352 (mon
,) = ctx
.cluster
.only(first_mon
).remotes
.keys()
354 num_osds
= teuthology
.num_instances_of_type(ctx
.cluster
, 'osd')
355 log
.info('num_osds is %s' % num_osds
)
357 manager
= ceph_manager
.CephManager(
360 logger
=log
.getChild('ceph_manager'),
363 while len(manager
.get_osd_status()['up']) < num_osds
:
366 for i
in range(num_osds
):
367 manager
.raw_cluster_cmd('tell', 'osd.%d' % i
, 'injectargs',
368 '--', '--osd-objectstore-fuse')
369 manager
.flush_pg_stats(range(num_osds
))
370 manager
.wait_for_clean()
373 p
= manager
.do_rados(mon
, ['-p', 'rbd', 'bench', '--no-cleanup', '1',
374 'write', '-b', '4096'])
375 log
.info('err is %d' % p
.exitstatus
)
377 # wait for some PG to have data that we can mess with
378 pg
, acting
= wait_for_victim_pg(manager
)
381 osd_remote
, obj_path
, obj_name
= find_victim_object(ctx
, pg
, osd
)
382 manager
.do_rados(mon
, ['-p', 'rbd', 'setomapval', obj_name
, 'key', 'val'])
383 log
.info('err is %d' % p
.exitstatus
)
384 manager
.do_rados(mon
, ['-p', 'rbd', 'setomapheader', obj_name
, 'hdr'])
385 log
.info('err is %d' % p
.exitstatus
)
387 # Update missing digests, requires "osd deep scrub update digest min age: 0"
388 pgnum
= get_pgnum(pg
)
389 manager
.do_pg_scrub('rbd', pgnum
, 'deep-scrub')
391 log
.info('messing with PG %s on osd %d' % (pg
, osd
))
392 test_repair_corrupted_obj(ctx
, manager
, pg
, osd_remote
, obj_path
, 'rbd')
393 test_repair_bad_omap(ctx
, manager
, pg
, osd
, obj_name
)
394 test_list_inconsistent_obj(ctx
, manager
, osd_remote
, pg
, acting
, osd
,
396 log
.info('test successful!')
398 # shut down fuse mount
399 for i
in range(num_osds
):
400 manager
.raw_cluster_cmd('tell', 'osd.%d' % i
, 'injectargs',
401 '--', '--no-osd-objectstore-fuse')