]>
git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/scrub_test.py
2 from cStringIO
import StringIO
12 from teuthology
import misc
as teuthology
# Module-level logger for this teuthology task.
log = logging.getLogger(__name__)
17 def wait_for_victim_pg(manager
):
18 """Return a PG with some data and its acting set"""
19 # wait for some PG to have data that we can mess with
22 stats
= manager
.get_pg_stats()
24 size
= pg
['stat_sum']['num_bytes']
32 def find_victim_object(ctx
, pg
, osd
):
33 """Return a file to be fuzzed"""
34 (osd_remote
,) = ctx
.cluster
.only('osd.%d' % osd
).remotes
.iterkeys()
35 data_path
= os
.path
.join(
37 'ceph-{id}'.format(id=osd
),
39 '{pg}_head'.format(pg
=pg
),
44 with contextlib
.closing(StringIO()) as ls_fp
:
46 args
=['sudo', 'ls', data_path
],
49 ls_out
= ls_fp
.getvalue()
51 # find an object file we can mess with (and not the pg info object)
52 osdfilename
= next(line
for line
in ls_out
.split('\n')
53 if not line
.endswith('::::head#'))
54 assert osdfilename
is not None
56 # Get actual object name from osd stored filename
57 objname
= osdfilename
.split(':')[4]
58 return osd_remote
, os
.path
.join(data_path
, osdfilename
), objname
61 def corrupt_file(osd_remote
, path
):
62 # put a single \0 at the beginning of the file
67 'bs=1', 'count=1', 'conv=notrunc']
def deep_scrub(manager, victim, pool):
    """Deep-scrub the victim PG and verify it is flagged inconsistent.

    :param manager: CephManager used to drive the scrub and read PG stats
    :param victim: PG id (e.g. '2.0') holding the messed-up object
    :param pool: name of the pool the PG belongs to
    """
    # scrub, verify inconsistent
    pgnum = get_pgnum(victim)
    manager.do_pg_scrub(pool, pgnum, 'deep-scrub')

    stats = manager.get_single_pg_stats(victim)
    inconsistent = stats['state'].find('+inconsistent') != -1
    # The deep-scrub must have detected the corruption: `inconsistent` was
    # computed but never checked, even though the comment above promises
    # verification; this mirrors repair(), which asserts the flag is cleared.
    assert inconsistent
def repair(manager, victim, pool):
    """Repair the victim PG and check the inconsistent flag is gone.

    :param manager: CephManager used to drive the repair and read PG stats
    :param victim: PG id of the previously-inconsistent PG
    :param pool: name of the pool the PG belongs to
    """
    # repair, verify no longer inconsistent
    pg_num = get_pgnum(victim)
    manager.do_pg_scrub(pool, pg_num, 'repair')

    pg_stats = manager.get_single_pg_stats(victim)
    # After a successful repair, '+inconsistent' must no longer appear in
    # the PG's state string.
    assert '+inconsistent' not in pg_stats['state']
def test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, pool):
    """Corrupt an object's on-disk data, then verify that a deep-scrub
    flags the PG inconsistent and that a repair clears the flag again.

    :param ctx: teuthology run context (unused here, kept for a uniform
                test signature)
    :param manager: CephManager driving scrub/repair
    :param pg: PG id holding the victim object
    :param osd_remote: remote of the OSD whose copy gets corrupted
    :param obj_path: filesystem path of the object file on that OSD
    :param pool: pool the PG belongs to
    """
    # Clobber the start of the object file on one OSD only.
    corrupt_file(osd_remote, obj_path)
    # deep_scrub() asserts the PG becomes '+inconsistent'.
    deep_scrub(manager, pg, pool)
    # repair() asserts the flag is cleared afterwards.
    repair(manager, pg, pool)
103 def test_repair_bad_omap(ctx
, manager
, pg
, osd
, objname
):
104 # Test deep-scrub with various omap modifications
105 # Modify omap on specific osd
106 log
.info('fuzzing omap of %s' % objname
)
107 manager
.osd_admin_socket(osd
, ['rmomapkey', 'rbd', objname
, 'key'])
108 manager
.osd_admin_socket(osd
, ['setomapval', 'rbd', objname
,
110 manager
.osd_admin_socket(osd
, ['setomapheader', 'rbd', objname
, 'badhdr'])
112 deep_scrub(manager
, pg
, 'rbd')
    # Please note: the repair here is erroneous -- it rewrites the correct
    # omap digest and data digest on the replicas with the corresponding
    # digests from the primary OSD which is hosting the victim object; see
    # find_victim_object().
    # So we need to either put this test at the end of this task, or undo
    # the mess-up manually before the "repair()" call that just ensures the
    # cleanup is sane; otherwise the succeeding tests will fail if they try
    # to set "badkey" in hope of getting an "inconsistent" pg with a deep-scrub.
121 manager
.osd_admin_socket(osd
, ['setomapheader', 'rbd', objname
, 'hdr'])
122 manager
.osd_admin_socket(osd
, ['rmomapkey', 'rbd', objname
, 'badkey'])
123 manager
.osd_admin_socket(osd
, ['setomapval', 'rbd', objname
,
125 repair(manager
, pg
, 'rbd')
129 def __init__(self
, manager
, osd_remote
, pool
, osd_id
,
130 obj_name
, obj_path
, omap_key
, omap_val
):
131 self
.manager
= manager
132 self
.osd
= osd_remote
137 self
.omap_key
= omap_key
138 self
.omap_val
= omap_val
140 @contextlib.contextmanager
141 def _test_with_file(self
, messup_cmd
, *checks
):
142 temp
= tempfile
.mktemp()
143 backup_cmd
= ['sudo', 'cp', os
.path
.join(self
.path
, 'data'), temp
]
144 self
.osd
.run(args
=backup_cmd
)
145 self
.osd
.run(args
=messup_cmd
.split())
147 create_cmd
= ['sudo', 'mkdir', self
.path
]
148 self
.osd
.run(args
=create_cmd
, check_status
=False)
149 restore_cmd
= ['sudo', 'cp', temp
, os
.path
.join(self
.path
, 'data')]
150 self
.osd
.run(args
=restore_cmd
)
153 cmd
= 'sudo rmdir {path}'.format(path
=self
.path
)
154 return self
._test
_with
_file
(cmd
, 'missing')
157 cmd
= 'sudo dd if=/dev/zero of={path}/data bs=1 count=1 ' \
158 'conv=notrunc oflag=append'.format(path
=self
.path
)
159 return self
._test
_with
_file
(cmd
,
160 'data_digest_mismatch',
164 cmd
= 'sudo dd if=/dev/null of={path}/data'.format(path
=self
.path
)
165 return self
._test
_with
_file
(cmd
,
166 'data_digest_mismatch',
169 def change_obj(self
):
170 cmd
= 'sudo dd if=/dev/zero of={path}/data bs=1 count=1 ' \
171 'conv=notrunc'.format(path
=self
.path
)
172 return self
._test
_with
_file
(cmd
,
173 'data_digest_mismatch')
175 @contextlib.contextmanager
177 cmd
= ['rmomapkey', self
.pool
, self
.obj
, self
.omap_key
]
178 self
.manager
.osd_admin_socket(self
.osd_id
, cmd
)
179 yield ('omap_digest_mismatch',)
180 cmd
= ['setomapval', self
.pool
, self
.obj
,
181 self
.omap_key
, self
.omap_val
]
182 self
.manager
.osd_admin_socket(self
.osd_id
, cmd
)
184 @contextlib.contextmanager
186 cmd
= ['setomapval', self
.pool
, self
.obj
, 'badkey', 'badval']
187 self
.manager
.osd_admin_socket(self
.osd_id
, cmd
)
188 yield ('omap_digest_mismatch',)
189 cmd
= ['rmomapkey', self
.pool
, self
.obj
, 'badkey']
190 self
.manager
.osd_admin_socket(self
.osd_id
, cmd
)
192 @contextlib.contextmanager
193 def change_omap(self
):
194 cmd
= ['setomapval', self
.pool
, self
.obj
, self
.omap_key
, 'badval']
195 self
.manager
.osd_admin_socket(self
.osd_id
, cmd
)
196 yield ('omap_digest_mismatch',)
197 cmd
= ['setomapval', self
.pool
, self
.obj
, self
.omap_key
, self
.omap_val
]
198 self
.manager
.osd_admin_socket(self
.osd_id
, cmd
)
201 class InconsistentObjChecker
:
202 """Check the returned inconsistents/inconsistent info"""
204 def __init__(self
, osd
, acting
, obj_name
):
208 assert self
.osd
in self
.acting
210 def basic_checks(self
, inc
):
211 assert inc
['object']['name'] == self
.obj
212 assert inc
['object']['snap'] == "head"
213 assert len(inc
['shards']) == len(self
.acting
), \
214 "the number of returned shard does not match with the acting set"
216 def run(self
, check
, inc
):
217 func
= getattr(self
, check
)
220 def _check_errors(self
, inc
, err_name
):
223 for shard
in inc
['shards']:
224 log
.info('shard = %r' % shard
)
225 log
.info('err = %s' % err_name
)
226 assert 'osd' in shard
228 err
= err_name
in shard
['errors']
230 assert bad_found
is False, \
231 "multiple entries found for the given OSD"
232 assert err
is True, \
233 "Didn't find '{err}' in errors".format(err
=err_name
)
236 assert osd
in self
.acting
, "shard not in acting set"
237 assert err
is False, \
238 "Expected '{err}' in errors".format(err
=err_name
)
240 assert bad_found
is True, \
241 "Shard for osd.{osd} not found".format(osd
=self
.osd
)
242 assert good_found
is True, \
243 "No other acting shards found"
245 def _check_attrs(self
, inc
, attr_name
):
248 for shard
in inc
['shards']:
249 log
.info('shard = %r' % shard
)
250 log
.info('attr = %s' % attr_name
)
251 assert 'osd' in shard
253 attr
= shard
.get(attr_name
, False)
255 assert bad_attr
is None, \
256 "multiple entries found for the given OSD"
259 assert osd
in self
.acting
, "shard not in acting set"
260 assert good_attr
is None or good_attr
== attr
, \
261 "multiple good attrs found"
263 assert bad_attr
is not None, \
264 "bad {attr} not found".format(attr
=attr_name
)
265 assert good_attr
is not None, \
266 "good {attr} not found".format(attr
=attr_name
)
267 assert good_attr
!= bad_attr
, \
268 "bad attr is identical to the good ones: " \
269 "{0} == {1}".format(good_attr
, bad_attr
)
271 def data_digest_mismatch(self
, inc
):
272 assert 'data_digest_mismatch' in inc
['errors']
273 self
._check
_attrs
(inc
, 'data_digest')
275 def missing(self
, inc
):
276 assert 'missing' in inc
['union_shard_errors']
277 self
._check
_errors
(inc
, 'missing')
279 def size_mismatch(self
, inc
):
280 assert 'size_mismatch' in inc
['errors']
281 self
._check
_attrs
(inc
, 'size')
283 def omap_digest_mismatch(self
, inc
):
284 assert 'omap_digest_mismatch' in inc
['errors']
285 self
._check
_attrs
(inc
, 'omap_digest')
288 def test_list_inconsistent_obj(ctx
, manager
, osd_remote
, pg
, acting
, osd_id
,
290 mon
= manager
.controller
294 manager
.do_rados(mon
, ['-p', pool
, 'setomapval', obj_name
,
296 # Update missing digests, requires "osd deep scrub update digest min age: 0"
297 pgnum
= get_pgnum(pg
)
298 manager
.do_pg_scrub(pool
, pgnum
, 'deep-scrub')
300 messup
= MessUp(manager
, osd_remote
, pool
, osd_id
, obj_name
, obj_path
,
302 for test
in [messup
.rm_omap
, messup
.add_omap
, messup
.change_omap
,
303 messup
.append
, messup
.truncate
, messup
.change_obj
,
305 with
test() as checks
:
306 deep_scrub(manager
, pg
, pool
)
307 cmd
= 'rados list-inconsistent-pg {pool} ' \
308 '--format=json'.format(pool
=pool
)
309 with contextlib
.closing(StringIO()) as out
:
310 mon
.run(args
=cmd
.split(), stdout
=out
)
311 pgs
= json
.loads(out
.getvalue())
314 cmd
= 'rados list-inconsistent-obj {pg} ' \
315 '--format=json'.format(pg
=pg
)
316 with contextlib
.closing(StringIO()) as out
:
317 mon
.run(args
=cmd
.split(), stdout
=out
)
318 objs
= json
.loads(out
.getvalue())
319 assert len(objs
['inconsistents']) == 1
321 checker
= InconsistentObjChecker(osd_id
, acting
, obj_name
)
322 inc_obj
= objs
['inconsistents'][0]
323 log
.info('inc = %r', inc_obj
)
324 checker
.basic_checks(inc_obj
)
326 checker
.run(check
, inc_obj
)
329 def task(ctx
, config
):
341 - deep-scrub 0 missing, 1 inconsistent objects
342 - deep-scrub [0-9]+ errors
343 - repair 0 missing, 1 inconsistent objects
344 - repair [0-9]+ errors, [0-9]+ fixed
345 - shard [0-9]+ missing
346 - deep-scrub 1 missing, 1 inconsistent objects
347 - does not match object info size
348 - attr name mistmatch
349 - deep-scrub 1 missing, 0 inconsistent objects
350 - failed to pick suitable auth object
353 osd deep scrub update digest min age: 0
358 assert isinstance(config
, dict), \
359 'scrub_test task only accepts a dict for configuration'
360 first_mon
= teuthology
.get_first_mon(ctx
, config
)
361 (mon
,) = ctx
.cluster
.only(first_mon
).remotes
.iterkeys()
363 num_osds
= teuthology
.num_instances_of_type(ctx
.cluster
, 'osd')
364 log
.info('num_osds is %s' % num_osds
)
366 manager
= ceph_manager
.CephManager(
369 logger
=log
.getChild('ceph_manager'),
372 while len(manager
.get_osd_status()['up']) < num_osds
:
375 for i
in range(num_osds
):
376 manager
.raw_cluster_cmd('tell', 'osd.%d' % i
, 'injectargs',
377 '--', '--osd-objectstore-fuse')
378 for i
in range(num_osds
):
379 manager
.raw_cluster_cmd('tell', 'osd.%d' % i
, 'flush_pg_stats')
380 manager
.wait_for_clean()
383 p
= manager
.do_rados(mon
, ['-p', 'rbd', 'bench', '--no-cleanup', '1',
384 'write', '-b', '4096'])
385 log
.info('err is %d' % p
.exitstatus
)
387 # wait for some PG to have data that we can mess with
388 pg
, acting
= wait_for_victim_pg(manager
)
391 osd_remote
, obj_path
, obj_name
= find_victim_object(ctx
, pg
, osd
)
392 manager
.do_rados(mon
, ['-p', 'rbd', 'setomapval', obj_name
, 'key', 'val'])
393 log
.info('err is %d' % p
.exitstatus
)
394 manager
.do_rados(mon
, ['-p', 'rbd', 'setomapheader', obj_name
, 'hdr'])
395 log
.info('err is %d' % p
.exitstatus
)
397 # Update missing digests, requires "osd deep scrub update digest min age: 0"
398 pgnum
= get_pgnum(pg
)
399 manager
.do_pg_scrub('rbd', pgnum
, 'deep-scrub')
401 log
.info('messing with PG %s on osd %d' % (pg
, osd
))
402 test_repair_corrupted_obj(ctx
, manager
, pg
, osd_remote
, obj_path
, 'rbd')
403 test_repair_bad_omap(ctx
, manager
, pg
, osd
, obj_name
)
404 test_list_inconsistent_obj(ctx
, manager
, osd_remote
, pg
, acting
, osd
,
406 log
.info('test successful!')
408 # shut down fuse mount
409 for i
in range(num_osds
):
410 manager
.raw_cluster_cmd('tell', 'osd.%d' % i
, 'injectargs',
411 '--', '--no-osd-objectstore-fuse')