# ceph/qa/tasks/scrub_test.py
import contextlib
import json
import logging
import os
import tempfile
import time

from tasks import ceph_manager
from teuthology import misc as teuthology

log = logging.getLogger(__name__)

def wait_for_victim_pg(manager, poolid):
    """Return a PG with some data and its acting set"""
    # wait for some PG to have data that we can mess with
    while True:
        stats = manager.get_pg_stats()
        for pg in stats:
            pgid = str(pg['pgid'])
            pgpool = int(pgid.split('.')[0])
            if pgpool != poolid:
                continue
            size = pg['stat_sum']['num_bytes']
            if size > 0:
                return pg['pgid'], pg['acting']
        time.sleep(3)
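
# Several helpers below call get_pgnum(), whose definition is not shown in
# this excerpt.  A minimal sketch, assuming a pgid of the usual
# "<pool>.<pgnum>" form parsed the same way as above:
def get_pgnum(pgid):
    # "2.1a" -> "1a"; do_pg_scrub() takes the pool name and the pg number
    # separately, so only the part after the dot is needed here.
    return pgid.split('.')[1]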

def find_victim_object(ctx, pg, osd):
    """Return a file to be fuzzed"""
    (osd_remote,) = ctx.cluster.only('osd.%d' % osd).remotes.keys()
    data_path = os.path.join(
        'ceph-{id}'.format(id=osd),
        '{pg}_head'.format(pg=pg),
    )
    ls_out = osd_remote.sh('sudo ls %s' % data_path)

    # find an object file we can mess with (and not the pg info object)
    osdfilename = next(line for line in ls_out.split('\n')
                       if not line.endswith('::::head#'))
    assert osdfilename is not None

    # Get actual object name from osd stored filename
    objname = osdfilename.split(':')[4]
    return osd_remote, os.path.join(data_path, osdfilename), objname
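
# For orientation: a hypothetical listing entry in the objectstore FUSE mount
# looks roughly like
#   "#2:f9ee5ed3:::benchmark_data_host_1234_object5:head#"
# (the exact field layout is an assumption here).  Splitting on ':' puts the
# RADOS object name at index 4, while the pg meta object, whose name is empty
# ("::::head#"), is skipped by the filter above.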

def corrupt_file(osd_remote, path):
    # put a single \0 at the beginning of the file
    osd_remote.run(args=['sudo', 'dd', 'if=/dev/zero', 'of=%s/data' % path,
                         'bs=1', 'count=1', 'conv=notrunc'])

def deep_scrub(manager, victim, pool):
    # scrub, verify inconsistent
    pgnum = get_pgnum(victim)
    manager.do_pg_scrub(pool, pgnum, 'deep-scrub')

    stats = manager.get_single_pg_stats(victim)
    inconsistent = stats['state'].find('+inconsistent') != -1
    assert inconsistent

def repair(manager, victim, pool):
    # repair, verify no longer inconsistent
    pgnum = get_pgnum(victim)
    manager.do_pg_scrub(pool, pgnum, 'repair')

    stats = manager.get_single_pg_stats(victim)
    inconsistent = stats['state'].find('+inconsistent') != -1
    assert not inconsistent

def test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, pool):
    corrupt_file(osd_remote, obj_path)
    deep_scrub(manager, pg, pool)
    repair(manager, pg, pool)

def test_repair_bad_omap(ctx, manager, pg, osd, objname):
    # Test deep-scrub with various omap modifications
    # Modify omap on specific osd
    log.info('fuzzing omap of %s' % objname)
    manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'key'])
    manager.osd_admin_socket(osd, ['setomapval', 'rbd', objname,
                                   'badkey', 'badval'])
    manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'badhdr'])

    deep_scrub(manager, pg, 'rbd')
    # please note, the repair here is erroneous: it overwrites the correct
    # omap digest and data digest on the replicas with the corresponding
    # digests from the primary osd, which is the one hosting the victim
    # object, see find_victim_object().
    # so we need to either put this test at the end of this task, or undo
    # the mess-up manually before the "repair()" below, which just ensures
    # the cleanup is sane.  otherwise the succeeding tests will fail if
    # they set "badkey" hoping to get an "inconsistent" pg from a
    # deep-scrub.
    manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'hdr'])
    manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'badkey'])
    manager.osd_admin_socket(osd, ['setomapval', 'rbd', objname,
                                   'key', 'val'])
    repair(manager, pg, 'rbd')

class MessUp:
    def __init__(self, manager, osd_remote, pool, osd_id,
                 obj_name, obj_path, omap_key, omap_val):
        self.manager = manager
        self.osd = osd_remote
        self.pool = pool
        self.osd_id = osd_id
        self.obj = obj_name
        self.path = obj_path
        self.omap_key = omap_key
        self.omap_val = omap_val

    @contextlib.contextmanager
    def _test_with_file(self, messup_cmd, *checks):
        temp = tempfile.mktemp()
        backup_cmd = ['sudo', 'cp', os.path.join(self.path, 'data'), temp]
        self.osd.run(args=backup_cmd)
        self.osd.run(args=messup_cmd.split())
        yield checks
        # the mess-up may have removed the object dir (see remove() below),
        # so recreate it before restoring the backed-up data file
        create_cmd = ['sudo', 'mkdir', self.path]
        self.osd.run(args=create_cmd, check_status=False)
        restore_cmd = ['sudo', 'cp', temp, os.path.join(self.path, 'data')]
        self.osd.run(args=restore_cmd)

    def remove(self):
        cmd = 'sudo rmdir {path}'.format(path=self.path)
        return self._test_with_file(cmd, 'missing')

    def append(self):
        cmd = 'sudo dd if=/dev/zero of={path}/data bs=1 count=1 ' \
              'conv=notrunc oflag=append'.format(path=self.path)
        return self._test_with_file(cmd,
                                    'data_digest_mismatch',
                                    'size_mismatch')

    def truncate(self):
        cmd = 'sudo dd if=/dev/null of={path}/data'.format(path=self.path)
        return self._test_with_file(cmd,
                                    'data_digest_mismatch',
                                    'size_mismatch')

    def change_obj(self):
        cmd = 'sudo dd if=/dev/zero of={path}/data bs=1 count=1 ' \
              'conv=notrunc'.format(path=self.path)
        return self._test_with_file(cmd,
                                    'data_digest_mismatch')

    @contextlib.contextmanager
    def rm_omap(self):
        cmd = ['rmomapkey', self.pool, self.obj, self.omap_key]
        self.manager.osd_admin_socket(self.osd_id, cmd)
        yield ('omap_digest_mismatch',)
        cmd = ['setomapval', self.pool, self.obj,
               self.omap_key, self.omap_val]
        self.manager.osd_admin_socket(self.osd_id, cmd)

    @contextlib.contextmanager
    def add_omap(self):
        cmd = ['setomapval', self.pool, self.obj, 'badkey', 'badval']
        self.manager.osd_admin_socket(self.osd_id, cmd)
        yield ('omap_digest_mismatch',)
        cmd = ['rmomapkey', self.pool, self.obj, 'badkey']
        self.manager.osd_admin_socket(self.osd_id, cmd)

    @contextlib.contextmanager
    def change_omap(self):
        cmd = ['setomapval', self.pool, self.obj, self.omap_key, 'badval']
        self.manager.osd_admin_socket(self.osd_id, cmd)
        yield ('omap_digest_mismatch',)
        cmd = ['setomapval', self.pool, self.obj, self.omap_key, self.omap_val]
        self.manager.osd_admin_socket(self.osd_id, cmd)
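
# Usage sketch for MessUp (hypothetical values): each mess-up method is a
# context manager that corrupts the victim object on one OSD, yields the
# error names a subsequent deep scrub is expected to report, and restores
# the object on exit.
#
#   messup = MessUp(manager, osd_remote, 'rbd', 0,
#                   'some_object', '/path/to/object/dir', 'key', 'val')
#   with messup.truncate() as checks:
#       deep_scrub(manager, pg, 'rbd')
#       # expect the yielded checks to show up in list-inconsistent-obj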

class InconsistentObjChecker:
    """Check the returned inconsistents/inconsistent info"""

    def __init__(self, osd, acting, obj_name):
        self.osd = osd
        self.acting = acting
        self.obj = obj_name
        assert self.osd in self.acting

    def basic_checks(self, inc):
        assert inc['object']['name'] == self.obj
        assert inc['object']['snap'] == "head"
        assert len(inc['shards']) == len(self.acting), \
            "the number of returned shards does not match the acting set"

    def run(self, check, inc):
        func = getattr(self, check)
        func(inc)

    def _check_errors(self, inc, err_name):
        bad_found = False
        good_found = False
        for shard in inc['shards']:
            log.info('shard = %r' % shard)
            log.info('err = %s' % err_name)
            assert 'osd' in shard
            osd = shard['osd']
            err = err_name in shard['errors']
            if osd == self.osd:
                assert bad_found is False, \
                    "multiple entries found for the given OSD"
                assert err is True, \
                    "Didn't find '{err}' in errors".format(err=err_name)
                bad_found = True
            else:
                assert osd in self.acting, "shard not in acting set"
                assert err is False, \
                    "Unexpected '{err}' in errors".format(err=err_name)
                good_found = True
        assert bad_found is True, \
            "Shard for osd.{osd} not found".format(osd=self.osd)
        assert good_found is True, \
            "No other acting shards found"

    def _check_attrs(self, inc, attr_name):
        bad_attr = None
        good_attr = None
        for shard in inc['shards']:
            log.info('shard = %r' % shard)
            log.info('attr = %s' % attr_name)
            assert 'osd' in shard
            osd = shard['osd']
            attr = shard.get(attr_name, False)
            if osd == self.osd:
                assert bad_attr is None, \
                    "multiple entries found for the given OSD"
                bad_attr = attr
            else:
                assert osd in self.acting, "shard not in acting set"
                assert good_attr is None or good_attr == attr, \
                    "multiple good attrs found"
                good_attr = attr
        assert bad_attr is not None, \
            "bad {attr} not found".format(attr=attr_name)
        assert good_attr is not None, \
            "good {attr} not found".format(attr=attr_name)
        assert good_attr != bad_attr, \
            "bad attr is identical to the good ones: " \
            "{0} == {1}".format(good_attr, bad_attr)

    def data_digest_mismatch(self, inc):
        assert 'data_digest_mismatch' in inc['errors']
        self._check_attrs(inc, 'data_digest')

    def missing(self, inc):
        assert 'missing' in inc['union_shard_errors']
        self._check_errors(inc, 'missing')

    def size_mismatch(self, inc):
        assert 'size_mismatch' in inc['errors']
        self._check_attrs(inc, 'size')

    def omap_digest_mismatch(self, inc):
        assert 'omap_digest_mismatch' in inc['errors']
        self._check_attrs(inc, 'omap_digest')
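
# For orientation, an entry of "rados list-inconsistent-obj" that the checker
# above consumes looks roughly like the following (illustrative values only;
# the exact field set depends on the Ceph release):
#
#   {
#     "object": {"name": "some_object", "snap": "head", ...},
#     "errors": ["omap_digest_mismatch"],
#     "union_shard_errors": [],
#     "shards": [
#       {"osd": 0, "errors": [], "omap_digest": "0x12345678", ...},
#       {"osd": 1, "errors": [], "omap_digest": "0x87654321", ...}
#     ]
#   }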

def test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd_id,
                               obj_name, obj_path):
    mon = manager.controller
    pool = 'rbd'
    omap_key = 'key'
    omap_val = 'val'
    manager.do_rados(['setomapval', obj_name, omap_key, omap_val], pool=pool)
    # Update missing digests, requires "osd deep scrub update digest min age: 0"
    pgnum = get_pgnum(pg)
    manager.do_pg_scrub(pool, pgnum, 'deep-scrub')

    messup = MessUp(manager, osd_remote, pool, osd_id, obj_name, obj_path,
                    omap_key, omap_val)
    for test in [messup.rm_omap, messup.add_omap, messup.change_omap,
                 messup.append, messup.truncate, messup.change_obj,
                 messup.remove]:
        with test() as checks:
            deep_scrub(manager, pg, pool)
            cmd = 'rados list-inconsistent-pg {pool} ' \
                  '--format=json'.format(pool=pool)
            pgs = json.loads(mon.sh(cmd))
            assert pg in pgs

            cmd = 'rados list-inconsistent-obj {pg} ' \
                  '--format=json'.format(pg=pg)
            objs = json.loads(mon.sh(cmd))
            assert len(objs['inconsistents']) == 1

            checker = InconsistentObjChecker(osd_id, acting, obj_name)
            inc_obj = objs['inconsistents'][0]
            log.info('inc = %r', inc_obj)
            checker.basic_checks(inc_obj)
            for check in checks:
                checker.run(check, inc_obj)

def task(ctx, config):
    """
    Test deep-scrub and repair of deliberately corrupted objects.

    The job running this task is expected to whitelist the cluster log
    messages the test provokes, for example:

    - deep-scrub 0 missing, 1 inconsistent objects
    - deep-scrub [0-9]+ errors
    - repair 0 missing, 1 inconsistent objects
    - repair [0-9]+ errors, [0-9]+ fixed
    - shard [0-9]+ .* : missing
    - deep-scrub 1 missing, 1 inconsistent objects
    - does not match object info size
    - attr name mistmatch
    - deep-scrub 1 missing, 0 inconsistent objects
    - failed to pick suitable auth object
    - candidate size [0-9]+ info size [0-9]+ mismatch

    and to set the following osd option in the ceph configuration:

        osd deep scrub update digest min age: 0
    """
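    # The "osd deep scrub update digest min age" option must be 0 so the
    # initial deep-scrub records the object digests right away.  A
    # hypothetical alternative to setting it in the job's conf would be to
    # inject it at runtime, mirroring the injectargs loop used later in this
    # task:
    #
    #   for i in range(num_osds):
    #       manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs', '--',
    #                               '--osd-deep-scrub-update-digest-min-age=0')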
    assert isinstance(config, dict), \
        'scrub_test task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    # expose the objectstore via FUSE so object files can be listed and
    # modified directly on the OSD hosts
    for i in range(num_osds):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs',
                                '--', '--osd-objectstore-fuse')
    manager.flush_pg_stats(range(num_osds))
    manager.wait_for_clean()

    osd_dump = manager.get_osd_dump_json()
    poolid = -1
    for p in osd_dump['pools']:
        if p['pool_name'] == 'rbd':
            poolid = p['pool']
            break
    assert poolid != -1

    # write some objects
    p = manager.do_rados(['bench', '--no-cleanup', '1', 'write', '-b', '4096'],
                         pool='rbd')
    log.info('err is %d' % p.exitstatus)

    # wait for some PG to have data that we can mess with
    pg, acting = wait_for_victim_pg(manager, poolid)
    osd = acting[0]

    osd_remote, obj_path, obj_name = find_victim_object(ctx, pg, osd)
    manager.do_rados(['setomapval', obj_name, 'key', 'val'], pool='rbd')
    log.info('err is %d' % p.exitstatus)
    manager.do_rados(['setomapheader', obj_name, 'hdr'], pool='rbd')
    log.info('err is %d' % p.exitstatus)

    # Update missing digests, requires "osd deep scrub update digest min age: 0"
    pgnum = get_pgnum(pg)
    manager.do_pg_scrub('rbd', pgnum, 'deep-scrub')

    log.info('messing with PG %s on osd %d' % (pg, osd))
    test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, 'rbd')
    test_repair_bad_omap(ctx, manager, pg, osd, obj_name)
    test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd,
                               obj_name, obj_path)
    log.info('test successful!')

    # shut down fuse mount
    for i in range(num_osds):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs',
                                '--', '--no-osd-objectstore-fuse')
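
# A minimal teuthology job referencing this task might look like the
# following (hypothetical sketch; the exact fragment layout depends on the
# suite it is embedded in):
#
#   tasks:
#   - install:
#   - ceph:
#       conf:
#         osd:
#           osd deep scrub update digest min age: 0
#   - scrub_test: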