# ceph/qa/tasks/osd_recovery.py
import logging
import time

from tasks import ceph_manager
from teuthology import misc as teuthology


log = logging.getLogger(__name__)
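
# Illustration only (assumed, not part of this file): a teuthology job yaml
# would reference these entry points by module or module.function, e.g.
#
#     tasks:
#     - ceph:
#     - osd_recovery:
#
# for task() on a 3-OSD cluster, or "- osd_recovery.test_incomplete_pgs:" for
# the 4-OSD variant below.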


def rados_start(testdir, remote, cmd):
    """
    Run a remote rados command (currently used to only write data)
    """
    log.info("rados %s" % ' '.join(cmd))
    pre = [
        'adjust-ulimits',
        'ceph-coverage',
        '{tdir}/archive/coverage'.format(tdir=testdir),
        'rados',
        ]
    pre.extend(cmd)
    # start the command without waiting so the caller can wait on the
    # returned process once the cluster has been perturbed
    proc = remote.run(
        args=pre,
        wait=False,
        )
    return proc
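
# Typical call pattern used by the tasks below (shown here for clarity):
#
#     p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '20', 'write', '-b', '4096'])
#     err = p.wait()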


def task(ctx, config):
    """
    Test (non-backfill) recovery
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    testdir = teuthology.get_testdir(ctx)
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    assert num_osds == 3

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)
    manager.flush_pg_stats([0, 1, 2])
    manager.wait_for_clean()

    # test some osdmap flags
    manager.raw_cluster_cmd('osd', 'set', 'noin')
    manager.raw_cluster_cmd('osd', 'set', 'noout')
    manager.raw_cluster_cmd('osd', 'set', 'noup')
    manager.raw_cluster_cmd('osd', 'set', 'nodown')
    manager.raw_cluster_cmd('osd', 'unset', 'noin')
    manager.raw_cluster_cmd('osd', 'unset', 'noout')
    manager.raw_cluster_cmd('osd', 'unset', 'noup')
    manager.raw_cluster_cmd('osd', 'unset', 'nodown')

    # write some new data
    p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '20', 'write', '-b', '4096',
                                   '--no-cleanup'])

    time.sleep(15)

    # trigger a divergent target:
    #  blackhole + restart osd.1 (shorter log)
    manager.blackhole_kill_osd(1)
    #  kill osd.2 (longer log... we'll make it divergent below)
    manager.kill_osd(2)
    time.sleep(2)
    manager.revive_osd(1)

    # wait for our writes to complete + succeed
    err = p.wait()
    log.info('err is %d' % err)

    # cluster must repeer
    manager.flush_pg_stats([0, 1])
    manager.wait_for_active_or_down()

    # write some more (make sure osd.2 really is divergent)
    p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096'])
    p.wait()

    # revive divergent osd
    manager.revive_osd(2)

    while len(manager.get_osd_status()['up']) < 3:
        log.info('waiting a bit...')
        time.sleep(2)
    log.info('3 are up!')

    # cluster must recover
    manager.flush_pg_stats([0, 1, 2])
    manager.wait_for_clean()


def test_incomplete_pgs(ctx, config):
    """
    Test handling of incomplete pgs.  Requires 4 osds.
    """
    testdir = teuthology.get_testdir(ctx)
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    assert num_osds == 4

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    while len(manager.get_osd_status()['up']) < 4:
        time.sleep(10)

    manager.flush_pg_stats([0, 1, 2, 3])
    manager.wait_for_clean()

    log.info('Testing incomplete pgs...')

    # delay recovery so the degraded state persists long enough for the
    # assertions below
    for i in range(4):
        manager.set_config(
            i,
            osd_recovery_delay_start=1000)

    # move data off of osd.0, osd.1
    manager.raw_cluster_cmd('osd', 'out', '0', '1')
    manager.flush_pg_stats([0, 1, 2, 3], [0, 1])
    manager.wait_for_clean()

    # lots of objects in rbd (no pg log, will backfill)
    p = rados_start(testdir, mon,
                    ['-p', 'rbd', 'bench', '20', 'write', '-b', '1',
                     '--no-cleanup'])
    p.wait()

    # few objects in rbd pool (with pg log, normal recovery)
    for f in range(1, 20):
        p = rados_start(testdir, mon, ['-p', 'rbd', 'put',
                                       'foo.%d' % f, '/etc/passwd'])
        p.wait()

    # move it back
    manager.raw_cluster_cmd('osd', 'in', '0', '1')
    manager.raw_cluster_cmd('osd', 'out', '2', '3')
    time.sleep(10)
    manager.flush_pg_stats([0, 1, 2, 3], [2, 3])
    time.sleep(10)
    manager.wait_for_active()

    assert not manager.is_clean()
    assert not manager.is_recovered()

    # kill 2 + 3
    log.info('stopping 2,3')
    manager.kill_osd(2)
    manager.kill_osd(3)
    manager.raw_cluster_cmd('osd', 'down', '2', '3')
    manager.flush_pg_stats([0, 1])
    manager.wait_for_active_or_down()

    assert manager.get_num_down() > 0

    # revive 2 + 3
    manager.revive_osd(2)
    manager.revive_osd(3)
    while len(manager.get_osd_status()['up']) < 4:
        log.info('waiting a bit...')
        time.sleep(2)
    log.info('all are up!')

    for i in range(4):
        manager.kick_recovery_wq(i)

    # cluster must recover
    manager.wait_for_clean()
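

# A minimal sketch, not part of the original task: the "wait for N OSDs to
# come up" loops above poll without a bound. A bounded variant could look
# like the helper below; the name, timeout, and delay are assumptions made
# here for illustration only.
def _wait_for_osds_up(manager, want, timeout=300, delay=5):
    """Poll until at least `want` OSDs report up, or raise after `timeout` seconds."""
    elapsed = 0
    while len(manager.get_osd_status()['up']) < want:
        if elapsed >= timeout:
            raise RuntimeError('gave up waiting for %d OSDs to come up' % want)
        log.info('waiting a bit...')
        time.sleep(delay)
        elapsed += delay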