1"""
2osd recovery
3"""
import logging
import ceph_manager
import time
from teuthology import misc as teuthology


log = logging.getLogger(__name__)


def rados_start(testdir, remote, cmd):
    """
    Run a remote rados command (currently used only to write data)
    """
    log.info("rados %s" % ' '.join(cmd))
    pre = [
        'adjust-ulimits',
        'ceph-coverage',
        '{tdir}/archive/coverage'.format(tdir=testdir),
        'rados',
        ]
    pre.extend(cmd)
    proc = remote.run(
        args=pre,
        wait=False,
        )
    return proc

def task(ctx, config):
    """
    Test (non-backfill) recovery
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    testdir = teuthology.get_testdir(ctx)
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    assert num_osds == 3

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

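    # wait for all three OSDs to come up, flush their PG stats to the mon so
    # the reported cluster state is current, then wait for the PGs to go clean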
    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_clean()

    # test some osdmap flags
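    # (noin/noout/noup/nodown suppress the automatic in/out/up/down state
    # transitions; here we only check that they can be set and unset)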
    manager.raw_cluster_cmd('osd', 'set', 'noin')
    manager.raw_cluster_cmd('osd', 'set', 'noout')
    manager.raw_cluster_cmd('osd', 'set', 'noup')
    manager.raw_cluster_cmd('osd', 'set', 'nodown')
    manager.raw_cluster_cmd('osd', 'unset', 'noin')
    manager.raw_cluster_cmd('osd', 'unset', 'noout')
    manager.raw_cluster_cmd('osd', 'unset', 'noup')
    manager.raw_cluster_cmd('osd', 'unset', 'nodown')

    # write some new data
    p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '20', 'write', '-b', '4096',
                                   '--no-cleanup'])

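    # let the bench run for a while so the PGs accumulate log entries before
    # we start interfering with the OSDs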
    time.sleep(15)

    # trigger a divergent target:
    #  blackhole + restart osd.1 (shorter log)
    manager.blackhole_kill_osd(1)
    #  kill osd.2 (longer log... we'll make it divergent below)
    manager.kill_osd(2)
    time.sleep(2)
    manager.revive_osd(1)

    # wait for our writes to complete + succeed
    err = p.wait()
    log.info('err is %d' % err)

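    # osd.2 is still down at this point, so the PGs cannot go clean yet; just
    # wait for them to peer and report active (or down)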
    # cluster must repeer
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_active_or_down()

    # write some more (make sure osd.2 really is divergent)
    p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096'])
    p.wait()

    # revive divergent osd
    manager.revive_osd(2)

    while len(manager.get_osd_status()['up']) < 3:
        log.info('waiting a bit...')
        time.sleep(2)
    log.info('3 are up!')

    # cluster must recover
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_clean()


def test_incomplete_pgs(ctx, config):
    """
    Test handling of incomplete pgs. Requires 4 osds.
    """
    testdir = teuthology.get_testdir(ctx)
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    assert num_osds == 4

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    while len(manager.get_osd_status()['up']) < 4:
        time.sleep(10)

    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
    manager.wait_for_clean()

    log.info('Testing incomplete pgs...')

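    # push osd_recovery_delay_start way up so that, once we start shuffling
    # data around below, recovery does not kick in before we can observe the
    # degraded state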
    for i in range(4):
        manager.set_config(
            i,
            osd_recovery_delay_start=1000)

    # move data off of osd.0, osd.1
    manager.raw_cluster_cmd('osd', 'out', '0', '1')
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
    manager.wait_for_clean()

    # lots of objects in rbd (no pg log, will backfill)
    p = rados_start(testdir, mon,
                    ['-p', 'rbd', 'bench', '20', 'write', '-b', '1',
                     '--no-cleanup'])
    p.wait()

    # few objects in rbd pool (with pg log, normal recovery)
    for f in range(1, 20):
        p = rados_start(testdir, mon, ['-p', 'rbd', 'put',
                                       'foo.%d' % f, '/etc/passwd'])
        p.wait()

    # move it back
    manager.raw_cluster_cmd('osd', 'in', '0', '1')
    manager.raw_cluster_cmd('osd', 'out', '2', '3')
    time.sleep(10)
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
    time.sleep(10)
    manager.wait_for_active()

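    # recovery is still delayed, so the data written while osd.0/1 were out
    # has not moved back yet; the cluster should be active but neither clean
    # nor fully recovered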
    assert not manager.is_clean()
    assert not manager.is_recovered()

    # kill 2 + 3
    log.info('stopping 2,3')
    manager.kill_osd(2)
    manager.kill_osd(3)
    log.info('...')
    manager.raw_cluster_cmd('osd', 'down', '2', '3')
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_active_or_down()

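    # osd.2 and osd.3 held the only current copies of the data written while
    # osd.0/1 were out, so with both of them down some PGs should now be down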
    assert manager.get_num_down() > 0

    # revive 2 + 3
    manager.revive_osd(2)
    manager.revive_osd(3)
    while len(manager.get_osd_status()['up']) < 4:
        log.info('waiting a bit...')
        time.sleep(2)
    log.info('all are up!')

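    # recovery was deliberately delayed above; kick the recovery work queue on
    # each OSD so recovery can proceed now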
    for i in range(4):
        manager.kick_recovery_wq(i)

    # cluster must recover
    manager.wait_for_clean()