1 """
2 osd recovery
3 """
4 import logging
5 import ceph_manager
6 import time
7 from teuthology import misc as teuthology
8
9
10 log = logging.getLogger(__name__)
11
12
13 def rados_start(testdir, remote, cmd):
14 """
15 Run a remote rados command (currently used to only write data)
16 """
17 log.info("rados %s" % ' '.join(cmd))
18 pre = [
19 'adjust-ulimits',
20 'ceph-coverage',
21 '{tdir}/archive/coverage'.format(tdir=testdir),
22 'rados',
23 ];
24 pre.extend(cmd)
25 proc = remote.run(
26 args=pre,
27 wait=False,
28 )
29 return proc
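
# For illustration (not executed): a call such as
#   rados_start(testdir, mon, ['-p', 'rbd', 'bench', '20', 'write'])
# runs, without waiting, roughly
#   adjust-ulimits ceph-coverage <testdir>/archive/coverage \
#       rados -p rbd bench 20 write
# on the remote and returns the in-flight process handle.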

def task(ctx, config):
    """
    Test (non-backfill) recovery
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    testdir = teuthology.get_testdir(ctx)
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    assert num_osds == 3

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)
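    # flush_pg_stats makes each listed OSD push its latest pg stats to the
    # monitor, so the wait_for_clean() check below sees current state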
    manager.flush_pg_stats([0, 1, 2])
    manager.wait_for_clean()

    # test some osdmap flags
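    # (noin/noout keep OSDs from being automatically marked in/out of the
    # data distribution; noup/nodown keep them from being marked up/down.
    # Here we only verify that each flag can be set and cleared.)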
    manager.raw_cluster_cmd('osd', 'set', 'noin')
    manager.raw_cluster_cmd('osd', 'set', 'noout')
    manager.raw_cluster_cmd('osd', 'set', 'noup')
    manager.raw_cluster_cmd('osd', 'set', 'nodown')
    manager.raw_cluster_cmd('osd', 'unset', 'noin')
    manager.raw_cluster_cmd('osd', 'unset', 'noout')
    manager.raw_cluster_cmd('osd', 'unset', 'noup')
    manager.raw_cluster_cmd('osd', 'unset', 'nodown')

    # write some new data
    p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '20', 'write', '-b', '4096',
                                   '--no-cleanup'])

    # let the bench get some writes in flight (it runs for 20 seconds in
    # total) before we start killing OSDs underneath it
    time.sleep(15)

    # trigger a divergent target:
    # blackhole + restart osd.1 (shorter log)
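    # (blackhole_kill_osd makes osd.1's objectstore drop writes before the
    # daemon is killed, so its pg log ends earlier than osd.2's; that is
    # what lets osd.2 become divergent below)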
    manager.blackhole_kill_osd(1)
    # kill osd.2 (longer log... we'll make it divergent below)
    manager.kill_osd(2)
    time.sleep(2)
    manager.revive_osd(1)

    # wait for our writes to complete + succeed
    err = p.wait()
    log.info('err is %d' % err)

    # cluster must re-peer
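    # (osd.2 is still down at this point, so we can only wait for pgs to go
    # active or down, not for the cluster to become clean)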
    manager.flush_pg_stats([0, 1])
    manager.wait_for_active_or_down()

    # write some more (make sure osd.2 really is divergent)
    p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096'])
    p.wait()

    # revive divergent osd
    manager.revive_osd(2)

    while len(manager.get_osd_status()['up']) < 3:
        log.info('waiting a bit...')
        time.sleep(2)
    log.info('3 are up!')

    # cluster must recover
    manager.flush_pg_stats([0, 1, 2])
    manager.wait_for_clean()


def test_incomplete_pgs(ctx, config):
    """
    Test handling of incomplete pgs. Requires 4 osds.
    """
    testdir = teuthology.get_testdir(ctx)
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    assert num_osds == 4

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    while len(manager.get_osd_status()['up']) < 4:
        time.sleep(10)

    manager.flush_pg_stats([0, 1, 2, 3])
    manager.wait_for_clean()

    log.info('Testing incomplete pgs...')

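    # delay the start of recovery on every OSD so that, when we shuffle data
    # around below, pgs stay unrecovered long enough for the checks to see
    # them (osd_recovery_delay_start is the number of seconds an OSD waits
    # after peering before it begins recovery)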
    for i in range(4):
        manager.set_config(
            i,
            osd_recovery_delay_start=1000)

    # move data off of osd.0, osd.1
    manager.raw_cluster_cmd('osd', 'out', '0', '1')
    manager.flush_pg_stats([0, 1, 2, 3], [0, 1])
    manager.wait_for_clean()

    # lots of objects in rbd (no pg log, will backfill)
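    # (20 seconds of 1-byte writes creates far more objects than the pg log
    # covers, so these pgs can only be backfilled; the handful of puts below
    # stays within the log and can use normal log-based recovery)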
    p = rados_start(testdir, mon,
                    ['-p', 'rbd', 'bench', '20', 'write', '-b', '1',
                     '--no-cleanup'])
    p.wait()

    # few objects in rbd pool (with pg log, normal recovery)
    for f in range(1, 20):
        p = rados_start(testdir, mon, ['-p', 'rbd', 'put',
                                       'foo.%d' % f, '/etc/passwd'])
        p.wait()

    # move it back
    manager.raw_cluster_cmd('osd', 'in', '0', '1')
    manager.raw_cluster_cmd('osd', 'out', '2', '3')
    time.sleep(10)
    manager.flush_pg_stats([0, 1, 2, 3], [2, 3])
    time.sleep(10)
    manager.wait_for_active()

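    # recovery is still held off by osd_recovery_delay_start, so the pgs
    # should be active but neither clean nor fully recovered yet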
    assert not manager.is_clean()
    assert not manager.is_recovered()

    # kill 2 + 3
    log.info('stopping 2,3')
    manager.kill_osd(2)
    manager.kill_osd(3)
    log.info('...')
    manager.raw_cluster_cmd('osd', 'down', '2', '3')
    manager.flush_pg_stats([0, 1])
    manager.wait_for_active_or_down()

    assert manager.get_num_down() > 0

    # revive 2 + 3
    manager.revive_osd(2)
    manager.revive_osd(3)
    while len(manager.get_osd_status()['up']) < 4:
        log.info('waiting a bit...')
        time.sleep(2)
    log.info('all are up!')

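    # prod each OSD's recovery work queue so the recovery we delayed with
    # osd_recovery_delay_start above actually starts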
    for i in range(4):
        manager.kick_recovery_wq(i)

    # cluster must recover
    manager.wait_for_clean()