ceph/qa/tasks/mgr/test_progress.py (ceph.git, 15.2.9)

import json
import logging
import time

from tasks.mgr.mgr_test_case import MgrTestCase


log = logging.getLogger(__name__)


class TestProgress(MgrTestCase):
    POOL = "progress_data"

    # How long we expect to wait at most between taking an OSD out
    # and seeing the progress event pop up.
    EVENT_CREATION_PERIOD = 5

    WRITE_PERIOD = 30

    # Generous period for OSD recovery; should be the same order of
    # magnitude as how long it took to write the data to begin with.
    RECOVERY_PERIOD = WRITE_PERIOD * 4
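    # With the defaults above, that allows 4 * 30 = 120 seconds for recovery.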

    def _get_progress(self):
        out = self.mgr_cluster.mon_manager.raw_cluster_cmd("progress", "json")
        return json.loads(out)

    def _all_events(self):
        """
        To avoid racing on completion, we almost always want to look
        for events in the total list of active and complete, so
        munge them into a single list.
        """
        p = self._get_progress()
        log.info(json.dumps(p, indent=2))
        return p['events'] + p['completed']

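    # The "progress json" output consumed by these tests is assumed to look
    # roughly like:
    #   {"events": [{"id": ..., "message": ..., ...}, ...],
    #    "completed": [...]}
    # Only the 'events'/'completed' lists and the 'id'/'message' fields are
    # relied on here; the module may emit additional keys.
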
    def _events_in_progress(self):
        """
        Return all events that are currently in progress.
        """
        p = self._get_progress()
        log.info(json.dumps(p, indent=2))
        return p['events']

    def _completed_events(self):
        """
        Return all events that have completed.
        """
        p = self._get_progress()
        log.info(json.dumps(p, indent=2))
        return p['completed']

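    # The helpers below only match on the message suffix; the progress
    # module's OSD events carry human-readable messages such as
    # "Rebalancing after osd.0 marked out" (asserted in _simulate_failure).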
    def is_osd_marked_out(self, ev):
        return ev['message'].endswith('marked out')

    def is_osd_marked_in(self, ev):
        return ev['message'].endswith('marked in')

    def _get_osd_in_out_events(self, marked='both'):
        """
        Return the events that deal with OSDs being
        marked in, out or both.
        """

        marked_in_events = []
        marked_out_events = []

        events_in_progress = self._events_in_progress()
        for ev in events_in_progress:
            if self.is_osd_marked_out(ev):
                marked_out_events.append(ev)
            elif self.is_osd_marked_in(ev):
                marked_in_events.append(ev)

        if marked == 'both':
            return [marked_in_events] + [marked_out_events]
        elif marked == 'in':
            return marked_in_events
        else:
            return marked_out_events

    def _osd_in_out_completed_events_count(self, marked='both'):
        """
        Count the completed events that deal with OSDs being
        marked in, out or both.
        """
        marked_in_count = 0
        marked_out_count = 0

        for ev in self._completed_events():
            if self.is_osd_marked_out(ev):
                marked_out_count += 1
            elif self.is_osd_marked_in(ev):
                marked_in_count += 1

        if marked == 'both':
            return marked_in_count + marked_out_count
        elif marked == 'in':
            return marked_in_count
        else:
            return marked_out_count

    def _osd_in_out_events_count(self, marked='both'):
        """
        Count the number of ongoing recovery events that deal with
        OSDs being marked in, out or both.
        """
        events_in_progress = self._events_in_progress()
        marked_in_count = 0
        marked_out_count = 0

        for ev in events_in_progress:
            if self.is_osd_marked_out(ev):
                marked_out_count += 1
            elif self.is_osd_marked_in(ev):
                marked_in_count += 1

        if marked == 'both':
            return marked_in_count + marked_out_count
        elif marked == 'in':
            return marked_in_count
        else:
            return marked_out_count

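    # A pool's "size" is its replica count; test_osd_cannot_recover() relies
    # on size=3 so that, once too few OSDs remain, PGs have nowhere to go.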
    def _setup_pool(self, size=None):
        self.mgr_cluster.mon_manager.create_pool(self.POOL)
        if size is not None:
            self.mgr_cluster.mon_manager.raw_cluster_cmd(
                'osd', 'pool', 'set', self.POOL, 'size', str(size))

    def _write_some_data(self, t):
        """
        To adapt to test systems of varying performance, we write
        data for a defined time period, rather than to a defined
        capacity. This will hopefully result in a similar timescale
        for PG recovery after an OSD failure.
        """

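        # "rados bench <seconds> write" runs a timed write benchmark against
        # the pool; "-t 16" requests 16 concurrent operations.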
        args = [
            "rados", "-p", self.POOL, "bench", str(t), "write", "-t", "16"]

        self.mgr_cluster.admin_remote.run(args=args, wait=True)

    def _osd_count(self):
        osd_map = self.mgr_cluster.mon_manager.get_osd_dump_json()
        return len(osd_map['osds'])

    def setUp(self):
        super(TestProgress, self).setUp()
        # Ensure we have at least four OSDs
        if self._osd_count() < 4:
            self.skipTest("Not enough OSDs!")

        # Remove any filesystems so that we can remove their pools
        if self.mds_cluster:
            self.mds_cluster.mds_stop()
            self.mds_cluster.mds_fail()
            self.mds_cluster.delete_all_filesystems()

        # Remove all other pools
        for pool in self.mgr_cluster.mon_manager.get_osd_dump_json()['pools']:
            self.mgr_cluster.mon_manager.remove_pool(pool['pool_name'])

        self._load_module("progress")
        self.mgr_cluster.mon_manager.raw_cluster_cmd('progress', 'clear')

    def _simulate_failure(self, osd_ids=None):
        """
        Common lead-in to several tests: get some data in the cluster,
        then mark an OSD out to trigger the start of a progress event.

        Return the JSON representation of the failure event.
        """

        if osd_ids is None:
            osd_ids = [0]

        self._setup_pool()
        self._write_some_data(self.WRITE_PERIOD)

        for osd_id in osd_ids:
            self.mgr_cluster.mon_manager.raw_cluster_cmd(
                'osd', 'out', str(osd_id))

        # Wait for a progress event to pop up
        self.wait_until_equal(lambda: self._osd_in_out_events_count('out'), 1,
                              timeout=self.EVENT_CREATION_PERIOD*2,
                              period=1)
        ev = self._get_osd_in_out_events('out')[0]
        log.info(json.dumps(ev, indent=1))
        self.assertIn("Rebalancing after osd.0 marked out", ev['message'])

        return ev

    def _simulate_back_in(self, osd_ids, initial_event):

        for osd_id in osd_ids:
            self.mgr_cluster.mon_manager.raw_cluster_cmd(
                'osd', 'in', str(osd_id))

        # First event should complete promptly
        self.wait_until_true(lambda: self._is_complete(initial_event['id']),
                             timeout=self.EVENT_CREATION_PERIOD)

        try:
            # Wait for a "marked in" progress event to pop up
            self.wait_until_equal(lambda: self._osd_in_out_events_count('in'), 1,
                                  timeout=self.EVENT_CREATION_PERIOD*2,
                                  period=1)
        except RuntimeError as ex:
            if "Timed out after" not in str(ex):
                raise ex

            log.info("There were no PGs affected by the OSD being marked in")
            return None

        new_event = self._get_osd_in_out_events('in')[0]
        return new_event

    def _no_events_anywhere(self):
        """
        Return True if there are no live or completed events.
        """
        p = self._get_progress()
        total_events = len(p['events']) + len(p['completed'])
        return total_events == 0

    def _is_quiet(self):
        """
        Return True if no progress events are live.
        """
        return len(self._get_progress()['events']) == 0

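    # An event id is expected to live in exactly one of 'events' (still
    # running) or 'completed'; the assertions below catch it appearing in
    # both lists or in neither.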
    def _is_complete(self, ev_id):
        progress = self._get_progress()
        live_ids = [ev['id'] for ev in progress['events']]
        complete_ids = [ev['id'] for ev in progress['completed']]
        if ev_id in complete_ids:
            assert ev_id not in live_ids
            return True
        else:
            assert ev_id in live_ids
            return False

    def tearDown(self):
        if self.POOL in self.mgr_cluster.mon_manager.pools:
            self.mgr_cluster.mon_manager.remove_pool(self.POOL)

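        # Marking an OSD "out" sets its weight to 0 in the OSD map, so any
        # OSD this test left out is marked back in here to restore the
        # cluster for subsequent tests.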
        osd_map = self.mgr_cluster.mon_manager.get_osd_dump_json()
        for osd in osd_map['osds']:
            if osd['weight'] == 0.0:
                self.mgr_cluster.mon_manager.raw_cluster_cmd(
                    'osd', 'in', str(osd['osd']))

        super(TestProgress, self).tearDown()

    def test_osd_healthy_recovery(self):
        """
        The simple recovery case: an OSD goes down, its PGs get a new
        placement, and we wait for the PGs to become healthy in their
        new locations.
        """
        ev = self._simulate_failure()

        # Wait for progress event to ultimately reach completion
        self.wait_until_true(lambda: self._is_complete(ev['id']),
                             timeout=self.RECOVERY_PERIOD)
        self.assertTrue(self._is_quiet())

    def test_pool_removal(self):
        """
        That a pool removed during OSD recovery causes the
        progress event to be correctly marked complete once there
        is no more data to move.
        """
        ev = self._simulate_failure()

        self.mgr_cluster.mon_manager.remove_pool(self.POOL)

        # Event should complete promptly
        self.wait_until_true(lambda: self._is_complete(ev['id']),
                             timeout=self.EVENT_CREATION_PERIOD)
        self.assertTrue(self._is_quiet())

    def test_osd_came_back(self):
        """
        When a recovery is underway but the out OSD then comes back in,
        so that recovery is no longer necessary, the module should create
        another event for the OSD being marked in and cancel the one that
        is still ongoing.
        """
        ev1 = self._simulate_failure()

        ev2 = self._simulate_back_in([0], ev1)

        if ev2 is not None:
            # Wait for progress event to ultimately complete
            self.wait_until_true(lambda: self._is_complete(ev2['id']),
                                 timeout=self.RECOVERY_PERIOD)

        self.assertTrue(self._is_quiet())

    def test_osd_cannot_recover(self):
        """
        When the cluster cannot recover from a lost OSD, e.g. because
        there is no suitable new placement for it (a size=3 pool when
        there are only 2 OSDs left, or a size=3 pool when the remaining
        OSDs are spread across only 2 hosts), no progress event should
        be created.
        """

        pool_size = 3

        self._setup_pool(size=pool_size)
        self._write_some_data(self.WRITE_PERIOD)

        # Fail enough OSDs so that fewer than N_replicas OSDs
        # remain available.
        osd_count = self._osd_count()
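        # For example, on the minimum 4-OSD cluster required by setUp(),
        # osd_count - pool_size == 1: only osd.0 is marked out by the loop
        # below, and osd.1 becomes the final "victim" OSD.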

        # First do some failures that will result in a normal rebalance
        # (Assumption: we're in a test environment that is configured
        # not to require replicas to be on different hosts, like teuthology)
        for osd_id in range(0, osd_count - pool_size):
            self.mgr_cluster.mon_manager.raw_cluster_cmd(
                'osd', 'out', str(osd_id))

        # We should see an event for each of the OSDs we took out
        self.wait_until_equal(
            lambda: len(self._all_events()),
            osd_count - pool_size,
            timeout=self.EVENT_CREATION_PERIOD)

        # Those should complete cleanly
        self.wait_until_true(
            lambda: self._is_quiet(),
            timeout=self.RECOVERY_PERIOD
        )

        # Fail one last OSD; at this point the PGs have nowhere to go
        victim_osd = osd_count - pool_size
        self.mgr_cluster.mon_manager.raw_cluster_cmd(
            'osd', 'out', str(victim_osd))

        # Check that no event is created
        time.sleep(self.EVENT_CREATION_PERIOD)

        self.assertEqual(
            self._osd_in_out_completed_events_count('out'),
            osd_count - pool_size)

    def test_turn_off_module(self):
        """
        When the module is turned off there should be no ongoing or
        completed events, and the module should not accept any remote
        events coming in from other modules. However, once it is
        turned back on, creating an event should work as before.
        """

        pool_size = 3
        self._setup_pool(size=pool_size)
        self._write_some_data(self.WRITE_PERIOD)

        self.mgr_cluster.mon_manager.raw_cluster_cmd("progress", "off")

        self.mgr_cluster.mon_manager.raw_cluster_cmd(
            'osd', 'out', '0')

        time.sleep(self.EVENT_CREATION_PERIOD)

        self.mgr_cluster.mon_manager.raw_cluster_cmd(
            'osd', 'in', '0')

        time.sleep(self.EVENT_CREATION_PERIOD)

        self.assertTrue(self._no_events_anywhere())

        self.mgr_cluster.mon_manager.raw_cluster_cmd("progress", "on")

        self._write_some_data(self.WRITE_PERIOD)

        self.mgr_cluster.mon_manager.raw_cluster_cmd(
            'osd', 'out', '0')

        # Wait for a progress event to pop up
        self.wait_until_equal(lambda: self._osd_in_out_events_count('out'), 1,
                              timeout=self.EVENT_CREATION_PERIOD*2,
                              period=1)

        ev1 = self._get_osd_in_out_events('out')[0]

        log.info(json.dumps(ev1, indent=1))

        self.wait_until_true(lambda: self._is_complete(ev1['id']),
                             timeout=self.RECOVERY_PERIOD)
        self.assertTrue(self._is_quiet())