git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/mgr/test_progress.py
import json
import logging
import time

from .mgr_test_case import MgrTestCase


log = logging.getLogger(__name__)
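# Note: this suite is normally run by teuthology as a qa task; for a local run
# against a vstart cluster, the usual invocation (assumed here, not part of
# this file) is something like:
#   python3 ../qa/tasks/vstart_runner.py tasks.mgr.test_progress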
class TestProgress(MgrTestCase):
    POOL = "progress_data"

    # How long we expect to wait at most between taking an OSD out
    # and seeing the progress event pop up.
    EVENT_CREATION_PERIOD = 15

    # How long to spend writing benchmark data into the test pool.
    WRITE_PERIOD = 30

    # Generous period for OSD recovery, should be same order of magnitude
    # to how long it took to write the data to begin with
    RECOVERY_PERIOD = WRITE_PERIOD * 4
    def _get_progress(self):
        out = self.mgr_cluster.mon_manager.raw_cluster_cmd("progress", "json")
        return json.loads(out)
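    # The helpers below only rely on a small part of that JSON structure,
    # roughly (illustrative sketch, not verbatim mgr output):
    #   {"events":    [{"id": ..., "message": "...", ...}],
    #    "completed": [{"id": ..., "message": "...", ...}]}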
    def _all_events(self):
        """
        To avoid racing on completion, we almost always want to look
        for events in the total list of active and complete, so
        munge them into a single list.
        """
        p = self._get_progress()
        log.info(json.dumps(p, indent=2))
        return p['events'] + p['completed']
    def _events_in_progress(self):
        """
        This function returns all events that are in progress.
        """
        p = self._get_progress()
        log.info(json.dumps(p, indent=2))
        return p['events']
    def _completed_events(self):
        """
        This function returns all events that are completed.
        """
        p = self._get_progress()
        log.info(json.dumps(p, indent=2))
        return p['completed']
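    # The progress module describes OSD in/out events with messages like
    # "Rebalancing after osd.0 marked out"; the two predicates below key
    # off the message suffix.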
    def is_osd_marked_out(self, ev):
        return ev['message'].endswith('marked out')

    def is_osd_marked_in(self, ev):
        return ev['message'].endswith('marked in')
    def _get_osd_in_out_events(self, marked='both'):
        """
        Return the events that deal with OSDs being
        marked in, out, or both.
        """
        marked_in_events = []
        marked_out_events = []

        events_in_progress = self._events_in_progress()
        for ev in events_in_progress:
            if self.is_osd_marked_out(ev):
                marked_out_events.append(ev)
            elif self.is_osd_marked_in(ev):
                marked_in_events.append(ev)

        if marked == 'both':
            return [marked_in_events] + [marked_out_events]
        elif marked == 'in':
            return marked_in_events
        else:
            return marked_out_events
    def _osd_in_out_events_count(self, marked='both'):
        """
        Count the number of ongoing recovery events that deal with
        OSDs being marked in, out, or both.
        """
        events_in_progress = self._events_in_progress()
        marked_in_count = 0
        marked_out_count = 0

        for ev in events_in_progress:
            if self.is_osd_marked_out(ev):
                marked_out_count += 1
            elif self.is_osd_marked_in(ev):
                marked_in_count += 1

        if marked == 'both':
            return marked_in_count + marked_out_count
        elif marked == 'in':
            return marked_in_count
        else:
            return marked_out_count
    def _setup_pool(self, size=None):
        self.mgr_cluster.mon_manager.create_pool(self.POOL)
        if size is not None:
            self.mgr_cluster.mon_manager.raw_cluster_cmd(
                'osd', 'pool', 'set', self.POOL, 'size', str(size))
    def _osd_in_out_completed_events_count(self, marked='both'):
        """
        Count the number of completed recovery events that deal with
        OSDs being marked in, out, or both.
        """
        completed_events = self._completed_events()
        marked_in_count = 0
        marked_out_count = 0

        for ev in completed_events:
            if self.is_osd_marked_out(ev):
                marked_out_count += 1
            elif self.is_osd_marked_in(ev):
                marked_in_count += 1

        if marked == 'both':
            return marked_in_count + marked_out_count
        elif marked == 'in':
            return marked_in_count
        else:
            return marked_out_count
    def _write_some_data(self, t):
        """
        To adapt to test systems of varying performance, we write
        data for a defined time period, rather than to a defined
        capacity. This will hopefully result in a similar timescale
        for PG recovery after an OSD failure.
        """
        args = [
            "rados", "-p", self.POOL, "bench", str(t), "write", "-t", "16"]

        self.mgr_cluster.admin_remote.run(args=args, wait=True)
    def _osd_count(self):
        osd_map = self.mgr_cluster.mon_manager.get_osd_dump_json()
        return len(osd_map['osds'])
    def setUp(self):
        super(TestProgress, self).setUp()
        # Ensure we have at least four OSDs
        if self._osd_count() < 4:
            self.skipTest("Not enough OSDs!")

        # Remove any filesystems so that we can remove their pools
        if self.mds_cluster:
            self.mds_cluster.mds_stop()
            self.mds_cluster.mds_fail()
            self.mds_cluster.delete_all_filesystems()

        # Remove all other pools
        for pool in self.mgr_cluster.mon_manager.get_osd_dump_json()['pools']:
            self.mgr_cluster.mon_manager.remove_pool(pool['pool_name'])

        self._load_module("progress")
        self.mgr_cluster.mon_manager.raw_cluster_cmd('progress', 'clear')
    def _simulate_failure(self, osd_ids=None):
        """
        Common lead-in to several tests: get some data in the cluster,
        then mark an OSD out to trigger the start of a progress event.

        Return the JSON representation of the failure event.
        """
        if osd_ids is None:
            osd_ids = [0]

        self._setup_pool()
        self._write_some_data(self.WRITE_PERIOD)

        for osd_id in osd_ids:
            self.mgr_cluster.mon_manager.raw_cluster_cmd(
                'osd', 'out', str(osd_id))

        # Wait for a progress event to pop up
        self.wait_until_equal(lambda: self._osd_in_out_events_count('out'), 1,
                              timeout=self.EVENT_CREATION_PERIOD*2,
                              period=1)
        ev = self._get_osd_in_out_events('out')[0]
        log.info(json.dumps(ev, indent=1))
        self.assertIn("Rebalancing after osd.0 marked out", ev['message'])

        return ev
    def _simulate_back_in(self, osd_ids, initial_event):
        for osd_id in osd_ids:
            self.mgr_cluster.mon_manager.raw_cluster_cmd(
                'osd', 'in', str(osd_id))

        # First event should complete promptly
        self.wait_until_true(lambda: self._is_complete(initial_event['id']),
                             timeout=self.EVENT_CREATION_PERIOD)

        try:
            # Wait for progress event marked in to pop up
            self.wait_until_equal(lambda: self._osd_in_out_events_count('in'), 1,
                                  timeout=self.EVENT_CREATION_PERIOD*2,
                                  period=1)
        except RuntimeError as ex:
            if "Timed out after" not in str(ex):
                raise ex

            log.info("There were no PGs affected by the OSD being marked in")
            return None

        new_event = self._get_osd_in_out_events('in')[0]
        return new_event
    def _no_events_anywhere(self):
        """
        Whether there are no live or completed events at all.
        """
        p = self._get_progress()
        total_events = len(p['events']) + len(p['completed'])
        return total_events == 0
    def _is_quiet(self):
        """
        Whether no progress events are currently live.
        """
        return len(self._get_progress()['events']) == 0
    def _is_complete(self, ev_id):
        # An event must be either live or completed, never both.
        progress = self._get_progress()
        live_ids = [ev['id'] for ev in progress['events']]
        complete_ids = [ev['id'] for ev in progress['completed']]
        if ev_id in complete_ids:
            assert ev_id not in live_ids
            return True
        else:
            assert ev_id in live_ids
            return False
    def tearDown(self):
        # Clean up the test pool and bring any out'd OSDs back in.
        if self.POOL in self.mgr_cluster.mon_manager.pools:
            self.mgr_cluster.mon_manager.remove_pool(self.POOL)

        osd_map = self.mgr_cluster.mon_manager.get_osd_dump_json()
        for osd in osd_map['osds']:
            if osd['weight'] == 0.0:
                self.mgr_cluster.mon_manager.raw_cluster_cmd(
                    'osd', 'in', str(osd['osd']))

        super(TestProgress, self).tearDown()
    def test_osd_healthy_recovery(self):
        """
        The simple recovery case: an OSD goes down, its PGs get a new
        placement, and we wait for the PGs to get healthy in their new
        locations.
        """
        ev = self._simulate_failure()

        # Wait for progress event to ultimately reach completion
        self.wait_until_true(lambda: self._is_complete(ev['id']),
                             timeout=self.RECOVERY_PERIOD)
        self.assertEqual(self._osd_in_out_events_count(), 0)
    def test_pool_removal(self):
        """
        That a pool removed during OSD recovery causes the
        progress event to be correctly marked complete once there
        is no more data to move.
        """
        ev = self._simulate_failure()

        self.mgr_cluster.mon_manager.remove_pool(self.POOL)

        # Event should complete promptly
        self.wait_until_true(lambda: self._is_complete(ev['id']),
                             timeout=self.EVENT_CREATION_PERIOD)
        self.assertEqual(self._osd_in_out_events_count(), 0)
    def test_osd_came_back(self):
        """
        When a recovery is underway, but then the out OSD
        comes back in, such that recovery is no longer necessary.
        This should create another event for the OSD being marked in
        and cancel the one that is still ongoing.
        """
        ev1 = self._simulate_failure()

        ev2 = self._simulate_back_in([0], ev1)

        # Wait for progress event to ultimately complete
        self.wait_until_true(lambda: self._is_complete(ev2['id']),
                             timeout=self.RECOVERY_PERIOD)

        self.assertEqual(self._osd_in_out_events_count(), 0)
    def test_osd_cannot_recover(self):
        """
        When the cluster cannot recover from a lost OSD, e.g.
        because there is no suitable new placement for it
        (a size=3 pool when there are only 2 OSDs left, or
        a size=3 pool when the remaining OSDs are only on 2 hosts),
        a progress event should not be created.
        """
        pool_size = 3

        self._setup_pool(size=pool_size)
        self._write_some_data(self.WRITE_PERIOD)

        # Fail enough OSDs so that fewer than N_replicas OSDs remain available
        osd_count = self._osd_count()

        # First do some failures that will result in a normal rebalance
        # (Assumption: we're in a test environment that is configured
        # not to require replicas be on different hosts, like teuthology)
        for osd_id in range(0, osd_count - pool_size):
            self.mgr_cluster.mon_manager.raw_cluster_cmd(
                'osd', 'out', str(osd_id))

        # We should see an event for each of the OSDs we took out
        self.wait_until_equal(
            lambda: self._osd_in_out_events_count('out'),
            osd_count - pool_size,
            timeout=self.EVENT_CREATION_PERIOD*(osd_count - pool_size))

        # Those should complete cleanly
        self.wait_until_equal(
            lambda: self._osd_in_out_completed_events_count('out'),
            osd_count - pool_size,
            timeout=self.RECOVERY_PERIOD*(osd_count - pool_size))

        # Fail one last OSD, at the point the PGs have nowhere to go
        victim_osd = osd_count - pool_size
        self.mgr_cluster.mon_manager.raw_cluster_cmd(
            'osd', 'out', str(victim_osd))

        # Check that no new event is created
        time.sleep(self.EVENT_CREATION_PERIOD)

        self.assertEqual(
            self._osd_in_out_completed_events_count('out'),
            osd_count - pool_size)
    def test_turn_off_module(self):
        """
        When the module is turned off, there should not be any
        ongoing or completed events.
        The module should also not accept any kind of remote event
        coming in from other modules; however, once it is turned
        back on, creating an event should work as before.
        """
        pool_size = 3
        self._setup_pool(size=pool_size)
        self._write_some_data(self.WRITE_PERIOD)

        self.mgr_cluster.mon_manager.raw_cluster_cmd("progress", "off")

        self.mgr_cluster.mon_manager.raw_cluster_cmd(
            'osd', 'out', '0')

        time.sleep(self.EVENT_CREATION_PERIOD)

        self.mgr_cluster.mon_manager.raw_cluster_cmd(
            'osd', 'in', '0')

        time.sleep(self.EVENT_CREATION_PERIOD)

        self.assertTrue(self._no_events_anywhere())

        self.mgr_cluster.mon_manager.raw_cluster_cmd("progress", "on")

        self._write_some_data(self.WRITE_PERIOD)

        self.mgr_cluster.mon_manager.raw_cluster_cmd(
            'osd', 'out', '0')

        # Wait for a progress event to pop up
        self.wait_until_equal(lambda: self._osd_in_out_events_count('out'), 1,
                              timeout=self.EVENT_CREATION_PERIOD*2,
                              period=1)

        ev1 = self._get_osd_in_out_events('out')[0]

        log.info(json.dumps(ev1, indent=1))

        self.wait_until_true(lambda: self._is_complete(ev1['id']),
                             timeout=self.RECOVERY_PERIOD)
        self.assertTrue(self._is_quiet())