# ceph/qa/tasks/mgr/test_progress.py

import json
import logging
import time

from mgr_test_case import MgrTestCase


log = logging.getLogger(__name__)


class TestProgress(MgrTestCase):
    POOL = "progress_data"

    # How long we expect to wait at most between taking an OSD out
    # and seeing the progress event pop up.
    EVENT_CREATION_PERIOD = 5

    # How long to spend writing test data, in seconds.  The exact value here
    # is an assumption; anything long enough to generate some backfill work
    # will do.
    WRITE_PERIOD = 30

    # Generous period for OSD recovery, should be the same order of magnitude
    # as how long it took to write the data to begin with.
    RECOVERY_PERIOD = WRITE_PERIOD * 4

    def _get_progress(self):
        out = self.mgr_cluster.mon_manager.raw_cluster_cmd("progress", "json")
        return json.loads(out)
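
    # Shape of the `ceph progress json` output that this test relies on,
    # inferred from the fields read below; events may carry additional keys
    # that the test does not depend on:
    #
    #   {
    #     "events":    [ {"id": "...", "message": "...", ...}, ... ],
    #     "completed": [ {"id": "...", "message": "...", ...}, ... ]
    #   }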

    def _all_events(self):
        """
        To avoid racing on completion, we almost always want to look
        for events in the total list of active and complete, so
        munge them into a single list.
        """
        p = self._get_progress()
        log.info(json.dumps(p, indent=2))
        return p['events'] + p['completed']

    def _events_in_progress(self):
        """
        Return only the events that are still in progress.
        """
        p = self._get_progress()
        log.info(json.dumps(p, indent=2))
        return p['events']

    def _setup_pool(self, size=None):
        self.mgr_cluster.mon_manager.create_pool(self.POOL)
        if size is not None:
            self.mgr_cluster.mon_manager.raw_cluster_cmd(
                'osd', 'pool', 'set', self.POOL, 'size', str(size))

    def _write_some_data(self, t):
        """
        To adapt to test systems of varying performance, we write
        data for a defined time period, rather than to a defined
        capacity.  This will hopefully result in a similar timescale
        for PG recovery after an OSD failure.
        """
        args = [
            "rados", "-p", self.POOL, "bench", str(t), "write", "-t", "16"]

        self.mgr_cluster.admin_remote.run(args=args, wait=True)

    def _osd_count(self):
        osd_map = self.mgr_cluster.mon_manager.get_osd_dump_json()
        return len(osd_map['osds'])

    def setUp(self):
        super(TestProgress, self).setUp()
        # Ensure we have at least four OSDs
        if self._osd_count() < 4:
            self.skipTest("Not enough OSDS!")

        # Remove any filesystems so that we can remove their pools
        self.mds_cluster.mds_stop()
        self.mds_cluster.mds_fail()
        self.mds_cluster.delete_all_filesystems()

        # Remove all other pools
        for pool in self.mgr_cluster.mon_manager.get_osd_dump_json()['pools']:
            self.mgr_cluster.mon_manager.remove_pool(pool['pool_name'])

        self._load_module("progress")
        self.mgr_cluster.mon_manager.raw_cluster_cmd('progress', 'clear')

    def _simulate_failure(self, osd_ids=None):
        """
        Common lead-in to several tests: get some data in the cluster,
        then mark an OSD out to trigger the start of a progress event.

        Return the JSON representation of the failure event.
        """
        if osd_ids is None:
            osd_ids = [0]

        self._setup_pool()
        self._write_some_data(self.WRITE_PERIOD)

        for osd_id in osd_ids:
            self.mgr_cluster.mon_manager.raw_cluster_cmd(
                'osd', 'out', str(osd_id))

        # Wait for a progress event to pop up
        self.wait_until_equal(lambda: len(self._all_events()), 1,
                              timeout=self.EVENT_CREATION_PERIOD)
        ev = self._all_events()[0]
        log.info(json.dumps(ev, indent=1))
        self.assertIn("Rebalancing after osd.0 marked out", ev['message'])

        return ev

    def _simulate_back_in(self, osd_ids, initial_event):
        for osd_id in osd_ids:
            self.mgr_cluster.mon_manager.raw_cluster_cmd(
                'osd', 'in', str(osd_id))

        # The initial event should complete promptly
        self.wait_until_true(lambda: self._is_complete(initial_event['id']),
                             timeout=self.EVENT_CREATION_PERIOD)

        try:
            # Wait for a progress event for the marked-in OSD to pop up
            self.wait_until_equal(lambda: len(self._events_in_progress()), 1,
                                  timeout=self.EVENT_CREATION_PERIOD)
        except RuntimeError as ex:
            if "Timed out after" not in str(ex):
                raise ex

            log.info("There were no PGs affected by the osd being marked in")
            return None

        new_event = self._events_in_progress()[0]
        log.info(json.dumps(new_event, indent=1))
        self.assertIn("Rebalancing after osd.0 marked in", new_event['message'])

        return new_event

    def _is_quiet(self):
        """
        Return True if no progress events are currently live.
        """
        return len(self._get_progress()['events']) == 0

    def _is_complete(self, ev_id):
        progress = self._get_progress()
        live_ids = [ev['id'] for ev in progress['events']]
        complete_ids = [ev['id'] for ev in progress['completed']]
        if ev_id in complete_ids:
            assert ev_id not in live_ids
            return True
        else:
            assert ev_id in live_ids
            return False

    def tearDown(self):
        if self.POOL in self.mgr_cluster.mon_manager.pools:
            self.mgr_cluster.mon_manager.remove_pool(self.POOL)

        osd_map = self.mgr_cluster.mon_manager.get_osd_dump_json()
        for osd in osd_map['osds']:
            if osd['weight'] == 0.0:
                self.mgr_cluster.mon_manager.raw_cluster_cmd(
                    'osd', 'in', str(osd['osd']))

        super(TestProgress, self).tearDown()

    def test_osd_healthy_recovery(self):
        """
        The simple recovery case: an OSD goes down, its PGs get a new
        placement, and we wait for the PGs to become healthy in their
        new locations.
        """
        ev = self._simulate_failure()

        # Wait for progress event to ultimately reach completion
        self.wait_until_true(lambda: self._is_complete(ev['id']),
                             timeout=self.RECOVERY_PERIOD)
        self.assertTrue(self._is_quiet())

    def test_pool_removal(self):
        """
        That a pool removed during OSD recovery causes the
        progress event to be correctly marked complete once there
        is no more data to move.
        """
        ev = self._simulate_failure()

        self.mgr_cluster.mon_manager.remove_pool(self.POOL)

        # Event should complete promptly
        self.wait_until_true(lambda: self._is_complete(ev['id']),
                             timeout=self.EVENT_CREATION_PERIOD)
        self.assertTrue(self._is_quiet())

    def test_osd_came_back(self):
        """
        When a recovery is underway, but then the out OSD
        comes back in, such that recovery is no longer necessary.
        A second event should be created for the OSD being marked in,
        and the still-ongoing one should be cancelled.
        """
        ev1 = self._simulate_failure()

        ev2 = self._simulate_back_in([0], ev1)

        # Wait for progress event to ultimately complete
        self.wait_until_true(lambda: self._is_complete(ev2['id']),
                             timeout=self.RECOVERY_PERIOD)

        self.assertTrue(self._is_quiet())

    def test_osd_cannot_recover(self):
        """
        When the cluster cannot recover from a lost OSD, e.g.
        because there is no suitable new placement for it
        (a size=3 pool when there are only 2 OSDs left, or when
        the remaining OSDs are only on 2 hosts).

        No progress event should be created.
        """
        pool_size = 3

        self._setup_pool(size=pool_size)
        self._write_some_data(self.WRITE_PERIOD)

        # Fail enough OSDs so there are fewer than N_replicas OSDs
        # available.
        osd_count = self._osd_count()

        # First do some failures that will result in a normal rebalance
        # (Assumption: we're in a test environment that is configured
        # not to require replicas be on different hosts, like teuthology)
        for osd_id in range(0, osd_count - pool_size):
            self.mgr_cluster.mon_manager.raw_cluster_cmd(
                'osd', 'out', str(osd_id))

        # We should see an event for each of the OSDs we took out
        self.wait_until_equal(
            lambda: len(self._all_events()),
            osd_count - pool_size,
            timeout=self.EVENT_CREATION_PERIOD)

        # Those should complete cleanly
        self.wait_until_true(
            lambda: self._is_quiet(),
            timeout=self.RECOVERY_PERIOD)

        # Fail one last OSD, at the point the PGs have nowhere to go
        victim_osd = osd_count - pool_size
        self.mgr_cluster.mon_manager.raw_cluster_cmd(
            'osd', 'out', str(victim_osd))

        # Check that no new event is created
        time.sleep(self.EVENT_CREATION_PERIOD)

        self.assertEqual(len(self._all_events()), osd_count - pool_size)
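

# A rough sketch of how this suite is often run against a local development
# cluster (assumed invocation from a Ceph build directory with a vstart
# cluster running; the runner path and module name may differ by version):
#
#   python3 ../qa/tasks/vstart_runner.py tasks.mgr.test_progress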
)