# ceph/qa/tasks/mgr/test_progress.py
import json
import logging
import time

from tasks.mgr.mgr_test_case import MgrTestCase


log = logging.getLogger(__name__)


class TestProgress(MgrTestCase):
    POOL = "progress_data"

    # How long we expect to wait at most between taking an OSD out
    # and seeing the progress event pop up.
    EVENT_CREATION_PERIOD = 5

    WRITE_PERIOD = 30

    # Generous period for OSD recovery, should be the same order of
    # magnitude as how long it took to write the data to begin with
    RECOVERY_PERIOD = WRITE_PERIOD * 4

    def _get_progress(self):
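        # For reference: the assertions below assume "ceph progress json"
        # returns a structure of roughly this shape (field names inferred
        # from how this file uses the output; the real module may include
        # additional fields):
        #
        #   {
        #     "events":    [{"id": ..., "message": ...}, ...],
        #     "completed": [{"id": ..., "message": ...}, ...]
        #   }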
        out = self.mgr_cluster.mon_manager.raw_cluster_cmd("progress", "json")
        return json.loads(out)

    def _all_events(self):
        """
        To avoid racing on completion, we almost always want to look
        for events in the total list of active and complete, so
        munge them into a single list.
        """
        p = self._get_progress()
        log.info(json.dumps(p, indent=2))
        return p['events'] + p['completed']

    def _events_in_progress(self):
        """
        Return all events that are still in progress.
        """
        p = self._get_progress()
        log.info(json.dumps(p, indent=2))
        return p['events']

    def _setup_pool(self, size=None):
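        """
        Create the test pool, optionally overriding its replica count
        ('size').
        """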
        self.mgr_cluster.mon_manager.create_pool(self.POOL)
        if size is not None:
            self.mgr_cluster.mon_manager.raw_cluster_cmd(
                'osd', 'pool', 'set', self.POOL, 'size', str(size))

    def _write_some_data(self, t):
        """
        To adapt to test systems of varying performance, we write
        data for a defined time period, rather than to a defined
        capacity. This will hopefully result in a similar timescale
        for PG recovery after an OSD failure.
        """

        args = [
            "rados", "-p", self.POOL, "bench", str(t), "write", "-t", "16"]

        self.mgr_cluster.admin_remote.run(args=args, wait=True)

    def _osd_count(self):
        osd_map = self.mgr_cluster.mon_manager.get_osd_dump_json()
        return len(osd_map['osds'])

    def setUp(self):
        super(TestProgress, self).setUp()
        # Ensure we have at least four OSDs
        if self._osd_count() < 4:
            self.skipTest("Not enough OSDs!")

        # Remove any filesystems so that we can remove their pools
        if self.mds_cluster:
            self.mds_cluster.mds_stop()
            self.mds_cluster.mds_fail()
            self.mds_cluster.delete_all_filesystems()

        # Remove all other pools
        for pool in self.mgr_cluster.mon_manager.get_osd_dump_json()['pools']:
            self.mgr_cluster.mon_manager.remove_pool(pool['pool_name'])

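        # Enable the progress module and start each test from a clean
        # slate of events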
        self._load_module("progress")
        self.mgr_cluster.mon_manager.raw_cluster_cmd('progress', 'clear')

    def _simulate_failure(self, osd_ids=None):
        """
        Common lead-in to several tests: get some data in the cluster,
        then mark an OSD out to trigger the start of a progress event.

        Return the JSON representation of the failure event.
        """

        if osd_ids is None:
            osd_ids = [0]

        self._setup_pool()
        self._write_some_data(self.WRITE_PERIOD)

        for osd_id in osd_ids:
            self.mgr_cluster.mon_manager.raw_cluster_cmd(
                'osd', 'out', str(osd_id))

        # Wait for a progress event to pop up
        self.wait_until_equal(lambda: len(self._all_events()), 1,
                              timeout=self.EVENT_CREATION_PERIOD)
        ev = self._all_events()[0]
        log.info(json.dumps(ev, indent=1))
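        # Callers currently rely on the default osd_ids=[0], so checking
        # for the osd.0 message is sufficient here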
        self.assertIn("Rebalancing after osd.0 marked out", ev['message'])

        return ev

    def _simulate_back_in(self, osd_ids, initial_event):
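        """
        Mark the given OSDs back in, wait for the initial "marked out"
        event to complete, and return the follow-up "marked in" event,
        or None if marking the OSDs back in affected no PGs.
        """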
        for osd_id in osd_ids:
            self.mgr_cluster.mon_manager.raw_cluster_cmd(
                'osd', 'in', str(osd_id))

        # The first event should complete promptly
        self.wait_until_true(lambda: self._is_complete(initial_event['id']),
                             timeout=self.EVENT_CREATION_PERIOD)

        try:
            # Wait for the "marked in" progress event to pop up
            self.wait_until_equal(lambda: len(self._events_in_progress()), 1,
                                  timeout=self.EVENT_CREATION_PERIOD)
        except RuntimeError as ex:
            if "Timed out after" not in str(ex):
                raise ex

            log.info("There were no PGs affected by the osd being marked in")
            return None

        new_event = self._events_in_progress()[0]
        log.info(json.dumps(new_event, indent=1))
        self.assertIn("Rebalancing after osd.0 marked in", new_event['message'])

        return new_event

    def _is_quiet(self):
        """
        Whether any progress events are live.
        """
        return len(self._get_progress()['events']) == 0

    def _is_complete(self, ev_id):
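        """
        Whether the event with the given id has moved from the live
        'events' list to the 'completed' list (the id is asserted to be
        present in exactly one of the two).
        """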
        progress = self._get_progress()
        live_ids = [ev['id'] for ev in progress['events']]
        complete_ids = [ev['id'] for ev in progress['completed']]
        if ev_id in complete_ids:
            assert ev_id not in live_ids
            return True
        else:
            assert ev_id in live_ids
            return False

    def tearDown(self):
        if self.POOL in self.mgr_cluster.mon_manager.pools:
            self.mgr_cluster.mon_manager.remove_pool(self.POOL)

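        # Mark back in any OSDs that a test left out, so that later tests
        # start with the full set of OSDs available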
        osd_map = self.mgr_cluster.mon_manager.get_osd_dump_json()
        for osd in osd_map['osds']:
            if osd['weight'] == 0.0:
                self.mgr_cluster.mon_manager.raw_cluster_cmd(
                    'osd', 'in', str(osd['osd']))

        super(TestProgress, self).tearDown()

    def test_osd_healthy_recovery(self):
        """
        The simple recovery case: an OSD goes down, its PGs get a new
        placement, and we wait for the PGs to become healthy in their
        new locations.
        """
        ev = self._simulate_failure()

        # Wait for the progress event to ultimately reach completion
        self.wait_until_true(lambda: self._is_complete(ev['id']),
                             timeout=self.RECOVERY_PERIOD)
        self.assertTrue(self._is_quiet())

    def test_pool_removal(self):
        """
        That a pool removed during OSD recovery causes the
        progress event to be correctly marked complete once there
        is no more data to move.
        """
        ev = self._simulate_failure()

        self.mgr_cluster.mon_manager.remove_pool(self.POOL)

        # Event should complete promptly
        self.wait_until_true(lambda: self._is_complete(ev['id']),
                             timeout=self.EVENT_CREATION_PERIOD)
        self.assertTrue(self._is_quiet())

    def test_osd_came_back(self):
        """
        When a recovery is underway, but then the out OSD
        comes back in, such that recovery is no longer necessary.

        A second event should be created when the OSD is marked in,
        and the one that is still ongoing should be cancelled.
        """
        ev1 = self._simulate_failure()

        ev2 = self._simulate_back_in([0], ev1)

        if ev2 is not None:
            # Wait for the progress event to ultimately complete
            self.wait_until_true(lambda: self._is_complete(ev2['id']),
                                 timeout=self.RECOVERY_PERIOD)

        self.assertTrue(self._is_quiet())

    def test_osd_cannot_recover(self):
        """
        When the cluster cannot recover from a lost OSD, e.g.
        because there is no suitable new placement for it
        (a size=3 pool when there are only 2 OSDs left, or
        a size=3 pool when the remaining OSDs are only on 2 hosts),
        no progress event should be created.
        """

        pool_size = 3

        self._setup_pool(size=pool_size)
        self._write_some_data(self.WRITE_PERIOD)

        # Fail enough OSDs so that fewer than N_replicas OSDs
        # remain available.
        osd_count = self._osd_count()

        # First do some failures that will result in a normal rebalance
        # (Assumption: we're in a test environment that is configured
        # not to require replicas be on different hosts, like teuthology)
        for osd_id in range(0, osd_count - pool_size):
            self.mgr_cluster.mon_manager.raw_cluster_cmd(
                'osd', 'out', str(osd_id))

        # We should see an event for each of the OSDs we took out
        self.wait_until_equal(
            lambda: len(self._all_events()),
            osd_count - pool_size,
            timeout=self.EVENT_CREATION_PERIOD)

        # Those should complete cleanly
        self.wait_until_true(
            lambda: self._is_quiet(),
            timeout=self.RECOVERY_PERIOD
        )

        # Fail one last OSD, at which point the PGs have nowhere to go
        victim_osd = osd_count - pool_size
        self.mgr_cluster.mon_manager.raw_cluster_cmd(
            'osd', 'out', str(victim_osd))

        # Check that no event is created
        time.sleep(self.EVENT_CREATION_PERIOD)

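        # The event count should be unchanged from before the final
        # 'osd out', i.e. no event was created for the unrecoverable PGs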
        self.assertEqual(len(self._all_events()), osd_count - pool_size)