# Source: git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/cephfs/test_snap_schedules.py
# Commit subject: import quincy 17.2.0
# Path: [ceph.git] / ceph / qa / tasks / cephfs / test_snap_schedules.py
import errno
import json
import logging
import os
import time
from datetime import datetime, timedelta, timezone

from teuthology.exceptions import CommandFailedError

from tasks.cephfs.cephfs_test_case import CephFSTestCase
10
11 log = logging.getLogger(__name__)
12
def extract_schedule_and_retention_spec(spec=None):
    """Split a list of (schedule, retention) pairs into two sets.

    :param spec: iterable of indexable entries; element 0 of each entry is
                 collected as a schedule and element 1 as a retention.
                 ``None`` (the default) is treated as an empty list.
    :return: tuple ``(schedules, retentions)``, each a ``set``.
    """
    # A ``None`` default replaces the former ``[]`` default so that no list
    # object is shared between calls.
    pairs = [] if spec is None else spec
    schedule = {entry[0] for entry in pairs}
    retention = {entry[1] for entry in pairs}
    return (schedule, retention)
17
def seconds_upto_next_schedule(time_from, timo):
    """Return the seconds left until ``timo`` seconds past the current minute.

    ``time_from`` is truncated to whole seconds, rounded down to the start of
    the minute it falls in, and ``timo`` is added to that minute start; the
    result is the distance from the truncated ``time_from`` to that instant.

    :param time_from: epoch timestamp (int or float).
    :param timo: schedule period in seconds.
    :return: integer number of seconds to wait.
    """
    ts = int(time_from)
    # Integer floor division keeps the arithmetic exact; the previous
    # ``int(ts / 60)`` went through a float and loses precision for very
    # large values.
    return ((ts // 60) * 60 + timo) - ts
21
class TestSnapSchedules(CephFSTestCase):
    """Tests driving ``ceph fs snap-schedule ...`` and checking the results.

    Each test creates ``TEST_DIRECTORY`` through ``mount_a``, configures
    schedules (and optionally retention) with the ``snap-schedule`` commands,
    and then inspects the directory's ``.snap`` listing and/or the command
    output to confirm what happened.
    """
    CLIENTS_REQUIRED = 1

    TEST_VOLUME_NAME = 'snap_vol'
    TEST_DIRECTORY = 'snap_test_dir1'

    # this should be in sync with snap_schedule format
    SNAPSHOT_TS_FORMAT = '%Y-%m-%d-%H_%M_%S'

    def check_scheduled_snapshot(self, exec_time, timo):
        """Assert that roughly ``timo`` seconds have elapsed since ``exec_time``.

        :param exec_time: epoch time at which the schedule was set up.
        :param timo: expected delay in seconds.
        """
        now = time.time()
        delta = now - exec_time
        log.debug(f'exec={exec_time}, now = {now}, timo = {timo}')
        # tolerate snapshot existence in the range [-5,+5]
        self.assertAlmostEqual(delta, timo, delta=5)

    def _fs_cmd(self, *args):
        """Run ``ceph fs <args...>`` through the mon manager and return its output."""
        return self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", *args)

    def fs_snap_schedule_cmd(self, *args, **kwargs):
        """Run ``ceph fs snap-schedule <args...>`` and return its output.

        :param args: leading positional words of the command (e.g. ``'list'``).
        :param kwargs: ``fs`` selects the file system (defaults to
            ``self.volname``) and ``format`` is passed as ``--format``.
            Every other keyword is appended as a positional argument, in the
            order given; only its value is used, the keyword name is a label
            for the reader.
        """
        fs = kwargs.pop('fs', self.volname)
        args += ('--fs', fs)
        if 'format' in kwargs:
            args += ('--format', kwargs.pop('format'))
        args += tuple(str(val) for val in kwargs.values())
        res = self._fs_cmd('snap-schedule', *args)
        log.debug(f'res={res}')
        return res

    def _create_or_reuse_test_volume(self):
        """Use the first existing volume, or create ``TEST_VOLUME_NAME`` if none exist."""
        result = json.loads(self._fs_cmd("volume", "ls"))
        if not result:
            # remember that we own the volume so tearDown cleans it up
            self.vol_created = True
            self.volname = TestSnapSchedules.TEST_VOLUME_NAME
            self._fs_cmd("volume", "create", self.volname)
        else:
            self.volname = result[0]['name']

    def _enable_snap_schedule(self):
        """Enable the ``snap_schedule`` mgr module."""
        return self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable", "snap_schedule")

    def _disable_snap_schedule(self):
        """Disable the ``snap_schedule`` mgr module."""
        return self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "disable", "snap_schedule")

    def _allow_minute_granularity_snapshots(self):
        """Set ``mgr/snap_schedule/allow_m_granularity`` so the 'M' schedules used here are accepted."""
        self.config_set('mgr', 'mgr/snap_schedule/allow_m_granularity', True)

    def _dump_on_update(self):
        """Set ``mgr/snap_schedule/dump_on_update`` on the mgr."""
        self.config_set('mgr', 'mgr/snap_schedule/dump_on_update', True)

    def setUp(self):
        """Pick or create a volume, reset callback state and enable the module."""
        super().setUp()
        self.volname = None
        self.vol_created = False
        self._create_or_reuse_test_volume()
        self.create_cbks = []
        self.remove_cbks = []
        # used to figure out which snapshots are created/deleted
        self.snapshots = set()
        self._enable_snap_schedule()
        self._allow_minute_granularity_snapshots()
        self._dump_on_update()

    def tearDown(self):
        """Remove the volume if this test created it and disable the module."""
        try:
            if self.vol_created:
                # NOTE(review): _delete_test_volume is not defined in this
                # class; presumably inherited -- confirm it exists.
                self._delete_test_volume()
            self._disable_snap_schedule()
        finally:
            # always let the base class tear down, even if cleanup above failed
            super().tearDown()

    def _schedule_to_timeout(self, schedule):
        """Convert a schedule string such as ``'1M'`` or ``'2h'`` to seconds.

        :param schedule: an integer followed by a one-character unit:
            ``M`` (minutes), ``h`` (hours), ``d`` (days) or ``w`` (weeks).
        :raises RuntimeError: if the unit character is not recognized.
        """
        seconds_per_unit = {
            'M': 60,
            'h': 60 * 60,
            'd': 60 * 60 * 24,
            'w': 60 * 60 * 24 * 7,
        }
        mult = schedule[-1]
        period = int(schedule[0:-1])
        unit_seconds = seconds_per_unit.get(mult)
        if unit_seconds is None:
            raise RuntimeError('schedule multiplier not recognized')
        return period * unit_seconds

    def add_snap_create_cbk(self, cbk):
        """Register a callback to be fed newly appeared snapshot names."""
        self.create_cbks.append(cbk)

    def remove_snap_create_cbk(self, cbk):
        """Unregister a snapshot-creation callback."""
        self.create_cbks.remove(cbk)

    def add_snap_remove_cbk(self, cbk):
        """Register a callback to be fed names of snapshots that disappeared."""
        self.remove_cbks.append(cbk)

    def remove_snap_remove_cbk(self, cbk):
        """Unregister a snapshot-removal callback."""
        self.remove_cbks.remove(cbk)

    def assert_if_not_verified(self):
        """Fail unless every registered callback has been satisfied and removed."""
        self.assertListEqual(self.create_cbks, [])
        self.assertListEqual(self.remove_cbks, [])

    def verify(self, dir_path, max_trials):
        """Poll ``<dir_path>/.snap`` once a second and dispatch changes to callbacks.

        On each trial the current listing is diffed against the previous one.
        Added names go to the creation callbacks and removed names to the
        removal callbacks; the first callback of each kind that returns a
        truthy value is unregistered. Polling stops when no callbacks remain
        or after ``max_trials`` trials.
        """
        trials = 0
        snap_path = "{0}/.snap".format(dir_path)
        while (self.create_cbks or self.remove_cbks) and trials < max_trials:
            snapshots = set(self.mount_a.ls(path=snap_path))
            log.info(f"snapshots: {snapshots}")
            added = snapshots - self.snapshots
            log.info(f"added: {added}")
            removed = self.snapshots - snapshots
            log.info(f"removed: {removed}")
            if added:
                # iterate over a copy: the callback list is mutated on success
                for cbk in list(self.create_cbks):
                    if cbk(list(added)):
                        self.remove_snap_create_cbk(cbk)
                        break
            if removed:
                for cbk in list(self.remove_cbks):
                    if cbk(list(removed)):
                        self.remove_snap_remove_cbk(cbk)
                        break
            self.snapshots = snapshots
            trials += 1
            time.sleep(1)

    def calc_wait_time_and_snap_name(self, snap_sched_exec_epoch, schedule):
        """Compute how long to wait for a snapshot and its expected name suffix.

        :param snap_sched_exec_epoch: epoch time at which the schedule was added.
        :param schedule: schedule string, e.g. ``'1M'``.
        :return: ``(wait_seconds, timestamp)`` where ``timestamp`` is the UTC
            time ``wait_seconds`` after ``snap_sched_exec_epoch`` rendered
            with ``SNAPSHOT_TS_FORMAT``.
        """
        timo = self._schedule_to_timeout(schedule)
        # calculate wait time upto the next minute
        wait_timo = seconds_upto_next_schedule(snap_sched_exec_epoch, timo)

        # expected "scheduled" snapshot name; timezone-aware equivalent of the
        # deprecated datetime.utcfromtimestamp()
        scheduled_at = (datetime.fromtimestamp(snap_sched_exec_epoch, tz=timezone.utc)
                        + timedelta(seconds=wait_timo))
        ts_name = scheduled_at.strftime(TestSnapSchedules.SNAPSHOT_TS_FORMAT)
        return (wait_timo, ts_name)

    def verify_schedule(self, dir_path, schedules, retentions=None):
        """Assert that ``snap-schedule list`` reports the expected entries.

        :param dir_path: path whose schedules are listed.
        :param schedules: values that must each be present in the
            ``schedule`` field of the JSON output.
        :param retentions: values that must each be present in the
            ``retention`` field; ``None`` (the default) checks nothing.
        """
        # ``None`` default avoids sharing one list object between calls
        retentions = [] if retentions is None else retentions
        log.debug(f'expected_schedule: {schedules}, expected_retention: {retentions}')

        result = self.fs_snap_schedule_cmd('list', path=dir_path, format='json')
        json_res = json.loads(result)
        log.debug(f'json_res: {json_res}')

        for schedule in schedules:
            self.assertIn(schedule, json_res['schedule'])
        for retention in retentions:
            self.assertIn(retention, json_res['retention'])

    def remove_snapshots(self, dir_path):
        """Remove every entry under ``<dir_path>/.snap`` with ``rmdir``."""
        snap_path = f'{dir_path}/.snap'

        snapshots = self.mount_a.ls(path=snap_path)
        for snapshot in snapshots:
            snapshot_path = os.path.join(snap_path, snapshot)
            log.debug(f'removing snapshot: {snapshot_path}')
            self.mount_a.run_shell(['rmdir', snapshot_path])

    def _assert_schedule_list_fails_with_enoent(self, dir_path):
        """Require ``snap-schedule list`` on ``dir_path`` to fail with ENOENT.

        :raises RuntimeError: if the command succeeds, or fails with any
            other exit status.
        """
        try:
            self.fs_snap_schedule_cmd('list', path=dir_path)
        except CommandFailedError as ce:
            if ce.exitstatus != errno.ENOENT:
                raise RuntimeError('incorrect errno when listing a non-existing snap schedule')
        else:
            raise RuntimeError('expected "fs snap-schedule list" to fail')

    def _make_snap_name_cbk(self, action, snap_sfx, exec_time, timo):
        """Build a callback that recognizes one expected scheduled snapshot.

        The returned callable takes the list of changed snapshot names,
        asserts there is exactly one, and returns True when that name is
        ``scheduled-`` followed by the first 16 characters of ``snap_sfx``
        (with ``SNAPSHOT_TS_FORMAT`` that is the timestamp up to the minute,
        so the seconds are ignored). On a match it also checks the elapsed
        time against ``timo``.

        :param action: word used in the debug log line ('added' or 'removed').
        """
        def cbk(snaps):
            log.debug(f'snapshots {action}={snaps}')
            self.assertEqual(len(snaps), 1)
            snapname = snaps[0]
            if snapname.startswith('scheduled-') and snapname[10:26] == snap_sfx[:16]:
                self.check_scheduled_snapshot(exec_time, timo)
                return True
            return False
        return cbk

    def test_non_existent_snap_schedule_list(self):
        """Test listing snap schedules on a non-existing filesystem path failure"""
        self._assert_schedule_list_fails_with_enoent(TestSnapSchedules.TEST_DIRECTORY)

    def test_non_existent_schedule(self):
        """Test listing non-existing snap schedules failure"""
        self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY])

        self._assert_schedule_list_fails_with_enoent(TestSnapSchedules.TEST_DIRECTORY)

        self.mount_a.run_shell(['rmdir', TestSnapSchedules.TEST_DIRECTORY])

    def test_snap_schedule_list_post_schedule_remove(self):
        """Test listing snap schedules post removal of a schedule"""
        self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY])

        self.fs_snap_schedule_cmd('add', path=TestSnapSchedules.TEST_DIRECTORY, snap_schedule='1h')

        self.fs_snap_schedule_cmd('remove', path=TestSnapSchedules.TEST_DIRECTORY)

        self._assert_schedule_list_fails_with_enoent(TestSnapSchedules.TEST_DIRECTORY)

        self.mount_a.run_shell(['rmdir', TestSnapSchedules.TEST_DIRECTORY])

    def test_snap_schedule(self):
        """Test existence of a scheduled snapshot"""
        self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY])

        # set a schedule on the dir
        self.fs_snap_schedule_cmd('add', path=TestSnapSchedules.TEST_DIRECTORY, snap_schedule='1M')
        exec_time = time.time()

        timo, snap_sfx = self.calc_wait_time_and_snap_name(exec_time, '1M')
        log.debug(f'expecting snap {TestSnapSchedules.TEST_DIRECTORY}/.snap/scheduled-{snap_sfx} in ~{timo}s...')
        to_wait = timo + 2  # some leeway to avoid false failures...

        # verify snapshot schedule
        self.verify_schedule(TestSnapSchedules.TEST_DIRECTORY, ['1M'])

        self.add_snap_create_cbk(self._make_snap_name_cbk('added', snap_sfx, exec_time, timo))
        self.verify(TestSnapSchedules.TEST_DIRECTORY, to_wait)
        self.assert_if_not_verified()

        # remove snapshot schedule
        self.fs_snap_schedule_cmd('remove', path=TestSnapSchedules.TEST_DIRECTORY)

        # remove all scheduled snapshots
        self.remove_snapshots(TestSnapSchedules.TEST_DIRECTORY)

        self.mount_a.run_shell(['rmdir', TestSnapSchedules.TEST_DIRECTORY])

    def test_multi_snap_schedule(self):
        """Test existence of multiple scheduled snapshots"""
        self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY])

        # set schedules on the dir
        self.fs_snap_schedule_cmd('add', path=TestSnapSchedules.TEST_DIRECTORY, snap_schedule='1M')
        self.fs_snap_schedule_cmd('add', path=TestSnapSchedules.TEST_DIRECTORY, snap_schedule='2M')
        exec_time = time.time()

        timo_1, snap_sfx_1 = self.calc_wait_time_and_snap_name(exec_time, '1M')
        log.debug(f'expecting snap {TestSnapSchedules.TEST_DIRECTORY}/.snap/scheduled-{snap_sfx_1} in ~{timo_1}s...')
        timo_2, snap_sfx_2 = self.calc_wait_time_and_snap_name(exec_time, '2M')
        log.debug(f'expecting snap {TestSnapSchedules.TEST_DIRECTORY}/.snap/scheduled-{snap_sfx_2} in ~{timo_2}s...')
        to_wait = timo_2 + 2  # use max timeout

        # verify snapshot schedule
        self.verify_schedule(TestSnapSchedules.TEST_DIRECTORY, ['1M', '2M'])

        self.add_snap_create_cbk(self._make_snap_name_cbk('added', snap_sfx_1, exec_time, timo_1))
        self.add_snap_create_cbk(self._make_snap_name_cbk('added', snap_sfx_2, exec_time, timo_2))
        self.verify(TestSnapSchedules.TEST_DIRECTORY, to_wait)
        self.assert_if_not_verified()

        # remove snapshot schedule
        self.fs_snap_schedule_cmd('remove', path=TestSnapSchedules.TEST_DIRECTORY)

        # remove all scheduled snapshots
        self.remove_snapshots(TestSnapSchedules.TEST_DIRECTORY)

        self.mount_a.run_shell(['rmdir', TestSnapSchedules.TEST_DIRECTORY])

    def test_snap_schedule_with_retention(self):
        """Test scheduled snapshots along with retention policy"""
        self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY])

        # set a schedule on the dir
        self.fs_snap_schedule_cmd('add', path=TestSnapSchedules.TEST_DIRECTORY, snap_schedule='1M')
        self.fs_snap_schedule_cmd('retention', 'add', path=TestSnapSchedules.TEST_DIRECTORY, retention_spec_or_period='1M')
        exec_time = time.time()

        timo_1, snap_sfx = self.calc_wait_time_and_snap_name(exec_time, '1M')
        log.debug(f'expecting snap {TestSnapSchedules.TEST_DIRECTORY}/.snap/scheduled-{snap_sfx} in ~{timo_1}s...')
        to_wait = timo_1 + 2  # some leeway to avoid false failures...

        # verify snapshot schedule
        self.verify_schedule(TestSnapSchedules.TEST_DIRECTORY, ['1M'], retentions=[{'M': 1}])

        self.add_snap_create_cbk(self._make_snap_name_cbk('added', snap_sfx, exec_time, timo_1))
        self.verify(TestSnapSchedules.TEST_DIRECTORY, to_wait)
        self.assert_if_not_verified()

        timo_2 = timo_1 + 60  # expected snapshot removal timeout
        log.debug(f'expecting removal of snap {TestSnapSchedules.TEST_DIRECTORY}/.snap/scheduled-{snap_sfx} in ~{timo_2}s...')
        to_wait = timo_2
        self.add_snap_remove_cbk(self._make_snap_name_cbk('removed', snap_sfx, exec_time, timo_2))
        self.verify(TestSnapSchedules.TEST_DIRECTORY, to_wait + 2)
        self.assert_if_not_verified()

        # remove snapshot schedule
        self.fs_snap_schedule_cmd('remove', path=TestSnapSchedules.TEST_DIRECTORY)

        # remove all scheduled snapshots
        self.remove_snapshots(TestSnapSchedules.TEST_DIRECTORY)

        self.mount_a.run_shell(['rmdir', TestSnapSchedules.TEST_DIRECTORY])

    def get_snap_stats(self, dir_path):
        """Count snapshots of ``dir_path`` on disk and as reported by ``status``.

        :param dir_path: directory path; its first character is dropped to
            build the ``.snap`` path handed to ``mount_a.ls`` (callers in this
            file pass paths starting with '/').
        :return: dict with ``fs_count`` (entries listed under ``.snap``) and
            ``db_count`` (``created_count`` of the first entry returned by
            ``snap-schedule status`` for the '1M' schedule).
        """
        snap_path = f"{dir_path}/.snap"[1:]
        snapshots = self.mount_a.ls(path=snap_path)
        fs_count = len(snapshots)
        log.debug(f'snapshots: {snapshots}')

        result = self.fs_snap_schedule_cmd('status', path=dir_path,
                                           snap_schedule='1M', format='json')
        json_res = json.loads(result)[0]
        db_count = int(json_res['created_count'])
        log.debug(f'json_res: {json_res}')

        return {'fs_count': fs_count, 'db_count': db_count}

    def verify_snap_stats(self, dir_path):
        """Assert that the on-disk snapshot count equals the reported ``created_count``."""
        snap_stats = self.get_snap_stats(dir_path)
        self.assertEqual(snap_stats['fs_count'], snap_stats['db_count'])

    def test_concurrent_snap_creates(self):
        """Test concurrent snap creates in same file-system without db issues

        Test snap creates at same cadence on same fs to verify correct stats.
        A single SQLite DB Connection handle cannot be used to run concurrent
        transactions and results transaction aborts. This test makes sure that
        proper care has been taken in the code to avoid such situation by
        verifying number of dirs created on the file system with the
        created_count in the schedule_meta table for the specific path.
        """
        self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY])

        testdirs = [os.path.join("/", TestSnapSchedules.TEST_DIRECTORY, "dir" + str(d))
                    for d in range(10)]

        for d in testdirs:
            # d[1:] drops the leading '/' for the shell command
            self.mount_a.run_shell(['mkdir', '-p', d[1:]])
            self.fs_snap_schedule_cmd('add', path=d, snap_schedule='1M')

        exec_time = time.time()
        timo_1, _ = self.calc_wait_time_and_snap_name(exec_time, '1M')

        for d in testdirs:
            self.fs_snap_schedule_cmd('activate', path=d, snap_schedule='1M')

        # we wait for 10 snaps to be taken
        wait_time = timo_1 + 10 * 60 + 15
        time.sleep(wait_time)

        for d in testdirs:
            self.fs_snap_schedule_cmd('deactivate', path=d, snap_schedule='1M')

        for d in testdirs:
            self.verify_snap_stats(d)

        # NOTE(review): the parent TEST_DIRECTORY is left in place here,
        # unlike the other tests -- confirm whether that is intended.
        for d in testdirs:
            self.fs_snap_schedule_cmd('remove', path=d, snap_schedule='1M')
            self.remove_snapshots(d[1:])
            self.mount_a.run_shell(['rmdir', d[1:]])

    def test_snap_schedule_with_mgr_restart(self):
        """Test that snap schedule is resumed after mgr restart"""
        self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY])
        testdir = os.path.join("/", TestSnapSchedules.TEST_DIRECTORY, "test_restart")
        self.mount_a.run_shell(['mkdir', '-p', testdir[1:]])
        self.fs_snap_schedule_cmd('add', path=testdir, snap_schedule='1M')

        exec_time = time.time()
        timo_1, _ = self.calc_wait_time_and_snap_name(exec_time, '1M')

        self.fs_snap_schedule_cmd('activate', path=testdir, snap_schedule='1M')

        # we wait for 10 snaps to be taken
        wait_time = timo_1 + 10 * 60 + 15
        time.sleep(wait_time)

        old_stats = self.get_snap_stats(testdir)
        self.assertEqual(old_stats['fs_count'], old_stats['db_count'])
        self.assertGreater(old_stats['fs_count'], 9)

        # restart mgr
        active_mgr = self.mgr_cluster.mon_manager.get_mgr_dump()['active_name']
        log.debug(f'restarting active mgr: {active_mgr}')
        self.mgr_cluster.mon_manager.revive_mgr(active_mgr)
        time.sleep(300)  # sleep for 5 minutes
        self.fs_snap_schedule_cmd('deactivate', path=testdir, snap_schedule='1M')

        new_stats = self.get_snap_stats(testdir)
        self.assertEqual(new_stats['fs_count'], new_stats['db_count'])
        self.assertGreater(new_stats['fs_count'], old_stats['fs_count'])
        self.assertGreater(new_stats['db_count'], old_stats['db_count'])

        # cleanup
        self.fs_snap_schedule_cmd('remove', path=testdir, snap_schedule='1M')
        self.remove_snapshots(testdir[1:])
        self.mount_a.run_shell(['rmdir', testdir[1:]])