]> git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/cephfs/test_snap_schedules.py
4a9ce838ef7fc22ff2551c78ad3271eb20dd85f2
[ceph.git] / ceph / qa / tasks / cephfs / test_snap_schedules.py
1 import os
2 import json
3 import time
4 import errno
5 import logging
6
7 from tasks.cephfs.cephfs_test_case import CephFSTestCase
8 from teuthology.exceptions import CommandFailedError
9 from datetime import datetime, timedelta
10
11 log = logging.getLogger(__name__)
12
def extract_schedule_and_retention_spec(spec=None):
    """Split a list of (schedule, retention) pairs into two sets.

    :param spec: iterable of indexable items where element 0 is the schedule
                 and element 1 is the retention; ``None`` (the default) is
                 treated as an empty list.
    :returns: tuple ``(schedules, retentions)`` where each member is a set of
              the distinct values seen at index 0 and index 1 respectively.
    """
    # Avoid a shared mutable default argument; None means "no entries".
    entries = [] if spec is None else spec
    schedule = {s[0] for s in entries}
    retention = {s[1] for s in entries}
    return (schedule, retention)
17
18 def seconds_upto_next_schedule(time_from, timo):
19 ts = int(time_from)
20 return ((int(ts / 60) * 60) + timo) - ts
21
class TestSnapSchedulesHelper(CephFSTestCase):
    """Common fixture and helper methods for the snap-schedule tests.

    setUp() picks (or creates) a volume, enables the ``snap_schedule`` mgr
    module and turns on the ``allow_m_granularity`` and ``dump_on_update``
    module options. Tests register callbacks that verify() invokes when
    entries appear in, or disappear from, a directory's snapshot listing.
    """
    CLIENTS_REQUIRED = 1

    TEST_VOLUME_NAME = 'snap_vol'
    TEST_DIRECTORY = 'snap_test_dir1'

    # this should be in sync with snap_schedule format
    SNAPSHOT_TS_FORMAT = '%Y-%m-%d-%H_%M_%S'

    def check_scheduled_snapshot(self, exec_time, timo):
        """Assert that the time elapsed since ``exec_time`` is ``timo`` +/- 5s.

        :param exec_time: epoch timestamp the elapsed time is measured from
        :param timo: expected elapsed time in seconds
        """
        now = time.time()
        delta = now - exec_time
        log.debug(f'exec={exec_time}, now = {now}, timo = {timo}')
        # tolerate snapshot existence in the range [-5,+5]
        self.assertTrue(
            timo - 5 <= delta <= timo + 5,
            f'elapsed time {delta}s is not within 5s of the expected {timo}s')

    def _fs_cmd(self, *args):
        """Run ``fs <args...>`` as a raw cluster command and return its result."""
        return self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", *args)

    def fs_snap_schedule_cmd(self, *args, **kwargs):
        """Run ``fs snap-schedule <args...>`` and return the command result.

        Keyword handling:

        * ``fs``: passed as ``--fs <fs>``; defaults to ``self.volname``.
        * ``format``: when given, passed as ``--format <format>``.
        * every other keyword: only its value is used; it is stringified and
          appended as a positional argument, in the order the keywords were
          passed by the caller.
        """
        fs = kwargs.pop('fs', self.volname)
        args += ('--fs', fs)
        if 'format' in kwargs:
            fmt = kwargs.pop('format')
            args += ('--format', fmt)
        # keyword names are ignored; the values are positional on the CLI
        args += tuple(str(val) for val in kwargs.values())
        res = self._fs_cmd('snap-schedule', *args)
        log.debug(f'res={res}')
        return res

    def _create_or_reuse_test_volume(self):
        """Use the first volume from ``fs volume ls`` or create a new one.

        Sets ``self.volname``; ``self.vol_created`` is set to True only when
        a new volume had to be created.
        """
        result = json.loads(self._fs_cmd("volume", "ls"))
        if not result:
            self.vol_created = True
            self.volname = TestSnapSchedulesHelper.TEST_VOLUME_NAME
            self._fs_cmd("volume", "create", self.volname)
        else:
            self.volname = result[0]['name']

    def _enable_snap_schedule(self):
        """Enable the ``snap_schedule`` mgr module."""
        return self.mgr_cluster.mon_manager.raw_cluster_cmd(
            "mgr", "module", "enable", "snap_schedule")

    def _disable_snap_schedule(self):
        """Disable the ``snap_schedule`` mgr module."""
        return self.mgr_cluster.mon_manager.raw_cluster_cmd(
            "mgr", "module", "disable", "snap_schedule")

    def _allow_minute_granularity_snapshots(self):
        """Set ``mgr/snap_schedule/allow_m_granularity`` to True for mgr."""
        self.config_set('mgr', 'mgr/snap_schedule/allow_m_granularity', True)

    def _dump_on_update(self):
        """Set ``mgr/snap_schedule/dump_on_update`` to True for mgr."""
        self.config_set('mgr', 'mgr/snap_schedule/dump_on_update', True)

    def setUp(self):
        """Prepare the volume, callback lists and snap_schedule module."""
        super().setUp()
        self.volname = None
        self.vol_created = False
        self._create_or_reuse_test_volume()
        self.create_cbks = []
        self.remove_cbks = []
        # used to figure out which snapshots are created/deleted
        self.snapshots = set()
        self._enable_snap_schedule()
        self._allow_minute_granularity_snapshots()
        self._dump_on_update()

    def tearDown(self):
        """Delete the volume if setUp() created it and disable the module."""
        if self.vol_created:
            # NOTE(review): _delete_test_volume() is not defined in this
            # file; presumably it is inherited from CephFSTestCase - confirm.
            self._delete_test_volume()
        self._disable_snap_schedule()
        super().tearDown()

    def _schedule_to_timeout(self, schedule):
        """Convert a schedule such as ``'1M'`` or ``'2h'`` into seconds.

        The last character is the unit (``M``, ``h``, ``d`` or ``w``) and the
        preceding characters are the integer period.

        :raises ValueError: if the period part is not an integer
        :raises RuntimeError: if the unit is not recognized
        """
        seconds_per_unit = {
            'M': 60,
            'h': 60 * 60,
            'd': 60 * 60 * 24,
            'w': 60 * 60 * 24 * 7,
        }
        mult = schedule[-1]
        period = int(schedule[0:-1])
        if mult not in seconds_per_unit:
            raise RuntimeError('schedule multiplier not recognized')
        return period * seconds_per_unit[mult]

    def add_snap_create_cbk(self, cbk):
        """Register a callback to be fed the newly added snapshot names."""
        self.create_cbks.append(cbk)

    def remove_snap_create_cbk(self, cbk):
        """Unregister a snapshot-creation callback."""
        self.create_cbks.remove(cbk)

    def add_snap_remove_cbk(self, cbk):
        """Register a callback to be fed the removed snapshot names."""
        self.remove_cbks.append(cbk)

    def remove_snap_remove_cbk(self, cbk):
        """Unregister a snapshot-removal callback."""
        self.remove_cbks.remove(cbk)

    def assert_if_not_verified(self):
        """Assert that no registered callback is still pending."""
        self.assertListEqual(self.create_cbks, [])
        self.assertListEqual(self.remove_cbks, [])

    def verify(self, dir_path, max_trials):
        """Poll ``<dir_path>/.snap`` and dispatch the registered callbacks.

        Once per second, for at most ``max_trials`` iterations or until no
        callbacks remain, the listing is compared with the previous one
        (``self.snapshots``). Added names are passed to the creation
        callbacks and removed names to the removal callbacks. Within one
        iteration, the first callback of each kind that returns a truthy
        value is unregistered and no further callbacks of that kind are
        tried.
        """
        trials = 0
        snap_path = f'{dir_path}/.snap'
        while (self.create_cbks or self.remove_cbks) and trials < max_trials:
            snapshots = set(self.mount_a.ls(path=snap_path))
            log.info(f'snapshots: {snapshots}')
            added = snapshots - self.snapshots
            log.info(f'added: {added}')
            removed = self.snapshots - snapshots
            log.info(f'removed: {removed}')
            if added:
                # iterate over a copy: a matching callback is unregistered
                for cbk in list(self.create_cbks):
                    if cbk(list(added)):
                        self.remove_snap_create_cbk(cbk)
                        break
            if removed:
                for cbk in list(self.remove_cbks):
                    if cbk(list(removed)):
                        self.remove_snap_remove_cbk(cbk)
                        break
            self.snapshots = snapshots
            trials += 1
            time.sleep(1)

    def calc_wait_time_and_snap_name(self, snap_sched_exec_epoch, schedule):
        """Compute the wait time and timestamp suffix of the expected snapshot.

        :param snap_sched_exec_epoch: epoch timestamp the wait is counted from
        :param schedule: schedule string understood by _schedule_to_timeout()
        :returns: tuple ``(wait_timo, ts_name)``; ``wait_timo`` is the value
                  returned by seconds_upto_next_schedule() and ``ts_name`` is
                  the UTC time ``snap_sched_exec_epoch + wait_timo`` rendered
                  with SNAPSHOT_TS_FORMAT
        """
        timo = self._schedule_to_timeout(schedule)
        # calculate wait time up to the next schedule point
        wait_timo = seconds_upto_next_schedule(snap_sched_exec_epoch, timo)

        # expected "scheduled" snapshot name
        expected_at = (datetime.utcfromtimestamp(snap_sched_exec_epoch)
                       + timedelta(seconds=wait_timo))
        ts_name = expected_at.strftime(
            TestSnapSchedulesHelper.SNAPSHOT_TS_FORMAT)
        return (wait_timo, ts_name)

    def verify_schedule(self, dir_path, schedules, retentions=None):
        """Assert that ``snap-schedule list`` reports the expected entries.

        :param dir_path: path whose schedule is listed (JSON format)
        :param schedules: values expected in the ``schedule`` field
        :param retentions: values expected in the ``retention`` field;
                           ``None`` (the default) checks no retention
        """
        # Avoid a shared mutable default argument.
        retentions = [] if retentions is None else retentions
        log.debug(f'expected_schedule: {schedules}, expected_retention: {retentions}')

        result = self.fs_snap_schedule_cmd('list', path=dir_path, format='json')
        json_res = json.loads(result)
        log.debug(f'json_res: {json_res}')

        # assertIn reports both operands on failure, unlike assertTrue(a in b)
        for schedule in schedules:
            self.assertIn(schedule, json_res['schedule'])
        for retention in retentions:
            self.assertIn(retention, json_res['retention'])
168
class TestSnapSchedules(TestSnapSchedulesHelper):
    """Tests for adding, listing and removing snap schedules and for the
    snapshots they create and prune."""

    def remove_snapshots(self, dir_path):
        """Remove (rmdir) every entry listed under ``<dir_path>/.snap``."""
        snap_path = f'{dir_path}/.snap'

        snapshots = self.mount_a.ls(path=snap_path)
        for snapshot in snapshots:
            snapshot_path = os.path.join(snap_path, snapshot)
            log.debug(f'removing snapshot: {snapshot_path}')
            self.mount_a.run_shell(['rmdir', snapshot_path])

    def _expect_schedule_list_enoent(self, dir_path):
        """Run ``snap-schedule list`` on ``dir_path`` and require ENOENT.

        :raises RuntimeError: if the command succeeds, or if it fails with an
                              exit status other than ENOENT
        """
        try:
            self.fs_snap_schedule_cmd('list', path=dir_path)
        except CommandFailedError as ce:
            if ce.exitstatus != errno.ENOENT:
                raise RuntimeError('incorrect errno when listing a non-existing snap schedule') from ce
        else:
            # reaching here means the command did NOT fail
            raise RuntimeError('expected "fs snap-schedule list" to fail')

    def _make_snap_verifier(self, action, snap_sfx, exec_time, timo):
        """Build a verify() callback for one expected snapshot change.

        The callback requires exactly one changed snapshot. It returns True
        when that name starts with ``scheduled-`` and the 16 characters after
        the prefix equal the first 16 characters of ``snap_sfx``. With
        SNAPSHOT_TS_FORMAT that is the timestamp up to the minute, so seconds
        are ignored. On a match, the elapsed time since ``exec_time`` is
        checked against ``timo`` via check_scheduled_snapshot().

        :param action: word used in the debug log (``added`` or ``removed``)
        """
        def verifier(snaps):
            log.debug(f'snapshots {action}={snaps}')
            self.assertEqual(len(snaps), 1)
            snapname = snaps[0]
            if snapname.startswith('scheduled-'):
                if snapname[10:26] == snap_sfx[:16]:
                    self.check_scheduled_snapshot(exec_time, timo)
                    return True
            return False
        return verifier

    def test_non_existent_snap_schedule_list(self):
        """Test listing snap schedules on a non-existing filesystem path failure"""
        self._expect_schedule_list_enoent(TestSnapSchedules.TEST_DIRECTORY)

    def test_non_existent_schedule(self):
        """Test listing non-existing snap schedules failure"""
        test_dir = TestSnapSchedules.TEST_DIRECTORY
        self.mount_a.run_shell(['mkdir', '-p', test_dir])

        self._expect_schedule_list_enoent(test_dir)

        self.mount_a.run_shell(['rmdir', test_dir])

    def test_snap_schedule_list_post_schedule_remove(self):
        """Test listing snap schedules post removal of a schedule"""
        test_dir = TestSnapSchedules.TEST_DIRECTORY
        self.mount_a.run_shell(['mkdir', '-p', test_dir])

        self.fs_snap_schedule_cmd('add', path=test_dir, snap_schedule='1h')

        self.fs_snap_schedule_cmd('remove', path=test_dir)

        self._expect_schedule_list_enoent(test_dir)

        self.mount_a.run_shell(['rmdir', test_dir])

    def test_snap_schedule(self):
        """Test existence of a scheduled snapshot"""
        test_dir = TestSnapSchedules.TEST_DIRECTORY
        self.mount_a.run_shell(['mkdir', '-p', test_dir])

        # set a schedule on the dir
        self.fs_snap_schedule_cmd('add', path=test_dir, snap_schedule='1M')
        exec_time = time.time()

        timo, snap_sfx = self.calc_wait_time_and_snap_name(exec_time, '1M')
        log.debug(f'expecting snap {test_dir}/.snap/scheduled-{snap_sfx} in ~{timo}s...')
        to_wait = timo + 2  # some leeway to avoid false failures...

        # verify snapshot schedule
        self.verify_schedule(test_dir, ['1M'])

        self.add_snap_create_cbk(
            self._make_snap_verifier('added', snap_sfx, exec_time, timo))
        self.verify(test_dir, to_wait)
        self.assert_if_not_verified()

        # remove snapshot schedule
        self.fs_snap_schedule_cmd('remove', path=test_dir)

        # remove all scheduled snapshots
        self.remove_snapshots(test_dir)

        self.mount_a.run_shell(['rmdir', test_dir])

    def test_multi_snap_schedule(self):
        """Test existence of multiple scheduled snapshots"""
        test_dir = TestSnapSchedules.TEST_DIRECTORY
        self.mount_a.run_shell(['mkdir', '-p', test_dir])

        # set schedules on the dir
        self.fs_snap_schedule_cmd('add', path=test_dir, snap_schedule='1M')
        self.fs_snap_schedule_cmd('add', path=test_dir, snap_schedule='2M')
        exec_time = time.time()

        timo_1, snap_sfx_1 = self.calc_wait_time_and_snap_name(exec_time, '1M')
        log.debug(f'expecting snap {test_dir}/.snap/scheduled-{snap_sfx_1} in ~{timo_1}s...')
        timo_2, snap_sfx_2 = self.calc_wait_time_and_snap_name(exec_time, '2M')
        log.debug(f'expecting snap {test_dir}/.snap/scheduled-{snap_sfx_2} in ~{timo_2}s...')
        to_wait = timo_2 + 2  # use max timeout

        # verify snapshot schedule
        self.verify_schedule(test_dir, ['1M', '2M'])

        self.add_snap_create_cbk(
            self._make_snap_verifier('added', snap_sfx_1, exec_time, timo_1))
        self.add_snap_create_cbk(
            self._make_snap_verifier('added', snap_sfx_2, exec_time, timo_2))
        self.verify(test_dir, to_wait)
        self.assert_if_not_verified()

        # remove snapshot schedule
        self.fs_snap_schedule_cmd('remove', path=test_dir)

        # remove all scheduled snapshots
        self.remove_snapshots(test_dir)

        self.mount_a.run_shell(['rmdir', test_dir])

    def test_snap_schedule_with_retention(self):
        """Test scheduled snapshots along with retention policy"""
        test_dir = TestSnapSchedules.TEST_DIRECTORY
        self.mount_a.run_shell(['mkdir', '-p', test_dir])

        # set a schedule on the dir
        self.fs_snap_schedule_cmd('add', path=test_dir, snap_schedule='1M')
        self.fs_snap_schedule_cmd('retention', 'add', path=test_dir,
                                  retention_spec_or_period='1M')
        exec_time = time.time()

        timo_1, snap_sfx = self.calc_wait_time_and_snap_name(exec_time, '1M')
        log.debug(f'expecting snap {test_dir}/.snap/scheduled-{snap_sfx} in ~{timo_1}s...')
        to_wait = timo_1 + 2  # some leeway to avoid false failures...

        # verify snapshot schedule
        self.verify_schedule(test_dir, ['1M'], retentions=[{'M': 1}])

        self.add_snap_create_cbk(
            self._make_snap_verifier('added', snap_sfx, exec_time, timo_1))
        self.verify(test_dir, to_wait)
        self.assert_if_not_verified()

        timo_2 = timo_1 + 60  # expected snapshot removal timeout
        log.debug(f'expecting removal of snap {test_dir}/.snap/scheduled-{snap_sfx} in ~{timo_2}s...')
        to_wait = timo_2
        self.add_snap_remove_cbk(
            self._make_snap_verifier('removed', snap_sfx, exec_time, timo_2))
        self.verify(test_dir, to_wait + 2)
        self.assert_if_not_verified()

        # remove snapshot schedule
        self.fs_snap_schedule_cmd('remove', path=test_dir)

        # remove all scheduled snapshots
        self.remove_snapshots(test_dir)

        self.mount_a.run_shell(['rmdir', test_dir])

    def get_snap_stats(self, dir_path):
        """Collect snapshot counts for ``dir_path`` from the fs and the module.

        :param dir_path: directory path; its first character is dropped when
                         building the path listed through the mount (callers
                         in this class pass paths starting with ``/``)
        :returns: dict with ``fs_count`` (number of entries listed under
                  ``.snap``) and ``db_count`` (``created_count`` of the first
                  entry in the ``snap-schedule status`` JSON output for the
                  ``1M`` schedule)
        """
        snap_path = f"{dir_path}/.snap"[1:]
        snapshots = self.mount_a.ls(path=snap_path)
        fs_count = len(snapshots)
        log.debug(f'snapshots: {snapshots}')

        result = self.fs_snap_schedule_cmd('status', path=dir_path,
                                           snap_schedule='1M', format='json')
        json_res = json.loads(result)[0]
        db_count = int(json_res['created_count'])
        log.debug(f'json_res: {json_res}')

        return {'fs_count': fs_count, 'db_count': db_count}

    def verify_snap_stats(self, dir_path):
        """Assert that the fs and module snapshot counts for ``dir_path`` agree."""
        snap_stats = self.get_snap_stats(dir_path)
        self.assertEqual(snap_stats['fs_count'], snap_stats['db_count'])

    def test_concurrent_snap_creates(self):
        """Test concurrent snap creates in same file-system without db issues

        Test snap creates at same cadence on same fs to verify correct stats.
        A single SQLite DB Connection handle cannot be used to run concurrent
        transactions and results in transaction aborts. This test makes sure
        that proper care has been taken in the code to avoid such a situation
        by verifying the number of dirs created on the file system against
        the created_count in the schedule_meta table for the specific path.
        """
        self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY])

        testdirs = [
            os.path.join("/", TestSnapSchedules.TEST_DIRECTORY, "dir" + str(d))
            for d in range(10)
        ]

        for d in testdirs:
            # the mount helper is given the path without the leading '/'
            self.mount_a.run_shell(['mkdir', '-p', d[1:]])
            self.fs_snap_schedule_cmd('add', path=d, snap_schedule='1M')

        exec_time = time.time()
        timo_1, _ = self.calc_wait_time_and_snap_name(exec_time, '1M')

        for d in testdirs:
            self.fs_snap_schedule_cmd('activate', path=d, snap_schedule='1M')

        # we wait for 10 snaps to be taken
        wait_time = timo_1 + 10 * 60 + 15
        time.sleep(wait_time)

        for d in testdirs:
            self.fs_snap_schedule_cmd('deactivate', path=d, snap_schedule='1M')

        for d in testdirs:
            self.verify_snap_stats(d)

        for d in testdirs:
            self.fs_snap_schedule_cmd('remove', path=d, snap_schedule='1M')
            self.remove_snapshots(d[1:])
            self.mount_a.run_shell(['rmdir', d[1:]])

    def test_snap_schedule_with_mgr_restart(self):
        """Test that snap schedule is resumed after mgr restart"""
        self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY])
        testdir = os.path.join("/", TestSnapSchedules.TEST_DIRECTORY, "test_restart")
        self.mount_a.run_shell(['mkdir', '-p', testdir[1:]])
        self.fs_snap_schedule_cmd('add', path=testdir, snap_schedule='1M')

        exec_time = time.time()
        timo_1, _ = self.calc_wait_time_and_snap_name(exec_time, '1M')

        self.fs_snap_schedule_cmd('activate', path=testdir, snap_schedule='1M')

        # we wait for 10 snaps to be taken
        wait_time = timo_1 + 10 * 60 + 15
        time.sleep(wait_time)

        old_stats = self.get_snap_stats(testdir)
        self.assertEqual(old_stats['fs_count'], old_stats['db_count'])
        self.assertGreater(old_stats['fs_count'], 9)

        # restart mgr
        # NOTE(review): the restart is done through revive_mgr() on the
        # active mgr; confirm this actually restarts a running daemon.
        active_mgr = self.mgr_cluster.mon_manager.get_mgr_dump()['active_name']
        log.debug(f'restarting active mgr: {active_mgr}')
        self.mgr_cluster.mon_manager.revive_mgr(active_mgr)
        time.sleep(300)  # sleep for 5 minutes
        self.fs_snap_schedule_cmd('deactivate', path=testdir, snap_schedule='1M')

        new_stats = self.get_snap_stats(testdir)
        self.assertEqual(new_stats['fs_count'], new_stats['db_count'])
        self.assertGreater(new_stats['fs_count'], old_stats['fs_count'])
        self.assertGreater(new_stats['db_count'], old_stats['db_count'])

        # cleanup
        self.fs_snap_schedule_cmd('remove', path=testdir, snap_schedule='1M')
        self.remove_snapshots(testdir[1:])
        self.mount_a.run_shell(['rmdir', testdir[1:]])
458
class TestSnapSchedulesSnapdir(TestSnapSchedulesHelper):
    """Snap-schedule test that honours a client-configured snapshot
    directory name instead of assuming ``.snap``."""

    def remove_snapshots(self, dir_path, sdn):
        """Remove (rmdir) every entry listed under ``<dir_path>/<sdn>``.

        :param dir_path: directory whose snapshots are removed
        :param sdn: name of the snapshot directory inside ``dir_path``
        """
        snap_path = f'{dir_path}/{sdn}'

        snapshots = self.mount_a.ls(path=snap_path)
        for snapshot in snapshots:
            snapshot_path = os.path.join(snap_path, snapshot)
            log.debug(f'removing snapshot: {snapshot_path}')
            self.mount_a.run_shell(['rmdir', snapshot_path])

    def get_snap_dir_name(self):
        """Return the snapshot directory name configured for ``mount_a``.

        For a kernel mount the ``snapdirname`` client config value is used.
        For a FUSE mount ``client_snapdir`` is used, and that value is also
        written to the ``client snapdir`` ceph.conf option before the mount
        is remounted. Both default to ``.snap``.

        :raises RuntimeError: if ``mount_a`` is neither a kernel nor a FUSE
                              mount (previously this surfaced as an
                              UnboundLocalError)
        """
        # imported at call time, as in the original code
        from tasks.cephfs.fuse_mount import FuseMount
        from tasks.cephfs.kernel_mount import KernelMount

        if isinstance(self.mount_a, KernelMount):
            sdn = self.mount_a.client_config.get('snapdirname', '.snap')
        elif isinstance(self.mount_a, FuseMount):
            sdn = self.mount_a.client_config.get('client_snapdir', '.snap')
            self.fs.set_ceph_conf('client', 'client snapdir', sdn)
            self.mount_a.remount()
        else:
            raise RuntimeError(
                f'unsupported mount type: {type(self.mount_a).__name__}')
        return sdn

    def test_snap_dir_name(self):
        """Test the correctness of snap directory name"""
        test_dir = TestSnapSchedulesSnapdir.TEST_DIRECTORY
        self.mount_a.run_shell(['mkdir', '-p', test_dir])

        # set a schedule on the dir
        self.fs_snap_schedule_cmd('add', path=test_dir, snap_schedule='1M')
        self.fs_snap_schedule_cmd('retention', 'add', path=test_dir,
                                  retention_spec_or_period='1M')

        timo, snap_sfx = self.calc_wait_time_and_snap_name(time.time(), '1M')
        sdn = self.get_snap_dir_name()
        log.info(f'expecting snap {test_dir}/{sdn}/scheduled-{snap_sfx} in ~{timo}s...')

        # verify snapshot schedule
        self.verify_schedule(test_dir, ['1M'], retentions=[{'M': 1}])

        # remove snapshot schedule
        self.fs_snap_schedule_cmd('remove', path=test_dir)

        # remove all scheduled snapshots
        self.remove_snapshots(test_dir, sdn)

        self.mount_a.run_shell(['rmdir', test_dir])