# ceph/qa/tasks/systemd.py -- teuthology task exercising ceph systemd units
9 from teuthology
.orchestra
import run
10 from teuthology
.misc
import reconnect
, get_first_mon
, wait_until_healthy
12 log
= logging
.getLogger(__name__
)
14 def _remote_service_status(remote
, service
):
15 status
= remote
.sh('sudo systemctl status %s' % service
,
def _check_stop_start(remote, service, kind):
    """Stop *service* on *remote*, verify it went inactive, restart it.

    :param remote: teuthology remote to run commands on
    :param service: full systemd unit name, e.g. ``ceph-mon@host.service``
    :param kind: short daemon label ('osd', 'mon', 'mgr', 'mds') for logs
    """
    remote.run(args=['sudo', 'systemctl', 'status', service])
    remote.run(args=['sudo', 'systemctl', 'stop', service])
    time.sleep(4)  # immediate check will result in deactivating state
    status = _remote_service_status(remote, service)
    # NOTE: use `in`, not str.find() -- find() returns -1 (truthy) on a
    # miss, which made the original check fire backwards.
    if 'Active: inactive' in status:
        log.info("Successfully stopped single %s ceph service", kind)
    else:
        log.info("Failed to stop ceph %s service", kind)
    remote.run(args=['sudo', 'systemctl', 'start', service])


@contextlib.contextmanager
def task(ctx, config):
    """
    Test ceph systemd services can start, stop and restart and
    check for any failed services and report back errors.

    Exercises ceph.target as a whole, then each per-daemon unit
    (osd, mon, mgr, mds) hosted on every remote, then reboots all
    nodes, reconnects, and waits for the cluster to report healthy.

    :param ctx: teuthology run context (provides ctx.cluster)
    :param config: task configuration (forwarded to get_first_mon)
    """
    for remote, roles in ctx.cluster.remotes.items():
        remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                         'grep', 'ceph'])
        units = remote.sh('sudo systemctl list-units | grep ceph',
                          check_status=False)
        log.info(units)
        # str.find() returns -1 (truthy) when absent, so the original
        # `if units.find('failed')` reported failure on healthy nodes.
        if 'failed' in units:
            log.info("Ceph services in failed state")

        # test overall service stop and start using ceph.target
        # ceph.target tests are meant for ceph systemd tests
        # and not actual process testing using 'ps'
        log.info("Stopping all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
        status = _remote_service_status(remote, 'ceph.target')
        log.info("Checking process status")
        ps_eaf = remote.sh('sudo ps -eaf | grep ceph')
        # 'Active: inactive' only ever appears in systemctl output; the
        # original tested the ps listing, which could never match.
        if 'Active: inactive' in status:
            log.info("Successfully stopped all ceph services")
        else:
            log.info("Failed to stop ceph services")

        log.info("Starting all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'start', 'ceph.target'])
        status = _remote_service_status(remote, 'ceph.target')
        if 'Active: active' in status:
            log.info("Successfully started all Ceph services")
        else:
            # original passed "info" as the format string, losing the
            # actual message
            log.info("Failed to start Ceph services")
        ps_eaf = remote.sh('sudo ps -eaf | grep ceph')

        # test individual services start stop
        name = remote.shortname
        mon_name = 'ceph-mon@' + name + '.service'
        mds_name = 'ceph-mds@' + name + '.service'
        mgr_name = 'ceph-mgr@' + name + '.service'
        mon_role_name = 'mon.' + name
        mds_role_name = 'mds.' + name
        mgr_role_name = 'mgr.' + name
        # raw string: '\d' is an invalid escape in a plain literal
        m_osd = re.search(r'--id (\d+) --setuser ceph', ps_eaf)
        if m_osd is not None:  # this node may not host an osd
            osd_service = 'ceph-osd@{m}.service'.format(m=m_osd.group(1))
            _check_stop_start(remote, osd_service, 'osd')
        if mon_role_name in roles:
            _check_stop_start(remote, mon_name, 'mon')
        if mgr_role_name in roles:
            _check_stop_start(remote, mgr_name, 'mgr')
        if mds_role_name in roles:
            _check_stop_start(remote, mds_name, 'mds')

    # reboot all nodes and verify the systemd units restart
    # workunit that runs would fail if any of the systemd unit doesnt start
    ctx.cluster.run(args='sudo reboot', wait=False, check_status=False)
    # avoid immediate reconnect
    time.sleep(30)
    reconnect(ctx, 480)  # reconnect all nodes
    ctx.cluster.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                          'grep', 'ceph'])
    mon = get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon).remotes.keys()
    wait_until_healthy(ctx, mon_remote, use_sudo=True)
    # a @contextlib.contextmanager generator must yield exactly once
    yield