# ceph/qa/tasks/systemd.py -- teuthology task exercising ceph systemd units
9 from teuthology
.orchestra
import run
10 from teuthology
.misc
import reconnect
, get_first_mon
, wait_until_healthy
12 log
= logging
.getLogger(__name__
)
14 def _remote_service_status(remote
, service
):
15 status
= remote
.sh('sudo systemctl status %s' % service
,
def _check_stop_start(remote, service, kind):
    """Stop *service* on *remote*, verify it went inactive, restart it.

    :param remote: teuthology remote to run commands on
    :param service: full systemd unit name, e.g. ``ceph-mon@host.service``
    :param kind: short daemon label ('osd', 'mon', 'mgr', 'mds') for logs
    """
    remote.run(args=['sudo', 'systemctl', 'status', service])
    remote.run(args=['sudo', 'systemctl', 'stop', service])
    time.sleep(4)  # immediate check will result in deactivating state
    status = _remote_service_status(remote, service)
    # NOTE: use `in`, not str.find() -- find() returns -1 (truthy) on a
    # miss, which made the original check fire backwards.
    if 'Active: inactive' in status:
        log.info("Successfully stopped single %s ceph service", kind)
    else:
        log.info("Failed to stop ceph %s service", kind)
    remote.run(args=['sudo', 'systemctl', 'start', service])


@contextlib.contextmanager
def task(ctx, config):
    """
    Test ceph systemd services can start, stop and restart and
    check for any failed services and report back errors.

    Exercises ceph.target as a whole, then each per-daemon unit
    (osd, mon, mgr, mds) hosted on every remote, then reboots all
    nodes, reconnects, and waits for the cluster to report healthy.

    :param ctx: teuthology run context (provides ctx.cluster)
    :param config: task configuration (forwarded to get_first_mon)
    """
    for remote, roles in ctx.cluster.remotes.items():
        remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                         'grep', 'ceph'])
        units = remote.sh('sudo systemctl list-units | grep ceph',
                          check_status=False)
        log.info(units)
        # str.find() returns -1 (truthy) when absent, so the original
        # `if units.find('failed')` reported failure on healthy nodes.
        if 'failed' in units:
            log.info("Ceph services in failed state")

        # test overall service stop and start using ceph.target
        # ceph.target tests are meant for ceph systemd tests
        # and not actual process testing using 'ps'
        log.info("Stopping all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
        status = _remote_service_status(remote, 'ceph.target')
        log.info("Checking process status")
        ps_eaf = remote.sh('sudo ps -eaf | grep ceph')
        # 'Active: inactive' only ever appears in systemctl output; the
        # original tested the ps listing, which could never match.
        if 'Active: inactive' in status:
            log.info("Successfully stopped all ceph services")
        else:
            log.info("Failed to stop ceph services")

        log.info("Starting all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'start', 'ceph.target'])
        status = _remote_service_status(remote, 'ceph.target')
        if 'Active: active' in status:
            log.info("Successfully started all Ceph services")
        else:
            # original passed "info" as the format string, losing the
            # actual message
            log.info("Failed to start Ceph services")
        ps_eaf = remote.sh('sudo ps -eaf | grep ceph')

        # test individual services start stop
        name = remote.shortname
        mon_name = 'ceph-mon@' + name + '.service'
        mds_name = 'ceph-mds@' + name + '.service'
        mgr_name = 'ceph-mgr@' + name + '.service'
        mon_role_name = 'mon.' + name
        mds_role_name = 'mds.' + name
        mgr_role_name = 'mgr.' + name
        # raw string: '\d' is an invalid escape in a plain literal
        m_osd = re.search(r'--id (\d+) --setuser ceph', ps_eaf)
        if m_osd is not None:  # this node may not host an osd
            osd_service = 'ceph-osd@{m}.service'.format(m=m_osd.group(1))
            _check_stop_start(remote, osd_service, 'osd')
        if mon_role_name in roles:
            _check_stop_start(remote, mon_name, 'mon')
        if mgr_role_name in roles:
            _check_stop_start(remote, mgr_name, 'mgr')
        if mds_role_name in roles:
            _check_stop_start(remote, mds_name, 'mds')

    # reboot all nodes and verify the systemd units restart
    # workunit that runs would fail if any of the systemd unit doesnt start
    ctx.cluster.run(args='sudo reboot', wait=False, check_status=False)
    # avoid immediate reconnect
    time.sleep(30)
    reconnect(ctx, 480)  # reconnect all nodes
    ctx.cluster.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                          'grep', 'ceph'])
    mon = get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon).remotes.keys()
    wait_until_healthy(ctx, mon_remote, use_sudo=True)
    # a @contextlib.contextmanager generator must yield exactly once
    yield