# -*- coding: utf-8 -*-

import json
import logging

import rados
from mgr_module import CommandResult
from mgr_util import get_most_recent_rate, get_time_series_rates

from .. import mgr

try:
    from typing import Any, Dict, Optional, Union
except ImportError:
    pass  # For typing only

logger = logging.getLogger('ceph_service')


class SendCommandError(rados.Error):
    def __init__(self, err, prefix, argdict, errno):
        self.prefix = prefix
        self.argdict = argdict
        super(SendCommandError, self).__init__(err, errno)


# pylint: disable=too-many-public-methods
class CephService(object):

    OSD_FLAG_NO_SCRUB = 'noscrub'
    OSD_FLAG_NO_DEEP_SCRUB = 'nodeep-scrub'

    PG_STATUS_SCRUBBING = 'scrubbing'
    PG_STATUS_DEEP_SCRUBBING = 'deep'

    SCRUB_STATUS_DISABLED = 'Disabled'
    SCRUB_STATUS_ACTIVE = 'Active'
    SCRUB_STATUS_INACTIVE = 'Inactive'

    @classmethod
    def get_service_map(cls, service_name):
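        """
        Group all daemons of the given service type by hostname.

        Illustrative shape of the returned dict, derived from the code below
        (hostnames and daemon IDs are examples; the 'metadata' and 'status'
        contents vary by daemon type and Ceph release)::

            {
                'ceph-node-1': {
                    'server': {...},   # entry from mgr.list_servers()
                    'services': [
                        {'id': '0', 'type': 'osd', 'hostname': 'ceph-node-1',
                         'metadata': {...}, 'status': {...}},
                    ]
                }
            }
        """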
        service_map = {}  # type: Dict[str, dict]
        for server in mgr.list_servers():
            for service in server['services']:
                if service['type'] == service_name:
                    if server['hostname'] not in service_map:
                        service_map[server['hostname']] = {
                            'server': server,
                            'services': []
                        }
                    inst_id = service['id']
                    metadata = mgr.get_metadata(service_name, inst_id)
                    status = mgr.get_daemon_status(service_name, inst_id)
                    service_map[server['hostname']]['services'].append({
                        'id': inst_id,
                        'type': service_name,
                        'hostname': server['hostname'],
                        'metadata': metadata,
                        'status': status
                    })
        return service_map

    @classmethod
    def get_service_list(cls, service_name):
        service_map = cls.get_service_map(service_name)
        return [svc for _, svcs in service_map.items() for svc in svcs['services']]

    @classmethod
    def get_service_data_by_metadata_id(cls,
                                        service_type: str,
                                        metadata_id: str) -> Optional[Dict[str, Any]]:
        for server in mgr.list_servers():
            for service in server['services']:
                if service['type'] == service_type:
                    metadata = mgr.get_metadata(service_type, service['id'])
                    if metadata_id == metadata['id']:
                        return {
                            'id': metadata['id'],
                            'service_map_id': str(service['id']),
                            'type': service_type,
                            'hostname': server['hostname'],
                            'metadata': metadata
                        }
        return None

    @classmethod
    def get_service(cls, service_type: str, metadata_id: str) -> Optional[Dict[str, Any]]:
        svc_data = cls.get_service_data_by_metadata_id(service_type, metadata_id)
        if svc_data:
            svc_data['status'] = mgr.get_daemon_status(svc_data['type'], svc_data['service_map_id'])
        return svc_data

    @classmethod
    def get_service_perf_counters(cls, service_type: str, service_id: str) -> Dict[str, Any]:
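        """
        Collect the perf counters of a single daemon.

        Counter-type stats are reported as a rate (see ``get_rate``), all
        other stats as their latest value. Sketch of the returned structure,
        following the code below (counter names, values and units depend on
        the daemon's perf schema)::

            {
                'service': {'type': 'osd', 'id': '0'},
                'counters': [
                    {'name': '...', 'description': '...', 'value': ..., 'unit': '...'},
                    ...
                ]
            }
        """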
        schema_dict = mgr.get_perf_schema(service_type, service_id)
        schema = schema_dict["{}.{}".format(service_type, service_id)]
        counters = []
        for key, value in sorted(schema.items()):
            counter = {'name': str(key), 'description': value['description']}
            # pylint: disable=W0212
            if mgr._stattype_to_str(value['type']) == 'counter':
                counter['value'] = cls.get_rate(
                    service_type, service_id, key)
                counter['unit'] = mgr._unit_to_str(value['units'])
            else:
                counter['value'] = mgr.get_latest(
                    service_type, service_id, key)
                counter['unit'] = ''
            counters.append(counter)

        return {
            'service': {
                'type': service_type,
                'id': str(service_id)
            },
            'counters': counters
        }

    @classmethod
    def get_pool_list(cls, application=None):
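        """
        Return the pools of the OSD map, optionally restricted to pools that
        have the given application (e.g. 'rbd') enabled in their
        ``application_metadata``.
        """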
        osd_map = mgr.get('osd_map')
        if not application:
            return osd_map['pools']
        return [pool for pool in osd_map['pools']
                if application in pool.get('application_metadata', {})]

    @classmethod
    def get_pool_list_with_stats(cls, application=None):
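        """
        Return the pools from ``get_pool_list()``, each augmented with its PG
        status and a per-stat time series.

        Illustrative shape of one ``stats`` entry, following the code below
        ('rd_bytes' is only an example; the stat names come from
        ``mgr.get_updated_pool_stats()``)::

            pool['stats']['rd_bytes'] = {
                'latest': 1024,            # most recent raw value
                'rate': 12.3,              # most recent rate
                'rates': [(ts, rate), ...]
            }
        """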
        # pylint: disable=too-many-locals
        pools = cls.get_pool_list(application)

        pools_w_stats = []

        pg_summary = mgr.get("pg_summary")
        pool_stats = mgr.get_updated_pool_stats()

        for pool in pools:
            pool['pg_status'] = pg_summary['by_pool'][str(pool['pool'])]
            stats = pool_stats[pool['pool']]
            s = {}

            for stat_name, stat_series in stats.items():
                rates = get_time_series_rates(stat_series)
                s[stat_name] = {
                    'latest': stat_series[0][1],
                    'rate': get_most_recent_rate(rates),
                    'rates': rates
                }
            pool['stats'] = s
            pools_w_stats.append(pool)
        return pools_w_stats

    @classmethod
    def get_erasure_code_profiles(cls):
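        """
        Return the erasure-code profiles from the OSD map, with the profile
        name folded into each entry and ``k``/``m`` coerced to integers.

        Illustrative entry (profile name and values are examples only)::

            {'name': 'ecprofile', 'k': 4, 'm': 2, 'plugin': 'jerasure', ...}
        """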
        def _serialize_ecp(name, ecp):
            def serialize_numbers(key):
                value = ecp.get(key)
                if value is not None:
                    ecp[key] = int(value)

            ecp['name'] = name
            serialize_numbers('k')
            serialize_numbers('m')
            return ecp

        ret = []
        for name, ecp in mgr.get('osd_map').get('erasure_code_profiles', {}).items():
            ret.append(_serialize_ecp(name, ecp))
        return ret

    @classmethod
    def get_pool_name_from_id(cls, pool_id):
        # type: (int) -> Union[str, None]
        return mgr.rados.pool_reverse_lookup(pool_id)

    @classmethod
    def get_pool_by_attribute(cls, attribute, value):
        # type: (str, Any) -> Union[dict, None]
        pool_list = cls.get_pool_list()
        for pool in pool_list:
            if attribute in pool and pool[attribute] == value:
                return pool
        return None

    @classmethod
    def get_pool_pg_status(cls, pool_name):
        # type: (str) -> dict
        pool = cls.get_pool_by_attribute('pool_name', pool_name)
        if pool is None:
            return {}
        return mgr.get("pg_summary")['by_pool'][str(pool['pool'])]

    @staticmethod
    def send_command(srv_type, prefix, srv_spec='', **kwargs):
        # type: (str, str, Optional[str], Any) -> Any
        """
        :type prefix: str
        :param srv_type: the service type, e.g. 'mon' or 'osd'
        :param kwargs: will be added to argdict
        :param srv_spec: typically empty, or something like "<fs_id>:0"

        :raises PermissionError: See rados.make_ex
        :raises ObjectNotFound: See rados.make_ex
        :raises IOError: See rados.make_ex
        :raises NoSpace: See rados.make_ex
        :raises ObjectExists: See rados.make_ex
        :raises ObjectBusy: See rados.make_ex
        :raises NoData: See rados.make_ex
        :raises InterruptedOrTimeoutError: See rados.make_ex
        :raises TimedOut: See rados.make_ex
        :raises SendCommandError: if the return code is not zero
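
        Illustrative usage (not part of the original docstring; the output
        depends on the cluster)::

            # Fetch the OSD tree from the monitors as parsed JSON.
            tree = CephService.send_command('mon', 'osd tree')
            # Query an OSD for SMART data; '<devid>' is a placeholder for a
            # real device ID.
            CephService.send_command('osd', 'smart', '0', devid='<devid>')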
        """
        argdict = {
            "prefix": prefix,
            "format": "json",
        }
        argdict.update({k: v for k, v in kwargs.items() if v is not None})
        result = CommandResult("")
        mgr.send_command(result, srv_type, srv_spec, json.dumps(argdict), "")
        r, outb, outs = result.wait()
        if r != 0:
            logger.error("send_command '%s' failed. (r=%s, outs=\"%s\", kwargs=%s)", prefix, r,
                         outs, kwargs)

            raise SendCommandError(outs, prefix, argdict, r)

        try:
            return json.loads(outb or outs)
        except Exception:  # pylint: disable=broad-except
            return outb

    @staticmethod
    def _get_smart_data_by_device(device):
        # type: (dict) -> Dict[str, dict]
        # Check whether the device is associated with daemons.
        if 'daemons' in device and device['daemons']:
            dev_smart_data: Dict[str, Any] = {}

            # Get a list of all OSD daemons on all hosts that are 'up'
            # because SMART data can not be retrieved from daemons that
            # are 'down' or 'destroyed'.
            osd_tree = CephService.send_command('mon', 'osd tree')
            osd_daemons_up = [
                node['name'] for node in osd_tree.get('nodes', {})
                if node.get('status') == 'up'
            ]

            # All daemons on the same host can deliver SMART data,
            # thus it is not relevant for us which daemon we are using.
            # NOTE: the list may contain daemons that are 'down' or 'destroyed'.
            for daemon in device['daemons']:
                svc_type, svc_id = daemon.split('.')
                if 'osd' in svc_type:
                    if daemon not in osd_daemons_up:
                        continue
                    try:
                        dev_smart_data = CephService.send_command(
                            svc_type, 'smart', svc_id, devid=device['devid'])
                    except SendCommandError as error:
                        logger.warning(str(error))
                        # Try to retrieve SMART data from another daemon.
                        continue
                elif 'mon' in svc_type:
                    try:
                        dev_smart_data = CephService.send_command(
                            svc_type, 'device query-daemon-health-metrics', who=daemon)
                    except SendCommandError as error:
                        logger.warning(str(error))
                        # Try to retrieve SMART data from another daemon.
                        continue
                else:
                    dev_smart_data = {}
                for dev_id, dev_data in dev_smart_data.items():
                    if 'error' in dev_data:
                        logger.warning(
                            '[SMART] Error retrieving smartctl data for device ID "%s": %s',
                            dev_id, dev_data)
                break

            return dev_smart_data
        logger.warning('[SMART] No daemons associated with device ID "%s"',
                       device['devid'])
        return {}

    @staticmethod
    def get_devices_by_host(hostname):
        # type: (str) -> dict
        return CephService.send_command('mon',
                                        'device ls-by-host',
                                        host=hostname)

    @staticmethod
    def get_devices_by_daemon(daemon_type, daemon_id):
        # type: (str, str) -> dict
        return CephService.send_command('mon',
                                        'device ls-by-daemon',
                                        who='{}.{}'.format(
                                            daemon_type, daemon_id))

    @staticmethod
    def get_smart_data_by_host(hostname):
        # type: (str) -> dict
        """
        Get the SMART data of all devices on the given host, regardless
        of the daemon (osd, mon, ...).
        :param hostname: The name of the host.
        :return: A dictionary containing the SMART data of every device
          on the given host. The device name is used as the key in the
          dictionary.
        """
        devices = CephService.get_devices_by_host(hostname)
        smart_data = {}  # type: dict
        if devices:
            for device in devices:
                if device['devid'] not in smart_data:
                    smart_data.update(
                        CephService._get_smart_data_by_device(device))
        else:
            logger.debug('[SMART] could not retrieve device list from host %s', hostname)
        return smart_data

    @staticmethod
    def get_smart_data_by_daemon(daemon_type, daemon_id):
        # type: (str, str) -> Dict[str, dict]
        """
        Get the SMART data of the devices associated with the given daemon.
        :param daemon_type: The daemon type, e.g. 'osd' or 'mon'.
        :param daemon_id: The daemon identifier.
        :return: A dictionary containing the SMART data of every device
          associated with the given daemon. The device name is used as the
          key in the dictionary.
        """
        devices = CephService.get_devices_by_daemon(daemon_type, daemon_id)
        smart_data = {}  # type: Dict[str, dict]
        if devices:
            for device in devices:
                if device['devid'] not in smart_data:
                    smart_data.update(
                        CephService._get_smart_data_by_device(device))
        else:
            msg = '[SMART] could not retrieve device list from daemon with type %s and ' +\
                  'with ID %s'
            logger.debug(msg, daemon_type, daemon_id)
        return smart_data

    @classmethod
    def get_rates(cls, svc_type, svc_name, path):
        """
        :return: the derivative of mgr.get_counter()
        :rtype: list[tuple[int, float]]"""
        data = mgr.get_counter(svc_type, svc_name, path)[path]
        return get_time_series_rates(data)

    @classmethod
    def get_rate(cls, svc_type, svc_name, path):
        """returns most recent rate"""
        return get_most_recent_rate(cls.get_rates(svc_type, svc_name, path))
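
    # Illustrative use of the two rate helpers above (not part of the original
    # module). The counter path must exist in the daemon's perf schema;
    # 'osd.op_w' is only an example:
    #
    #   write_ops_per_sec = CephService.get_rate('osd', '0', 'osd.op_w')
    #   rate_history = CephService.get_rates('osd', '0', 'osd.op_w')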

    @classmethod
    def get_client_perf(cls):
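        """
        Sum the client I/O and recovery rates reported in ``osd_pool_stats``
        across all pools. The result always contains the following keys
        (values are zero when the cluster is idle)::

            {'read_bytes_sec': 0, 'read_op_per_sec': 0,
             'write_bytes_sec': 0, 'write_op_per_sec': 0,
             'recovering_bytes_per_sec': 0}
        """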
        pools_stats = mgr.get('osd_pool_stats')['pool_stats']

        io_stats = {
            'read_bytes_sec': 0,
            'read_op_per_sec': 0,
            'write_bytes_sec': 0,
            'write_op_per_sec': 0,
        }
        recovery_stats = {'recovering_bytes_per_sec': 0}

        for pool_stats in pools_stats:
            client_io = pool_stats['client_io_rate']
            for stat in list(io_stats.keys()):
                if stat in client_io:
                    io_stats[stat] += client_io[stat]

            client_recovery = pool_stats['recovery_rate']
            for stat in list(recovery_stats.keys()):
                if stat in client_recovery:
                    recovery_stats[stat] += client_recovery[stat]

        client_perf = io_stats.copy()
        client_perf.update(recovery_stats)

        return client_perf

    @classmethod
    def get_scrub_status(cls):
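        """
        Summarize the cluster-wide scrub state as one of the SCRUB_STATUS_*
        constants: 'Disabled' if the noscrub or nodeep-scrub OSD flag is set,
        'Active' if any PG status group contains 'scrubbing' or 'deep',
        and 'Inactive' otherwise.
        """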
        enabled_flags = mgr.get('osd_map')['flags_set']
        if cls.OSD_FLAG_NO_SCRUB in enabled_flags or cls.OSD_FLAG_NO_DEEP_SCRUB in enabled_flags:
            return cls.SCRUB_STATUS_DISABLED

        grouped_pg_statuses = mgr.get('pg_summary')['all']
        for grouped_pg_status in grouped_pg_statuses.keys():
            if cls.PG_STATUS_SCRUBBING in grouped_pg_status \
                    or cls.PG_STATUS_DEEP_SCRUBBING in grouped_pg_status:
                return cls.SCRUB_STATUS_ACTIVE

        return cls.SCRUB_STATUS_INACTIVE

    @classmethod
    def get_pg_info(cls):
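        """
        Return PG object statistics, the PG status summary and the average
        number of PGs per OSD.

        ``pgs_per_osd`` is the total number of PG instances summed over
        ``pg_summary['by_osd']`` divided by the number of OSDs in that map;
        e.g. 300 PG instances spread over 3 OSDs yield 100.0.
        """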
        pg_summary = mgr.get('pg_summary')
        object_stats = {stat: pg_summary['pg_stats_sum']['stat_sum'][stat] for stat in [
            'num_objects', 'num_object_copies', 'num_objects_degraded',
            'num_objects_misplaced', 'num_objects_unfound']}

        pgs_per_osd = 0.0
        total_osds = len(pg_summary['by_osd'])
        if total_osds > 0:
            total_pgs = 0.0
            for _, osd_pg_statuses in pg_summary['by_osd'].items():
                for _, pg_amount in osd_pg_statuses.items():
                    total_pgs += pg_amount

            pgs_per_osd = total_pgs / total_osds

        return {
            'object_stats': object_stats,
            'statuses': pg_summary['all'],
            'pgs_per_osd': pgs_per_osd,
        }