# -*- coding: utf-8 -*-
from __future__ import absolute_import

import json
import logging

import rados
from mgr_module import CommandResult
from mgr_util import get_most_recent_rate, get_time_series_rates

from .. import mgr
from ..exceptions import DashboardException

try:
    from typing import Any, Dict, Optional, Union
except ImportError:
    pass  # For typing only

logger = logging.getLogger('ceph_service')


class SendCommandError(rados.Error):
    def __init__(self, err, prefix, argdict, errno):
        self.prefix = prefix
        self.argdict = argdict
        super(SendCommandError, self).__init__(err, errno)


# pylint: disable=too-many-public-methods
class CephService(object):

    OSD_FLAG_NO_SCRUB = 'noscrub'
    OSD_FLAG_NO_DEEP_SCRUB = 'nodeep-scrub'

    PG_STATUS_SCRUBBING = 'scrubbing'
    PG_STATUS_DEEP_SCRUBBING = 'deep'

    SCRUB_STATUS_DISABLED = 'Disabled'
    SCRUB_STATUS_ACTIVE = 'Active'
    SCRUB_STATUS_INACTIVE = 'Inactive'

    @classmethod
    def get_service_map(cls, service_name):
        service_map = {}  # type: Dict[str, dict]
        for server in mgr.list_servers():
            for service in server['services']:
                if service['type'] == service_name:
                    if server['hostname'] not in service_map:
                        service_map[server['hostname']] = {
                            'server': server,
                            'services': []
                        }
                    inst_id = service['id']
                    metadata = mgr.get_metadata(service_name, inst_id)
                    status = mgr.get_daemon_status(service_name, inst_id)
                    service_map[server['hostname']]['services'].append({
                        'id': inst_id,
                        'type': service_name,
                        'hostname': server['hostname'],
                        'metadata': metadata,
                        'status': status
                    })
        return service_map

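    # Illustrative sketch (not part of the upstream module): get_service_map('osd')
    # returns a dict keyed by hostname, roughly of the form shown below, where
    # 'myhost' and the service ID '0' are made-up placeholders.
    #
    #   {
    #       'myhost': {
    #           'server': {...},  # the entry from mgr.list_servers()
    #           'services': [
    #               {'id': '0', 'type': 'osd', 'hostname': 'myhost',
    #                'metadata': {...}, 'status': {...}}
    #           ]
    #       }
    #   }
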
    @classmethod
    def get_service_list(cls, service_name):
        service_map = cls.get_service_map(service_name)
        return [svc for _, svcs in service_map.items() for svc in svcs['services']]

    @classmethod
    def get_service_data_by_metadata_id(cls,
                                        service_type: str,
                                        metadata_id: str) -> Optional[Dict[str, Any]]:
        for server in mgr.list_servers():
            for service in server['services']:
                if service['type'] == service_type:
                    metadata = mgr.get_metadata(service_type, service['id'])
                    if metadata_id == metadata['id']:
                        return {
                            'id': metadata['id'],
                            'service_map_id': str(service['id']),
                            'type': service_type,
                            'hostname': server['hostname'],
                            'metadata': metadata
                        }
        return None

    @classmethod
    def get_service(cls, service_type: str, metadata_id: str) -> Optional[Dict[str, Any]]:
        svc_data = cls.get_service_data_by_metadata_id(service_type, metadata_id)
        if svc_data:
            svc_data['status'] = mgr.get_daemon_status(svc_data['type'], svc_data['service_map_id'])
        return svc_data

    @classmethod
    def get_service_perf_counters(cls, service_type: str, service_id: str) -> Dict[str, Any]:
        schema_dict = mgr.get_perf_schema(service_type, service_id)
        schema = schema_dict["{}.{}".format(service_type, service_id)]
        counters = []
        for key, value in sorted(schema.items()):
            counter = {'name': str(key), 'description': value['description']}
            # pylint: disable=W0212
            if mgr._stattype_to_str(value['type']) == 'counter':
                counter['value'] = cls.get_rate(
                    service_type, service_id, key)
                counter['unit'] = mgr._unit_to_str(value['units'])
            else:
                counter['value'] = mgr.get_latest(
                    service_type, service_id, key)
                counter['unit'] = ''
            counters.append(counter)

        return {
            'service': {
                'type': service_type,
                'id': str(service_id)
            },
            'counters': counters
        }

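    # Illustrative sketch (assumption: an OSD with ID '0' exists; not part of
    # the upstream module):
    #
    #   perf = CephService.get_service_perf_counters('osd', '0')
    #   # perf == {'service': {'type': 'osd', 'id': '0'},
    #   #          'counters': [{'name': ..., 'description': ...,
    #   #                        'value': ..., 'unit': ...}, ...]}
    #
    # Counters whose perf schema type is 'counter' report a rate (via
    # get_rate()); all others report the latest raw value with an empty unit.
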
    @classmethod
    def get_pool_list(cls, application=None):
        osd_map = mgr.get('osd_map')
        if not application:
            return osd_map['pools']
        return [pool for pool in osd_map['pools']
                if application in pool.get('application_metadata', {})]

    @classmethod
    def get_pool_list_with_stats(cls, application=None):
        # pylint: disable=too-many-locals
        pools = cls.get_pool_list(application)

        pools_w_stats = []

        pg_summary = mgr.get("pg_summary")
        pool_stats = mgr.get_updated_pool_stats()

        for pool in pools:
            pool['pg_status'] = pg_summary['by_pool'][str(pool['pool'])]
            stats = pool_stats[pool['pool']]
            s = {}

            for stat_name, stat_series in stats.items():
                rates = get_time_series_rates(stat_series)
                s[stat_name] = {
                    'latest': stat_series[0][1],
                    'rate': get_most_recent_rate(rates),
                    'rates': rates
                }
            pool['stats'] = s
            pools_w_stats.append(pool)
        return pools_w_stats

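    # Illustrative sketch (not part of the upstream module): each pool returned
    # by get_pool_list_with_stats() is the plain osd_map pool dict extended with
    # 'pg_status' (the per-pool entry of pg_summary, e.g. {'active+clean': 32})
    # and 'stats', a per-counter dict such as
    #
    #   {'rd': {'latest': ..., 'rate': ..., 'rates': [(t1, r1), (t2, r2), ...]}}
    #
    # where 'rd', the PG state string and the count are placeholder examples.
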
    @classmethod
    def get_erasure_code_profiles(cls):
        def _serialize_ecp(name, ecp):
            def serialize_numbers(key):
                value = ecp.get(key)
                if value is not None:
                    ecp[key] = int(value)

            ecp['name'] = name
            serialize_numbers('k')
            serialize_numbers('m')
            return ecp

        ret = []
        for name, ecp in mgr.get('osd_map').get('erasure_code_profiles', {}).items():
            ret.append(_serialize_ecp(name, ecp))
        return ret

    @classmethod
    def get_pool_name_from_id(cls, pool_id):
        # type: (int) -> Union[str, None]
        return mgr.rados.pool_reverse_lookup(pool_id)

    @classmethod
    def get_pool_by_attribute(cls, attribute, value):
        # type: (str, Any) -> Union[dict, None]
        pool_list = cls.get_pool_list()
        for pool in pool_list:
            if attribute in pool and pool[attribute] == value:
                return pool
        return None

    @classmethod
    def get_pool_pg_status(cls, pool_name):
        # type: (str) -> dict
        pool = cls.get_pool_by_attribute('pool_name', pool_name)
        if pool is None:
            return {}
        return mgr.get("pg_summary")['by_pool'][str(pool['pool'])]

    @staticmethod
    def send_command(srv_type, prefix, srv_spec='', **kwargs):
        # type: (str, str, Optional[str], Any) -> Any
        """
        :type prefix: str
        :param srv_type: mon |
        :param kwargs: will be added to argdict
        :param srv_spec: typically empty, or something like "<fs_id>:0"

        :raises PermissionError: See rados.make_ex
        :raises ObjectNotFound: See rados.make_ex
        :raises IOError: See rados.make_ex
        :raises NoSpace: See rados.make_ex
        :raises ObjectExists: See rados.make_ex
        :raises ObjectBusy: See rados.make_ex
        :raises NoData: See rados.make_ex
        :raises InterruptedOrTimeoutError: See rados.make_ex
        :raises TimedOut: See rados.make_ex
        :raises SendCommandError: if the return code of the command is non-zero
        """
        argdict = {
            "prefix": prefix,
            "format": "json",
        }
        argdict.update({k: v for k, v in kwargs.items() if v is not None})
        result = CommandResult("")
        mgr.send_command(result, srv_type, srv_spec, json.dumps(argdict), "")
        r, outb, outs = result.wait()
        if r != 0:
            logger.error("send_command '%s' failed. (r=%s, outs=\"%s\", kwargs=%s)", prefix, r,
                         outs, kwargs)

            raise SendCommandError(outs, prefix, argdict, r)

        try:
            return json.loads(outb or outs)
        except Exception:  # pylint: disable=broad-except
            return outb

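    # Illustrative usage of send_command() (not part of the upstream module):
    # keyword arguments are merged into the JSON argdict (None values are
    # dropped), so the two calls below issue
    # {"prefix": "osd tree", "format": "json"} and
    # {"prefix": "device ls-by-host", "format": "json", "host": "myhost"}
    # against the monitors; 'myhost' is a made-up hostname.
    #
    #   osd_tree = CephService.send_command('mon', 'osd tree')
    #   devices = CephService.send_command('mon', 'device ls-by-host', host='myhost')
    #
    # A non-zero return code raises SendCommandError; otherwise the JSON output
    # is parsed and returned.
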
    @staticmethod
    def _get_smart_data_by_device(device):
        # type: (dict) -> Dict[str, dict]
        # Check whether the device is associated with daemons.
        if 'daemons' in device and device['daemons']:
            dev_smart_data = None

            # The daemons associated with the device. Note, the list may
            # contain daemons that are 'down' or 'destroyed'.
            daemons = device.get('daemons')

            # Get a list of all OSD daemons on all hosts that are 'up',
            # because SMART data cannot be retrieved from daemons that
            # are 'down' or 'destroyed'.
            osd_tree = CephService.send_command('mon', 'osd tree')
            osd_daemons_up = [
                node['name'] for node in osd_tree.get('nodes', {})
                if node.get('status') == 'up'
            ]

            # Finally, get the daemons on the host of the given device
            # that are 'up'. All daemons on the same host can deliver
            # SMART data, thus it is not relevant for us which daemon
            # we are using.
            daemons = list(set(daemons) & set(osd_daemons_up))  # type: ignore

            for daemon in daemons:
                svc_type, svc_id = daemon.split('.')
                if 'osd' in svc_type:
                    try:
                        dev_smart_data = CephService.send_command(
                            svc_type, 'smart', svc_id, devid=device['devid'])
                    except SendCommandError:
                        # Try to retrieve SMART data from another daemon.
                        continue
                else:
                    try:
                        dev_smart_data = CephService.send_command(
                            svc_type, 'device get-health-metrics', svc_id, devid=device['devid'])
                    except SendCommandError:
                        # Try to retrieve SMART data from another daemon.
                        continue
                for dev_id, dev_data in dev_smart_data.items():
                    if 'error' in dev_data:
                        logger.warning(
                            '[SMART] Error retrieving smartctl data for device ID "%s": %s',
                            dev_id, dev_data)
                break
            if dev_smart_data is None:
                raise DashboardException(
                    'Failed to retrieve SMART data for device ID "{}"'.format(
                        device['devid']))
            return dev_smart_data
        logger.warning('[SMART] No daemons associated with device ID "%s"',
                       device['devid'])
        return {}

    @staticmethod
    def get_devices_by_host(hostname):
        # type: (str) -> dict
        return CephService.send_command('mon',
                                        'device ls-by-host',
                                        host=hostname)

    @staticmethod
    def get_devices_by_daemon(daemon_type, daemon_id):
        # type: (str, str) -> dict
        return CephService.send_command('mon',
                                        'device ls-by-daemon',
                                        who='{}.{}'.format(
                                            daemon_type, daemon_id))

    @staticmethod
    def get_smart_data_by_host(hostname):
        # type: (str) -> dict
        """
        Get the SMART data of all devices on the given host, regardless
        of the daemon (osd, mon, ...).
        :param hostname: The name of the host.
        :return: A dictionary containing the SMART data of every device
            on the given host. The device name is used as the key in the
            dictionary.
        """
        devices = CephService.get_devices_by_host(hostname)
        smart_data = {}  # type: dict
        if devices:
            for device in devices:
                if device['devid'] not in smart_data:
                    smart_data.update(
                        CephService._get_smart_data_by_device(device))
        else:
            logger.debug('[SMART] could not retrieve device list from host %s', hostname)
        return smart_data

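    # Illustrative usage (assumption: a host named 'myhost' is part of the
    # cluster; not part of the upstream module):
    #
    #   smart = CephService.get_smart_data_by_host('myhost')
    #   # smart maps each device ID to the SMART/health-metrics output that
    #   # was collected through an 'up' daemon on that host.
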
    @staticmethod
    def get_smart_data_by_daemon(daemon_type, daemon_id):
        # type: (str, str) -> Dict[str, dict]
        """
        Get the SMART data of the devices associated with the given daemon.
        :param daemon_type: The daemon type, e.g. 'osd' or 'mon'.
        :param daemon_id: The daemon identifier.
        :return: A dictionary containing the SMART data of every device
            associated with the given daemon. The device name is used as the
            key in the dictionary.
        """
        devices = CephService.get_devices_by_daemon(daemon_type, daemon_id)
        smart_data = {}  # type: Dict[str, dict]
        if devices:
            for device in devices:
                if device['devid'] not in smart_data:
                    smart_data.update(
                        CephService._get_smart_data_by_device(device))
        else:
            msg = '[SMART] could not retrieve device list from daemon with type %s and ' \
                  'with ID %s'
            logger.debug(msg, daemon_type, daemon_id)
        return smart_data

    @classmethod
    def get_rates(cls, svc_type, svc_name, path):
        """
        :return: the derivative of mgr.get_counter()
        :rtype: list[tuple[int, float]]
        """
        data = mgr.get_counter(svc_type, svc_name, path)[path]
        return get_time_series_rates(data)

    @classmethod
    def get_rate(cls, svc_type, svc_name, path):
        """Returns the most recent rate."""
        return get_most_recent_rate(cls.get_rates(svc_type, svc_name, path))

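    # Illustrative sketch (not part of the upstream module): get_rates() turns a
    # perf counter time series into per-interval rates and get_rate() picks the
    # most recent one. 'osd.op_w' below is a placeholder counter path.
    #
    #   rates = CephService.get_rates('osd', '0', 'osd.op_w')  # [(t, rate), ...]
    #   latest = CephService.get_rate('osd', '0', 'osd.op_w')  # most recent rate
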
    @classmethod
    def get_client_perf(cls):
        pools_stats = mgr.get('osd_pool_stats')['pool_stats']

        io_stats = {
            'read_bytes_sec': 0,
            'read_op_per_sec': 0,
            'write_bytes_sec': 0,
            'write_op_per_sec': 0,
        }
        recovery_stats = {'recovering_bytes_per_sec': 0}

        for pool_stats in pools_stats:
            client_io = pool_stats['client_io_rate']
            for stat in list(io_stats.keys()):
                if stat in client_io:
                    io_stats[stat] += client_io[stat]

            client_recovery = pool_stats['recovery_rate']
            for stat in list(recovery_stats.keys()):
                if stat in client_recovery:
                    recovery_stats[stat] += client_recovery[stat]

        client_perf = io_stats.copy()
        client_perf.update(recovery_stats)

        return client_perf

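    # Illustrative sketch (not part of the upstream module): the dict returned
    # by get_client_perf() sums the per-pool 'client_io_rate' and
    # 'recovery_rate' entries of 'osd_pool_stats', e.g.
    #
    #   {'read_bytes_sec': 0, 'read_op_per_sec': 0, 'write_bytes_sec': 0,
    #    'write_op_per_sec': 0, 'recovering_bytes_per_sec': 0}
    #
    # on an idle cluster.
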
    @classmethod
    def get_scrub_status(cls):
        enabled_flags = mgr.get('osd_map')['flags_set']
        if cls.OSD_FLAG_NO_SCRUB in enabled_flags or cls.OSD_FLAG_NO_DEEP_SCRUB in enabled_flags:
            return cls.SCRUB_STATUS_DISABLED

        grouped_pg_statuses = mgr.get('pg_summary')['all']
        for grouped_pg_status in grouped_pg_statuses.keys():
            if len(grouped_pg_status.split(cls.PG_STATUS_SCRUBBING)) > 1 \
                    or len(grouped_pg_status.split(cls.PG_STATUS_DEEP_SCRUBBING)) > 1:
                return cls.SCRUB_STATUS_ACTIVE

        return cls.SCRUB_STATUS_INACTIVE

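    # Illustrative sketch (not part of the upstream module):
    #
    #   CephService.get_scrub_status()  # -> 'Disabled' | 'Active' | 'Inactive'
    #
    # 'Disabled' is returned while the 'noscrub' or 'nodeep-scrub' OSD flag is
    # set; 'Active' if any PG state string in pg_summary['all'] contains
    # 'scrubbing' or 'deep' (e.g. 'active+clean+scrubbing+deep'); otherwise
    # 'Inactive'.
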
    @classmethod
    def get_pg_info(cls):
        pg_summary = mgr.get('pg_summary')
        object_stats = {stat: pg_summary['pg_stats_sum']['stat_sum'][stat] for stat in [
            'num_objects', 'num_object_copies', 'num_objects_degraded',
            'num_objects_misplaced', 'num_objects_unfound']}

        pgs_per_osd = 0.0
        total_osds = len(pg_summary['by_osd'])
        if total_osds > 0:
            total_pgs = 0.0
            for _, osd_pg_statuses in pg_summary['by_osd'].items():
                for _, pg_amount in osd_pg_statuses.items():
                    total_pgs += pg_amount

            pgs_per_osd = total_pgs / total_osds

        return {
            'object_stats': object_stats,
            'statuses': pg_summary['all'],
            'pgs_per_osd': pgs_per_osd,
        }
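
# Illustrative sketch (not part of the upstream module): 'pgs_per_osd' in
# get_pg_info() is the total number of PG instances reported in
# pg_summary['by_osd'] divided by the number of OSDs. For example, with two
# OSDs whose PG status dicts are {'active+clean': 100} and
# {'active+clean': 150}:
#
#   pgs_per_osd = (100 + 150) / 2  # == 125.0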