]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | # -*- coding: utf-8 -*- |
2 | from __future__ import absolute_import | |
f67539c2 | 3 | |
11fdf7f2 | 4 | import json |
9f95a23c | 5 | import logging |
11fdf7f2 TL |
6 | |
7 | import rados | |
11fdf7f2 | 8 | from mgr_module import CommandResult |
f67539c2 | 9 | from mgr_util import get_most_recent_rate, get_time_series_rates |
11fdf7f2 | 10 | |
9f95a23c TL |
11 | from .. import mgr |
12 | from ..exceptions import DashboardException | |
11fdf7f2 | 13 | |
81eedcae | 14 | try: |
f67539c2 | 15 | from typing import Any, Dict, Optional, Union |
81eedcae TL |
16 | except ImportError: |
17 | pass # For typing only | |
18 | ||
9f95a23c TL |
19 | logger = logging.getLogger('ceph_service') |
20 | ||
11fdf7f2 TL |
21 | |
class SendCommandError(rados.Error):
    """Error raised by ``CephService.send_command`` for a failed command.

    In addition to the message and error number handled by the base
    class, the instance keeps the command prefix and the argument
    dictionary that were sent.
    """

    def __init__(self, err, prefix, argdict, errno):
        # Store the command context before delegating to the base class.
        self.prefix = prefix
        self.argdict = argdict
        super().__init__(err, errno)
27 | ||
28 | ||
522d829b | 29 | # pylint: disable=too-many-public-methods |
11fdf7f2 TL |
30 | class CephService(object): |
31 | ||
32 | OSD_FLAG_NO_SCRUB = 'noscrub' | |
33 | OSD_FLAG_NO_DEEP_SCRUB = 'nodeep-scrub' | |
34 | ||
35 | PG_STATUS_SCRUBBING = 'scrubbing' | |
36 | PG_STATUS_DEEP_SCRUBBING = 'deep' | |
37 | ||
38 | SCRUB_STATUS_DISABLED = 'Disabled' | |
39 | SCRUB_STATUS_ACTIVE = 'Active' | |
40 | SCRUB_STATUS_INACTIVE = 'Inactive' | |
41 | ||
@classmethod
def get_service_map(cls, service_name):
    """Group the daemons of one service type by the host they run on.

    :param service_name: The service type to select; compared against the
        ``type`` field of every service reported by ``mgr.list_servers()``.
    :return: A dictionary keyed by hostname. Each value contains the
        server entry (``server``) and a list (``services``) with one
        dictionary per matching daemon holding its id, type, hostname,
        metadata and status.
    """
    hosts = {}  # type: Dict[str, dict]
    for server in mgr.list_servers():
        for service in server['services']:
            if service['type'] != service_name:
                continue
            # Create the per-host entry on first use only.
            host_entry = hosts.setdefault(server['hostname'], {
                'server': server,
                'services': []
            })
            daemon_id = service['id']
            daemon_metadata = mgr.get_metadata(service_name, daemon_id)
            daemon_status = mgr.get_daemon_status(service_name, daemon_id)
            host_entry['services'].append({
                'id': daemon_id,
                'type': service_name,
                'hostname': server['hostname'],
                'metadata': daemon_metadata,
                'status': daemon_status
            })
    return hosts
64 | ||
@classmethod
def get_service_list(cls, service_name):
    """Return all daemons of the given service type as one flat list.

    The per-host grouping produced by ``get_service_map`` is dropped; the
    daemon dictionaries themselves are passed through unchanged.
    """
    daemons = []
    for host_entry in cls.get_service_map(service_name).values():
        daemons.extend(host_entry['services'])
    return daemons
69 | ||
@classmethod
def get_service_data_by_metadata_id(cls,
                                    service_type: str,
                                    metadata_id: str) -> Optional[Dict[str, Any]]:
    """Find the daemon of a service type whose metadata ``id`` matches.

    :param service_type: The service type to search, compared against the
        ``type`` field of each service reported by ``mgr.list_servers()``.
    :param metadata_id: The value expected in the ``id`` field of the
        daemon's metadata.
    :return: A dictionary with the metadata id (``id``), the service map
        id as a string (``service_map_id``), the type, the hostname and
        the metadata of the first matching daemon, or ``None`` if no
        daemon matches.
    """
    for server in mgr.list_servers():
        for service in server['services']:
            if service['type'] != service_type:
                continue
            metadata = mgr.get_metadata(service_type, service['id'])
            # The manager may have no metadata for a daemon (e.g. while it
            # is being added or removed). Such a daemon cannot match, so
            # skip it instead of failing on the subscript below.
            if not metadata or 'id' not in metadata:
                continue
            if metadata_id == metadata['id']:
                return {
                    'id': metadata['id'],
                    'service_map_id': str(service['id']),
                    'type': service_type,
                    'hostname': server['hostname'],
                    'metadata': metadata
                }
    return None
87 | ||
f67539c2 TL |
@classmethod
def get_service(cls, service_type: str, metadata_id: str) -> Optional[Dict[str, Any]]:
    """Look up a daemon by its metadata id and attach its current status.

    :return: The dictionary produced by ``get_service_data_by_metadata_id``
        extended with a ``status`` entry, or that method's empty result
        unchanged when no daemon was found.
    """
    svc_data = cls.get_service_data_by_metadata_id(service_type, metadata_id)
    if not svc_data:
        return svc_data
    svc_data['status'] = mgr.get_daemon_status(svc_data['type'], svc_data['service_map_id'])
    return svc_data
94 | ||
522d829b TL |
@classmethod
def get_service_perf_counters(cls, service_type: str, service_id: str) -> Dict[str, Any]:
    """Collect the performance counters of a single daemon.

    :return: A dictionary with the daemon identification (``service``) and
        a list (``counters``), sorted by counter name, of dictionaries with
        the counter's name, description, value and unit. Counters whose
        type is ``counter`` report their most recent rate; all others
        report their latest value with an empty unit.
    """
    perf_schema = mgr.get_perf_schema(service_type, service_id)
    daemon_schema = perf_schema["{}.{}".format(service_type, service_id)]

    counters = []
    for name, spec in sorted(daemon_schema.items()):
        description = spec['description']
        # pylint: disable=W0212
        is_rate_counter = mgr._stattype_to_str(spec['type']) == 'counter'
        if is_rate_counter:
            value = cls.get_rate(service_type, service_id, name)
            unit = mgr._unit_to_str(spec['units'])
        else:
            value = mgr.get_latest(service_type, service_id, name)
            unit = ''
        counters.append({
            'name': str(name),
            'description': description,
            'value': value,
            'unit': unit
        })

    service_info = {'type': service_type, 'id': str(service_id)}
    return {'service': service_info, 'counters': counters}
120 | ||
11fdf7f2 TL |
@classmethod
def get_pool_list(cls, application=None):
    """Return the pools of the OSD map, optionally filtered by application.

    :param application: If given (and truthy), only pools whose
        ``application_metadata`` contains this application are returned.
    :return: The list of pool dictionaries from the OSD map.
    """
    all_pools = mgr.get('osd_map')['pools']
    if application:
        return [pool for pool in all_pools
                if application in pool.get('application_metadata', {})]
    return all_pools
128 | ||
@classmethod
def get_pool_list_with_stats(cls, application=None):
    """Return the pool list enriched with PG status and statistics.

    Every pool dictionary obtained from ``get_pool_list`` is extended in
    place with ``pg_status`` (the PG summary of that pool) and ``stats``
    (for each statistic: ``latest``, taken from the first entry of its
    time series, the most recent ``rate`` and all ``rates``).

    :param application: Passed through to ``get_pool_list``.
    """
    def _summarize(series):
        # Derive the rates once and reuse them for both fields.
        rates = get_time_series_rates(series)
        return {
            'latest': series[0][1],
            'rate': get_most_recent_rate(rates),
            'rates': rates
        }

    pools = cls.get_pool_list(application)
    pg_summary = mgr.get("pg_summary")
    pool_stats = mgr.get_updated_pool_stats()

    enriched = []
    for pool in pools:
        pool_id = pool['pool']
        # The PG summary is keyed by the pool id as a string.
        pool['pg_status'] = pg_summary['by_pool'][str(pool_id)]
        pool['stats'] = {
            stat_name: _summarize(stat_series)
            for stat_name, stat_series in pool_stats[pool_id].items()
        }
        enriched.append(pool)
    return enriched
154 | ||
9f95a23c TL |
@classmethod
def get_erasure_code_profiles(cls):
    """Return the erasure code profiles of the OSD map as a list.

    Each profile dictionary is updated in place: its name is stored under
    ``name`` and the values of ``k`` and ``m``, when present, are converted
    to integers.
    """
    profiles = mgr.get('osd_map').get('erasure_code_profiles', {})
    serialized = []
    for profile_name, profile in profiles.items():
        profile['name'] = profile_name
        for numeric_key in ('k', 'm'):
            raw_value = profile.get(numeric_key)
            if raw_value is not None:
                profile[numeric_key] = int(raw_value)
        serialized.append(profile)
    return serialized
172 | ||
11fdf7f2 TL |
@classmethod
def get_pool_name_from_id(cls, pool_id):
    # type: (int) -> Union[str, None]
    """Resolve a pool ID to its name using the manager's rados handle."""
    pool_name = mgr.rados.pool_reverse_lookup(pool_id)
    return pool_name
e306af50 TL |
177 | |
@classmethod
def get_pool_by_attribute(cls, attribute, value):
    # type: (str, Any) -> Union[dict, None]
    """Return the first pool whose ``attribute`` equals ``value``.

    Pools that do not have the attribute at all are ignored.

    :return: The matching pool dictionary, or ``None`` if there is none.
    """
    matching_pools = (
        pool for pool in cls.get_pool_list()
        if attribute in pool and pool[attribute] == value
    )
    return next(matching_pools, None)
186 | ||
e306af50 TL |
@classmethod
def get_pool_pg_status(cls, pool_name):
    # type: (str) -> dict
    """Return the PG summary of the pool with the given name.

    :return: The ``by_pool`` entry of the PG summary for that pool, or an
        empty dictionary if no pool has this name.
    """
    pool = cls.get_pool_by_attribute('pool_name', pool_name)
    if pool is not None:
        # The PG summary is keyed by the pool id as a string.
        return mgr.get("pg_summary")['by_pool'][str(pool['pool'])]
    return {}
194 | ||
f67539c2 TL |
@staticmethod
def send_command(srv_type, prefix, srv_spec='', **kwargs):
    # type: (str, str, Optional[str], Any) -> Any
    """
    Send a command through the manager and wait for its result.

    :param srv_type: the service type the command is sent to, e.g. 'mon'
    :type prefix: str
    :param prefix: the command prefix
    :param srv_spec: typically empty. or something like "<fs_id>:0"
    :param kwargs: will be added to argdict; entries whose value is
        None are left out

    :return: the JSON-decoded command output (the output buffer, or the
        status string if the buffer is empty); if decoding fails, the
        output buffer is returned as it is
    :raises SendCommandError: return code != 0
    """
    argdict = {
        "prefix": prefix,
        "format": "json",
    }
    for arg_name, arg_value in kwargs.items():
        if arg_value is not None:
            argdict[arg_name] = arg_value

    result = CommandResult("")
    mgr.send_command(result, srv_type, srv_spec, json.dumps(argdict), "")
    retcode, outb, outs = result.wait()

    if retcode != 0:
        logger.error("send_command '%s' failed. (r=%s, outs=\"%s\", kwargs=%s)", prefix,
                     retcode, outs, kwargs)
        raise SendCommandError(outs, prefix, argdict, retcode)

    try:
        return json.loads(outb or outs)
    except Exception:  # pylint: disable=broad-except
        # Not JSON: hand back the raw output buffer.
        return outb
233 | ||
@staticmethod
def _get_smart_data_by_device(device):
    # type: (dict) -> Dict[str, dict]
    """Retrieve the SMART data of one device from a daemon that is 'up'.

    :param device: A device dictionary providing ``devid`` and, optionally,
        the list of associated ``daemons``.
    :return: The SMART data returned by the first daemon that answers, or
        an empty dictionary if no daemons are associated with the device.
    :raises DashboardException: if no SMART data could be retrieved from
        any of the candidate daemons.
    """
    # Check whether the device is associated with daemons.
    if not ('daemons' in device and device['daemons']):
        logger.warning('[SMART] No daemons associated with device ID "%s"',
                       device['devid'])
        return {}

    # The daemons associated with the device. Note, the list may
    # contain daemons that are 'down' or 'destroyed'.
    associated = device.get('daemons')

    # SMART data can not be retrieved from daemons that are 'down' or
    # 'destroyed', so collect the names of all OSD tree nodes that are 'up'.
    osd_tree = CephService.send_command('mon', 'osd tree')
    osd_daemons_up = [
        node['name'] for node in osd_tree.get('nodes', {})
        if node.get('status') == 'up'
    ]

    # Keep only the associated daemons that are 'up'. All daemons on the
    # same host can deliver SMART data, thus it is not relevant for us
    # which daemon we are using.
    candidates = list(set(associated) & set(osd_daemons_up))  # type: ignore

    dev_smart_data = None
    for daemon in candidates:
        svc_type, svc_id = daemon.split('.')
        try:
            if 'osd' in svc_type:
                dev_smart_data = CephService.send_command(
                    svc_type, 'smart', svc_id, devid=device['devid'])
            elif 'mon' in svc_type:
                dev_smart_data = CephService.send_command(
                    svc_type, 'device query-daemon-health-metrics', who=daemon)
            else:
                dev_smart_data = {}
        except SendCommandError:
            # Try to retrieve SMART data from another daemon.
            continue
        # Report per-device errors contained in the answer, then stop:
        # one answering daemon is enough.
        for dev_id, dev_data in dev_smart_data.items():
            if 'error' in dev_data:
                logger.warning(
                    '[SMART] Error retrieving smartctl data for device ID "%s": %s',
                    dev_id, dev_data)
        break

    if dev_smart_data is None:
        raise DashboardException(
            'Failed to retrieve SMART data for device ID "{}"'.format(
                device['devid']))
    return dev_smart_data
292 | ||
@staticmethod
def get_devices_by_host(hostname):
    # type: (str) -> dict
    """Query the monitors ('device ls-by-host') for the devices of a host."""
    command_args = {'host': hostname}
    return CephService.send_command('mon', 'device ls-by-host', **command_args)
299 | ||
@staticmethod
def get_devices_by_daemon(daemon_type, daemon_id):
    # type: (str, str) -> dict
    """Query the monitors ('device ls-by-daemon') for a daemon's devices.

    The daemon is addressed as ``<daemon_type>.<daemon_id>``.
    """
    daemon_name = '{}.{}'.format(daemon_type, daemon_id)
    return CephService.send_command('mon', 'device ls-by-daemon', who=daemon_name)
307 | ||
@staticmethod
def get_smart_data_by_host(hostname):
    # type: (str) -> dict
    """
    Get the SMART data of all devices on the given host, regardless
    of the daemon (osd, mon, ...).
    :param hostname: The name of the host.
    :return: A dictionary containing the SMART data of every device
        on the given host. The device name is used as the key in the
        dictionary. The dictionary is empty if no device list could
        be retrieved.
    """
    smart_data = {}  # type: dict
    devices = CephService.get_devices_by_host(hostname)
    if not devices:
        logger.debug('[SMART] could not retrieve device list from host %s', hostname)
        return smart_data

    for device in devices:
        # Skip devices whose data has already been collected.
        if device['devid'] in smart_data:
            continue
        smart_data.update(CephService._get_smart_data_by_device(device))
    return smart_data
329 | ||
@staticmethod
def get_smart_data_by_daemon(daemon_type, daemon_id):
    # type: (str, str) -> Dict[str, dict]
    """
    Get the SMART data of the devices associated with the given daemon.
    :param daemon_type: The daemon type, e.g. 'osd' or 'mon'.
    :param daemon_id: The daemon identifier.
    :return: A dictionary containing the SMART data of every device
        associated with the given daemon. The device name is used as the
        key in the dictionary. The dictionary is empty if no device list
        could be retrieved.
    """
    smart_data = {}  # type: Dict[str, dict]
    devices = CephService.get_devices_by_daemon(daemon_type, daemon_id)
    if not devices:
        msg = ('[SMART] could not retrieve device list from daemon with type %s and '
               'with ID %s')
        logger.debug(msg, daemon_type, daemon_id)
        return smart_data

    for device in devices:
        # Skip devices whose data has already been collected.
        if device['devid'] in smart_data:
            continue
        smart_data.update(CephService._get_smart_data_by_device(device))
    return smart_data
11fdf7f2 TL |
353 | |
@classmethod
def get_rates(cls, svc_type, svc_name, path):
    """
    Fetch the time series of one counter and convert it into rates.

    :return: the derivative of mgr.get_counter()
    :rtype: list[tuple[int, float]]"""
    counters = mgr.get_counter(svc_type, svc_name, path)
    return get_time_series_rates(counters[path])
11fdf7f2 TL |
361 | |
@classmethod
def get_rate(cls, svc_type, svc_name, path):
    """Return the most recent rate of the given counter."""
    rates = cls.get_rates(svc_type, svc_name, path)
    return get_most_recent_rate(rates)
11fdf7f2 TL |
366 | |
@classmethod
def get_client_perf(cls):
    """Sum the client I/O and recovery rates over all pools.

    :return: A dictionary with the totals of ``read_bytes_sec``,
        ``read_op_per_sec``, ``write_bytes_sec``, ``write_op_per_sec`` and
        ``recovering_bytes_per_sec``. A rate that a pool does not report
        contributes nothing to its total.
    """
    io_keys = (
        'read_bytes_sec',
        'read_op_per_sec',
        'write_bytes_sec',
        'write_op_per_sec',
    )
    recovery_keys = ('recovering_bytes_per_sec',)

    # One accumulator for both groups; key order: I/O first, then recovery.
    client_perf = dict.fromkeys(io_keys + recovery_keys, 0)

    for pool_stats in mgr.get('osd_pool_stats')['pool_stats']:
        client_io = pool_stats['client_io_rate']
        for key in io_keys:
            if key in client_io:
                client_perf[key] += client_io[key]

        client_recovery = pool_stats['recovery_rate']
        for key in recovery_keys:
            if key in client_recovery:
                client_perf[key] += client_recovery[key]

    return client_perf
394 | ||
@classmethod
def get_scrub_status(cls):
    """Report whether scrubbing is disabled, active or inactive.

    :return: ``SCRUB_STATUS_DISABLED`` if the 'noscrub' or 'nodeep-scrub'
        OSD flag is set; otherwise ``SCRUB_STATUS_ACTIVE`` if any PG state
        name in the PG summary contains one of the scrubbing markers;
        otherwise ``SCRUB_STATUS_INACTIVE``.
    """
    flags_set = mgr.get('osd_map')['flags_set']
    scrub_blocking_flags = (cls.OSD_FLAG_NO_SCRUB, cls.OSD_FLAG_NO_DEEP_SCRUB)
    if any(flag in flags_set for flag in scrub_blocking_flags):
        return cls.SCRUB_STATUS_DISABLED

    scrub_markers = (cls.PG_STATUS_SCRUBBING, cls.PG_STATUS_DEEP_SCRUBBING)
    pg_states = mgr.get('pg_summary')['all']
    for pg_state in pg_states:
        if any(marker in pg_state for marker in scrub_markers):
            return cls.SCRUB_STATUS_ACTIVE

    return cls.SCRUB_STATUS_INACTIVE
408 | ||
@classmethod
def get_pg_info(cls):
    """Summarize object statistics and PG distribution from the PG summary.

    :return: A dictionary with selected object counters (``object_stats``),
        the PG states of the whole cluster (``statuses``) and the average
        number of PGs per OSD (``pgs_per_osd``, ``0.0`` when the summary
        lists no OSDs).
    """
    pg_summary = mgr.get('pg_summary')

    stat_names = (
        'num_objects', 'num_object_copies', 'num_objects_degraded',
        'num_objects_misplaced', 'num_objects_unfound',
    )
    object_stats = {
        name: pg_summary['pg_stats_sum']['stat_sum'][name] for name in stat_names
    }

    by_osd = pg_summary['by_osd']
    osd_count = len(by_osd)
    if osd_count > 0:
        # Add up every per-state PG count of every OSD (float start value
        # keeps the result a float).
        pg_total = sum(
            (pg_amount
             for osd_pg_states in by_osd.values()
             for pg_amount in osd_pg_states.values()),
            0.0)
        pgs_per_osd = pg_total / osd_count
    else:
        pgs_per_osd = 0.0

    return {
        'object_stats': object_stats,
        'statuses': pg_summary['all'],
        'pgs_per_osd': pgs_per_osd,
    }