ceph/src/pybind/mgr/dashboard/controllers/osd.py (Ceph Pacific 16.2.2)

# -*- coding: utf-8 -*-
from __future__ import absolute_import

import json
import logging
import time
from typing import Any, Dict, List, Optional, Union

from ceph.deployment.drive_group import DriveGroupSpec, DriveGroupValidationError  # type: ignore
from mgr_util import get_most_recent_rate

from .. import mgr
from ..exceptions import DashboardException
from ..security import Scope
from ..services.ceph_service import CephService, SendCommandError
from ..services.exception import handle_orchestrator_error, handle_send_command_error
from ..services.orchestrator import OrchClient, OrchFeature
from ..tools import str_to_bool
from . import ApiController, ControllerDoc, CreatePermission, \
    DeletePermission, Endpoint, EndpointDoc, ReadPermission, RESTController, \
    Task, UpdatePermission, allow_empty_body
from .orchestrator import raise_if_no_orchestrator

logger = logging.getLogger('controllers.osd')

SAFE_TO_DESTROY_SCHEMA = {
    "safe_to_destroy": ([str], "Is OSD safe to destroy?"),
    "active": ([int], ""),
    "missing_stats": ([str], ""),
    "stored_pgs": ([str], "Stored placement groups in the OSD"),
    "is_safe_to_destroy": (bool, "Is OSD safe to destroy?")
}

EXPORT_FLAGS_SCHEMA = {
    "list_of_flags": ([str], "")
}

EXPORT_INDIV_FLAGS_SCHEMA = {
    "added": ([str], "List of added flags"),
    "removed": ([str], "List of removed flags"),
    "ids": ([int], "List of updated OSDs")
}

EXPORT_INDIV_FLAGS_GET_SCHEMA = {
    "osd": (int, "OSD ID"),
    "flags": ([str], "List of active flags")
}

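# Helper: wraps a handler in the dashboard's Task machinery so the operation runs
# as a tracked background task; wait_for is roughly how long the request waits for
# the task to finish before returning while the task keeps running.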
def osd_task(name, metadata, wait_for=2.0):
    return Task("osd/{}".format(name), metadata, wait_for)


@ApiController('/osd', Scope.OSD)
@ControllerDoc('OSD management API', 'OSD')
class Osd(RESTController):
    def list(self):
        osds = self.get_osd_map()

        # Extending by osd stats information
        for stat in mgr.get('osd_stats')['osd_stats']:
            if stat['osd'] in osds:
                osds[stat['osd']]['osd_stats'] = stat

        # Extending by osd node information
        nodes = mgr.get('osd_map_tree')['nodes']
        for node in nodes:
            if node['type'] == 'osd' and node['id'] in osds:
                osds[node['id']]['tree'] = node

        # Extending by osd parent node information
        for host in [n for n in nodes if n['type'] == 'host']:
            for osd_id in host['children']:
                if osd_id >= 0 and osd_id in osds:
                    osds[osd_id]['host'] = host

        removing_osd_ids = self.get_removing_osds()

        # Extending by osd histogram and orchestrator data
        for osd_id, osd in osds.items():
            osd['stats'] = {}
            osd['stats_history'] = {}
            osd_spec = str(osd_id)
            if 'osd' not in osd:
                continue  # pragma: no cover - simple early continue
            for stat in ['osd.op_w', 'osd.op_in_bytes', 'osd.op_r', 'osd.op_out_bytes']:
                prop = stat.split('.')[1]
                rates = CephService.get_rates('osd', osd_spec, stat)
                osd['stats'][prop] = get_most_recent_rate(rates)
                osd['stats_history'][prop] = rates
            # Gauge stats
            for stat in ['osd.numpg', 'osd.stat_bytes', 'osd.stat_bytes_used']:
                osd['stats'][stat.split('.')[1]] = mgr.get_latest('osd', osd_spec, stat)
            osd['operational_status'] = self._get_operational_status(osd_id, removing_osd_ids)
        return list(osds.values())

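    # Operational status of an OSD: 'unmanaged' when no orchestrator removal status
    # is available, 'deleting' while the orchestrator is removing the OSD, and
    # 'working' otherwise.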
    def _get_operational_status(self, osd_id: int, removing_osd_ids: Optional[List[int]]):
        if removing_osd_ids is None:
            return 'unmanaged'
        if osd_id in removing_osd_ids:
            return 'deleting'
        return 'working'

    @staticmethod
    def get_removing_osds() -> Optional[List[int]]:
        orch = OrchClient.instance()
        if orch.available(features=[OrchFeature.OSD_GET_REMOVE_STATUS]):
            return [osd.osd_id for osd in orch.osds.removing_status()]
        return None

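    # Returns every OSD from the OSD map keyed by its id, or a single OSD dict when
    # svc_id is given.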
    @staticmethod
    def get_osd_map(svc_id=None):
        # type: (Union[int, None]) -> Dict[int, Union[dict, Any]]
        def add_id(osd):
            osd['id'] = osd['osd']
            return osd

        resp = {
            osd['osd']: add_id(osd)
            for osd in mgr.get('osd_map')['osds'] if svc_id is None or osd['osd'] == int(svc_id)
        }
        return resp if svc_id is None else resp[int(svc_id)]

    @staticmethod
    def _get_smart_data(osd_id):
        # type: (str) -> dict
        """Returns S.M.A.R.T. data for the given OSD ID."""
        logger.debug('[SMART] retrieving data from OSD with ID %s', osd_id)
        return CephService.get_smart_data_by_daemon('osd', osd_id)

    @RESTController.Resource('GET')
    def smart(self, svc_id):
        # type: (str) -> dict
        return self._get_smart_data(svc_id)

    @handle_send_command_error('osd')
    def get(self, svc_id):
        """
        Returns collected data about an OSD.

        :return: Returns the requested data.
        """
        return {
            'osd_map': self.get_osd_map(svc_id),
            'osd_metadata': mgr.get_metadata('osd', svc_id),
            'operational_status': self._get_operational_status(int(svc_id),
                                                               self.get_removing_osds())
        }

    @RESTController.Resource('GET')
    @handle_send_command_error('osd')
    def histogram(self, svc_id):
        # type: (int) -> Dict[str, Any]
        """
        :return: Returns the histogram data.
        """
        try:
            histogram = CephService.send_command(
                'osd', srv_spec=svc_id, prefix='perf histogram dump')
        except SendCommandError as e:  # pragma: no cover - the handling is too obvious
            raise DashboardException(
                component='osd', http_status_code=400, msg=str(e))

        return histogram

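    # Update handler: changes the CRUSH device class of an OSD by removing the old
    # class and, if a new one was given, setting it.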
    def set(self, svc_id, device_class):  # pragma: no cover
        old_device_class = CephService.send_command('mon', 'osd crush get-device-class',
                                                    ids=[svc_id])
        old_device_class = old_device_class[0]['device_class']
        if old_device_class != device_class:
            CephService.send_command('mon', 'osd crush rm-device-class',
                                     ids=[svc_id])
            if device_class:
                CephService.send_command('mon', 'osd crush set-device-class', **{
                    'class': device_class,
                    'ids': [svc_id]
                })

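    # Note: the given OSD ids are currently ignored (`_ = osd_ids`); the safety
    # check only inspects the cluster-wide health checks listed below.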
    def _check_delete(self, osd_ids):
        # type: (List[str]) -> Dict[str, Any]
        """
        Check if it's safe to remove OSD(s).

        :param osd_ids: list of OSD IDs
        :return: a dictionary containing the following attributes:
            `safe`: bool, indicates whether it's safe to remove the OSDs.
            `message`: str, help message if it's not safe to remove the OSDs.
        """
        _ = osd_ids
        health_data = mgr.get('health')  # type: ignore
        health = json.loads(health_data['json'])
        checks = health['checks'].keys()
        unsafe_checks = set(['OSD_FULL', 'OSD_BACKFILLFULL', 'OSD_NEARFULL'])
        failed_checks = checks & unsafe_checks
        msg = 'Removing OSD(s) is not recommended because of these failed health check(s): {}.'.\
            format(', '.join(failed_checks)) if failed_checks else ''
        return {
            'safe': not bool(failed_checks),
            'message': msg
        }

    @DeletePermission
    @raise_if_no_orchestrator([OrchFeature.OSD_DELETE, OrchFeature.OSD_GET_REMOVE_STATUS])
    @handle_orchestrator_error('osd')
    @osd_task('delete', {'svc_id': '{svc_id}'})
    def delete(self, svc_id, preserve_id=None, force=None):  # pragma: no cover
        replace = False
        check: Union[Dict[str, Any], bool] = False
        try:
            if preserve_id is not None:
                replace = str_to_bool(preserve_id)
            if force is not None:
                check = not str_to_bool(force)
        except ValueError:
            raise DashboardException(
                component='osd', http_status_code=400, msg='Invalid parameter(s)')
        orch = OrchClient.instance()
        if check:
            logger.info('Check for removing osd.%s...', svc_id)
            check = self._check_delete([svc_id])
            if not check['safe']:
                logger.error('Unable to remove osd.%s: %s', svc_id, check['message'])
                raise DashboardException(component='osd', msg=check['message'])

        logger.info('Start removing osd.%s (replace: %s)...', svc_id, replace)
        orch.osds.remove([svc_id], replace)
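        # Poll the orchestrator until it no longer reports this OSD as being
        # removed; since the handler runs as an osd_task, this keeps the background
        # task alive until the removal has actually finished.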
        while True:
            removal_osds = orch.osds.removing_status()
            logger.info('Current removing OSDs %s', removal_osds)
            pending = [osd for osd in removal_osds if osd.osd_id == int(svc_id)]
            if not pending:
                break
            logger.info('Wait until osd.%s is removed...', svc_id)
            time.sleep(60)

    @RESTController.Resource('POST', query_params=['deep'])
    @UpdatePermission
    @allow_empty_body
    def scrub(self, svc_id, deep=False):
        api_scrub = "osd deep-scrub" if str_to_bool(deep) else "osd scrub"
        CephService.send_command("mon", api_scrub, who=svc_id)

    @RESTController.Resource('PUT')
    @EndpointDoc("Mark OSD flags (out, in, down, lost, ...)",
                 parameters={'svc_id': (str, 'SVC ID')})
    def mark(self, svc_id, action):
        """
        Note: osd must be marked `down` before marking it `lost`.
        """
        valid_actions = ['out', 'in', 'down', 'lost']
        args = {'srv_type': 'mon', 'prefix': 'osd ' + action}
        if action.lower() in valid_actions:
            if action == 'lost':
                args['id'] = int(svc_id)
                args['yes_i_really_mean_it'] = True
            else:
                args['ids'] = [svc_id]

            CephService.send_command(**args)
        else:
            logger.error("Invalid OSD mark action: %s attempted on SVC_ID: %s", action, svc_id)

    @RESTController.Resource('POST')
    @allow_empty_body
    def reweight(self, svc_id, weight):
        """
        Reweights the OSD temporarily.

        Note that ‘ceph osd reweight’ is not a persistent setting. When an OSD
        gets marked out, the osd weight will be set to 0. When it gets marked
        in again, the weight will be changed to 1.

        Because of this ‘ceph osd reweight’ is a temporary solution. You should
        only use it to keep your cluster running while you’re ordering more
        hardware.

        - Craig Lewis (http://lists.ceph.com/pipermail/ceph-users-ceph.com/2014-June/040967.html)
        """
        CephService.send_command(
            'mon',
            'osd reweight',
            id=int(svc_id),
            weight=float(weight))

    def _create_bare(self, data):
        """Create an OSD container that has no associated device.

        :param data: contains the attributes needed to create a bare OSD.
        : `uuid`: will be set automatically if the OSD starts up
        : `svc_id`: the ID is only used if a valid uuid is given.
        """
        try:
            uuid = data['uuid']
            svc_id = int(data['svc_id'])
        except (KeyError, ValueError) as e:
            raise DashboardException(e, component='osd', http_status_code=400)

        result = CephService.send_command(
            'mon', 'osd create', id=svc_id, uuid=uuid)
        return {
            'result': result,
            'svc_id': svc_id,
            'uuid': uuid,
        }

    @raise_if_no_orchestrator([OrchFeature.OSD_CREATE])
    @handle_orchestrator_error('osd')
    def _create_with_drive_groups(self, drive_groups):
        """Create OSDs with DriveGroups."""
        orch = OrchClient.instance()
        try:
            dg_specs = [DriveGroupSpec.from_json(dg) for dg in drive_groups]
            orch.osds.create(dg_specs)
        except (ValueError, TypeError, DriveGroupValidationError) as e:
            raise DashboardException(e, component='osd')

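    # The create endpoint dispatches on 'method': 'bare' reserves an OSD id via
    # 'osd create' without touching any device, while 'drive_groups' hands
    # DriveGroup specs to the orchestrator for deployment.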
    @CreatePermission
    @osd_task('create', {'tracking_id': '{tracking_id}'})
    def create(self, method, data, tracking_id):  # pylint: disable=unused-argument
        if method == 'bare':
            return self._create_bare(data)
        if method == 'drive_groups':
            return self._create_with_drive_groups(data)
        raise DashboardException(
            component='osd', http_status_code=400, msg='Unknown method: {}'.format(method))

    @RESTController.Resource('POST')
    @allow_empty_body
    def purge(self, svc_id):
        """
        Note: osd must be marked `down` before removal.
        """
        CephService.send_command('mon', 'osd purge-actual', id=int(svc_id),
                                 yes_i_really_mean_it=True)

    @RESTController.Resource('POST')
    @allow_empty_body
    def destroy(self, svc_id):
        """
        Mark osd as being destroyed. Keeps the ID intact (allowing reuse), but
        removes cephx keys, config-key data and lockbox keys, rendering data
        permanently unreadable.

        The osd must be marked down before being destroyed.
        """
        CephService.send_command(
            'mon', 'osd destroy-actual', id=int(svc_id), yes_i_really_mean_it=True)

    @Endpoint('GET', query_params=['ids'])
    @ReadPermission
    @EndpointDoc("Check If OSD is Safe to Destroy",
                 parameters={
                     'ids': (str, 'OSD Service Identifier'),
                 },
                 responses={200: SAFE_TO_DESTROY_SCHEMA})
    def safe_to_destroy(self, ids):
        """
        :type ids: int|[int]
        """

        ids = json.loads(ids)
        if isinstance(ids, list):
            ids = list(map(str, ids))
        else:
            ids = [str(ids)]

        try:
            result = CephService.send_command(
                'mon', 'osd safe-to-destroy', ids=ids, target=('mgr', ''))
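            # 'safe_to_destroy' in the reply lists the OSD ids the cluster considers
            # safe; only if it covers every requested id is the overall answer safe.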
            result['is_safe_to_destroy'] = set(result['safe_to_destroy']) == set(map(int, ids))
            return result

        except SendCommandError as e:
            return {
                'message': str(e),
                'is_safe_to_destroy': False,
            }

    @Endpoint('GET', query_params=['svc_ids'])
    @ReadPermission
    @raise_if_no_orchestrator()
    @handle_orchestrator_error('osd')
    def safe_to_delete(self, svc_ids):
        """
        :type svc_ids: int|[int]
        """
        check = self._check_delete(svc_ids)
        return {
            'is_safe_to_delete': check.get('safe', False),
            'message': check.get('message', '')
        }

    @RESTController.Resource('GET')
    def devices(self, svc_id):
        # type: (str) -> dict
        return CephService.send_command('mon', 'device ls-by-daemon', who='osd.{}'.format(svc_id))


@ApiController('/osd/flags', Scope.OSD)
@ControllerDoc(group='OSD')
class OsdFlagsController(RESTController):
    @staticmethod
    def _osd_flags():
        enabled_flags = mgr.get('osd_map')['flags_set']
        if 'pauserd' in enabled_flags and 'pausewr' in enabled_flags:
            # 'pause' is set by calling `ceph osd set pause` and unset by
            # calling `ceph osd unset pause`, but `ceph osd dump | jq '.flags'`
            # will contain 'pauserd,pausewr' if pause is set.
            # Let's pretend to the API that 'pause' is in fact a proper flag.
            enabled_flags = list(
                set(enabled_flags) - {'pauserd', 'pausewr'} | {'pause'})
        return sorted(enabled_flags)

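    # 'set-group'/'unset-group' apply a comma-separated list of flags to specific
    # OSD ids in one call; the cluster-wide 'set'/'unset' commands take a single
    # flag, so those are sent one command per flag.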
    @staticmethod
    def _update_flags(action, flags, ids=None):
        if ids:
            if flags:
                ids = list(map(str, ids))
                CephService.send_command('mon', 'osd ' + action, who=ids,
                                         flags=','.join(flags))
        else:
            for flag in flags:
                CephService.send_command('mon', 'osd ' + action, '', key=flag)

    @EndpointDoc("Display OSD Flags",
                 responses={200: EXPORT_FLAGS_SCHEMA})
    def list(self):
        return self._osd_flags()

    @EndpointDoc('Sets OSD flags for the entire cluster.',
                 parameters={
                     'flags': ([str], 'List of flags to set. The flags `recovery_deletes`, '
                               '`sortbitwise` and `pglog_hardlimit` cannot be unset. '
                               'Additionally `purged_snapshots` cannot even be set.')
                 },
                 responses={200: EXPORT_FLAGS_SCHEMA})
    def bulk_set(self, flags):
        """
        The `recovery_deletes`, `sortbitwise` and `pglog_hardlimit` flags cannot be unset.
        `purged_snapshots` cannot even be set. It is therefore required to at
        least include those four flags for a successful operation.
        """
        assert isinstance(flags, list)

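        # The submitted list is treated as the complete desired flag set: flags not
        # yet enabled are set, and enabled flags missing from the list are unset.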
        enabled_flags = set(self._osd_flags())
        data = set(flags)
        added = data - enabled_flags
        removed = enabled_flags - data

        self._update_flags('set', added)
        self._update_flags('unset', removed)

        logger.info('Changed OSD flags: added=%s removed=%s', added, removed)

        return sorted(enabled_flags - removed | added)

    @Endpoint('PUT', 'individual')
    @UpdatePermission
    @EndpointDoc('Sets OSD flags for a subset of individual OSDs.',
                 parameters={
                     'flags': ({'noout': (bool, 'Sets/unsets `noout`', True, None),
                                'noin': (bool, 'Sets/unsets `noin`', True, None),
                                'noup': (bool, 'Sets/unsets `noup`', True, None),
                                'nodown': (bool, 'Sets/unsets `nodown`', True, None)},
                               'Dictionary of flags to set or unset. Only the flags '
                               '`noin`, `noout`, `noup` and `nodown` are taken '
                               'into account.'),
                     'ids': ([int], 'List of OSD ids the flags should be applied '
                                    'to.')
                 },
                 responses={200: EXPORT_INDIV_FLAGS_SCHEMA})
    def set_individual(self, flags, ids):
        """
        Updates flags (`noout`, `noin`, `nodown`, `noup`) for an individual
        subset of OSDs.
        """
        assert isinstance(flags, dict)
        assert isinstance(ids, list)
        assert all(isinstance(id, int) for id in ids)

        # These are the only flags that can be applied to an OSD individually.
        all_flags = {'noin', 'noout', 'nodown', 'noup'}
        added = set()
        removed = set()
        for flag, activated in flags.items():
            if flag in all_flags:
                if activated is not None:
                    if activated:
                        added.add(flag)
                    else:
                        removed.add(flag)

        self._update_flags('set-group', added, ids)
        self._update_flags('unset-group', removed, ids)

        logger.error('Changed individual OSD flags: added=%s removed=%s for ids=%s',
                     added, removed, ids)

        return {'added': sorted(added),
                'removed': sorted(removed),
                'ids': ids}

    @Endpoint('GET', 'individual')
    @ReadPermission
    @EndpointDoc('Displays individual OSD flags',
                 responses={200: EXPORT_INDIV_FLAGS_GET_SCHEMA})
    def get_individual(self):
        osd_map = mgr.get('osd_map')['osds']
        resp = []

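        # Each OSD's 'state' list from the OSD map is reported as its flags; note
        # that it may also carry status entries such as 'exists' and 'up'.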
        for osd in osd_map:
            resp.append({
                'osd': osd['osd'],
                'flags': osd['state']
            })
        return resp