[ceph.git] / ceph / src / pybind / mgr / devicehealth / module.py
1 """
2 Device health monitoring
3 """
4
5 import errno
6 import json
7 from mgr_module import MgrModule, CommandResult, CLIRequiresDB, CLICommand, CLIReadCommand, Option
8 import operator
9 import rados
10 import re
11 from threading import Event
12 from datetime import datetime, timedelta, timezone
13 from typing import cast, Any, Dict, List, Optional, Sequence, Tuple, TYPE_CHECKING, Union
14
15 TIME_FORMAT = '%Y%m%d-%H%M%S'
16
17 DEVICE_HEALTH = 'DEVICE_HEALTH'
18 DEVICE_HEALTH_IN_USE = 'DEVICE_HEALTH_IN_USE'
19 DEVICE_HEALTH_TOOMANY = 'DEVICE_HEALTH_TOOMANY'
20 HEALTH_MESSAGES = {
21 DEVICE_HEALTH: '%d device(s) expected to fail soon',
22 DEVICE_HEALTH_IN_USE: '%d daemon(s) expected to fail soon and still contain data',
23 DEVICE_HEALTH_TOOMANY: 'Too many daemons are expected to fail soon',
24 }
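# Illustrative note: these keys become cluster health checks raised via
# set_health_checks() in check_health() below, with '%d' filled in with the
# number of affected devices/daemons.  A rendered warning might look like
# (example only, not produced verbatim by this module):
#   HEALTH_WARN 1 device(s) expected to fail soon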
25
26
27 def get_ata_wear_level(data: Dict[Any, Any]) -> Optional[float]:
28 """
29 Extract wear level (as float) from smartctl -x --json output for SATA SSD
30 """
31 for page in data.get("ata_device_statistics", {}).get("pages", []):
32 if page is None or page.get("number") != 7:
33 continue
34 for item in page.get("table", []):
35 if item["offset"] == 8:
36 return item["value"] / 100.0
37 return None
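# Sketch of the smartctl JSON shape this parser expects (an assumption based
# on the lookups above, not an exhaustive schema): ATA device statistics page 7,
# offset 8 carries the percentage-used endurance indicator, e.g.
#   {"ata_device_statistics": {"pages": [{"number": 7,
#       "table": [{"offset": 8, "value": 3}]}]}}
# would yield a wear level of 0.03.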
38
39
40 def get_nvme_wear_level(data: Dict[Any, Any]) -> Optional[float]:
41 """
42 Extract wear level (as float) from smartctl -x --json output for NVME SSD
43 """
44 pct_used = data.get("nvme_smart_health_information_log", {}).get("percentage_used")
45 if pct_used is None:
46 return None
47 return pct_used / 100.0
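# Sketch of the corresponding NVMe shape (assumed from the lookup above):
#   {"nvme_smart_health_information_log": {"percentage_used": 7}}
# would yield a wear level of 0.07.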
48
49
50 class Module(MgrModule):
51
52 # latest (if db does not exist)
53 SCHEMA = """
54 CREATE TABLE Device (
55 devid TEXT PRIMARY KEY
56 ) WITHOUT ROWID;
57 CREATE TABLE DeviceHealthMetrics (
58 time DATETIME DEFAULT (strftime('%s', 'now')),
59 devid TEXT NOT NULL REFERENCES Device (devid),
60 raw_smart TEXT NOT NULL,
61 PRIMARY KEY (time, devid)
62 );
63 """
64
65 SCHEMA_VERSIONED = [
66 # v1
67 """
68 CREATE TABLE Device (
69 devid TEXT PRIMARY KEY
70 ) WITHOUT ROWID;
71 CREATE TABLE DeviceHealthMetrics (
72 time DATETIME DEFAULT (strftime('%s', 'now')),
73 devid TEXT NOT NULL REFERENCES Device (devid),
74 raw_smart TEXT NOT NULL,
75 PRIMARY KEY (time, devid)
76 );
77 """
78 ]
79
80 MODULE_OPTIONS = [
81 Option(
82 name='enable_monitoring',
83 default=True,
84 type='bool',
85 desc='monitor device health metrics',
86 runtime=True,
87 ),
88 Option(
89 name='scrape_frequency',
90 default=86400,
91 type='secs',
92 desc='how frequently to scrape device health metrics',
93 runtime=True,
94 ),
95 Option(
96 name='pool_name',
97 default='device_health_metrics',
98 type='str',
99 desc='name of pool in which to store device health metrics',
100 runtime=True,
101 ),
102 Option(
103 name='retention_period',
104 default=(86400 * 180),
105 type='secs',
106 desc='how long to retain device health metrics',
107 runtime=True,
108 ),
109 Option(
110 name='mark_out_threshold',
111 default=(86400 * 14 * 2),
112 type='secs',
113 desc='automatically mark OSD out if it may fail before this long',
114 runtime=True,
115 ),
116 Option(
117 name='warn_threshold',
118 default=(86400 * 14 * 6),
119 type='secs',
120 desc='raise health warning if OSD may fail before this long',
121 runtime=True,
122 ),
123 Option(
124 name='self_heal',
125 default=True,
126 type='bool',
127 desc='preemptively heal cluster around devices that may fail',
128 runtime=True,
129 ),
130 Option(
131 name='sleep_interval',
132 default=600,
133 type='secs',
134 desc='how frequently to wake up and check device health',
135 runtime=True,
136 ),
137 ]
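# Illustrative note: each option above can be adjusted at runtime with the
# usual mgr option syntax, e.g. (values assumed for illustration):
#   ceph config set mgr mgr/devicehealth/mark_out_threshold 2419200
#   ceph config set mgr mgr/devicehealth/self_heal false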
138
139 def __init__(self, *args: Any, **kwargs: Any) -> None:
140 super(Module, self).__init__(*args, **kwargs)
141
142 # populate options (just until serve() runs)
143 for opt in self.MODULE_OPTIONS:
144 setattr(self, opt['name'], opt['default'])
145
146 # other
147 self.run = True
148 self.event = Event()
149
150 # for mypy which does not run the code
151 if TYPE_CHECKING:
152 self.enable_monitoring = True
153 self.scrape_frequency = 0.0
154 self.pool_name = ''
155 self.device_health_metrics = ''
156 self.retention_period = 0.0
157 self.mark_out_threshold = 0.0
158 self.warn_threshold = 0.0
159 self.self_heal = True
160 self.sleep_interval = 0.0
161
162 def is_valid_daemon_name(self, who: str) -> bool:
163 parts = who.split('.', 1)
164 if len(parts) != 2:
165 return False
166 return parts[0] in ('osd', 'mon')
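# Examples (derived from the check above): 'osd.0' and 'mon.a' are accepted,
# while 'mds.a' or a bare 'osd' are rejected.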
167
168 @CLIReadCommand('device query-daemon-health-metrics')
169 def do_query_daemon_health_metrics(self, who: str) -> Tuple[int, str, str]:
170 '''
171 Get device health metrics for a given daemon
172 '''
173 if not self.is_valid_daemon_name(who):
174 return -errno.EINVAL, '', 'not a valid mon or osd daemon name'
175 (daemon_type, daemon_id) = who.split('.')
176 result = CommandResult('')
177 self.send_command(result, daemon_type, daemon_id, json.dumps({
178 'prefix': 'smart',
179 'format': 'json',
180 }), '')
181 return result.wait()
182
183 @CLIRequiresDB
184 @CLIReadCommand('device scrape-daemon-health-metrics')
185 def do_scrape_daemon_health_metrics(self, who: str) -> Tuple[int, str, str]:
186 '''
187 Scrape and store device health metrics for a given daemon
188 '''
189 if not self.is_valid_daemon_name(who):
190 return -errno.EINVAL, '', 'not a valid mon or osd daemon name'
191 (daemon_type, daemon_id) = who.split('.')
192 return self.scrape_daemon(daemon_type, daemon_id)
193
194 @CLIRequiresDB
195 @CLIReadCommand('device scrape-health-metrics')
196 def do_scrape_health_metrics(self, devid: Optional[str] = None) -> Tuple[int, str, str]:
197 '''
198 Scrape and store device health metrics
199 '''
200 if devid is None:
201 return self.scrape_all()
202 else:
203 return self.scrape_device(devid)
204
205 @CLIRequiresDB
206 @CLIReadCommand('device get-health-metrics')
207 def do_get_health_metrics(self, devid: str, sample: Optional[str] = None) -> Tuple[int, str, str]:
208 '''
209 Show stored device metrics for the device
210 '''
211 return self.show_device_metrics(devid, sample)
212
213 @CLIRequiresDB
214 @CLICommand('device check-health')
215 def do_check_health(self) -> Tuple[int, str, str]:
216 '''
217 Check life expectancy of devices
218 '''
219 return self.check_health()
220
221 @CLICommand('device monitoring on')
222 def do_monitoring_on(self) -> Tuple[int, str, str]:
223 '''
224 Enable device health monitoring
225 '''
226 self.set_module_option('enable_monitoring', True)
227 self.event.set()
228 return 0, '', ''
229
230 @CLICommand('device monitoring off')
231 def do_monitoring_off(self) -> Tuple[int, str, str]:
232 '''
233 Disable device health monitoring
234 '''
235 self.set_module_option('enable_monitoring', False)
236 self.set_health_checks({}) # avoid stuck health alerts
237 return 0, '', ''
238
239 @CLIRequiresDB
240 @CLIReadCommand('device predict-life-expectancy')
241 def do_predict_life_expectancy(self, devid: str) -> Tuple[int, str, str]:
242 '''
243 Predict life expectancy with local predictor
244 '''
245 return self.predict_life_expectancy(devid)
246
247 def self_test(self) -> None:
248 assert self.db_ready()
249 self.config_notify()
250 osdmap = self.get('osd_map')
251 osd_id = osdmap['osds'][0]['osd']
252 osdmeta = self.get('osd_metadata')
253 devs = osdmeta.get(str(osd_id), {}).get('device_ids')
254 if devs:
255 devid = devs.split()[0].split('=')[1]
256 self.log.debug(f"getting devid {devid}")
257 (r, before, err) = self.show_device_metrics(devid, None)
258 assert r == 0
259 self.log.debug(f"before: {before}")
260 (r, out, err) = self.scrape_device(devid)
261 assert r == 0
262 (r, after, err) = self.show_device_metrics(devid, None)
263 assert r == 0
264 self.log.debug(f"after: {after}")
265 assert before != after
266
267 def config_notify(self) -> None:
268 for opt in self.MODULE_OPTIONS:
269 setattr(self,
270 opt['name'],
271 self.get_module_option(opt['name']))
272 self.log.debug(' %s = %s', opt['name'], getattr(self, opt['name']))
273
274 def _legacy_put_device_metrics(self, t: str, devid: str, data: str) -> None:
275 SQL = """
276 INSERT OR IGNORE INTO DeviceHealthMetrics (time, devid, raw_smart)
277 VALUES (?, ?, ?);
278 """
279
280 self._create_device(devid)
281 epoch = self._t2epoch(t)
282 json.loads(data) # valid?
283 self.db.execute(SQL, (epoch, devid, data))
284
285 devre = r"[a-zA-Z0-9-]+[_-][a-zA-Z0-9-]+[_-][a-zA-Z0-9-]+"
286
287 def _load_legacy_object(self, ioctx: rados.Ioctx, oid: str) -> bool:
288 MAX_OMAP = 10000
289 self.log.debug(f"loading object {oid}")
290 if re.search(self.devre, oid) is None:
291 return False
292 with rados.ReadOpCtx() as op:
293 it, rc = ioctx.get_omap_vals(op, None, None, MAX_OMAP)
294 if rc == 0:
295 ioctx.operate_read_op(op, oid)
296 count = 0
297 for t, raw_smart in it:
298 self.log.debug(f"putting {oid} {t}")
299 self._legacy_put_device_metrics(t, oid, raw_smart)
300 count += 1
301 assert count < MAX_OMAP
302 self.log.debug(f"removing object {oid}")
303 ioctx.remove_object(oid)
304 return True
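# Illustrative note on the legacy layout migrated here (assumed from the code
# above): each legacy object is named after a devid matching 'devre' (roughly
# VENDOR_MODEL_SERIAL), and its omap maps a TIME_FORMAT timestamp such as
# '20220101-000000' to the raw smartctl JSON for that scrape.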
305
306 def check_legacy_pool(self) -> bool:
307 try:
308 # 'device_health_metrics' is automatically renamed '.mgr' in
309 # create_mgr_pool
310 ioctx = self.rados.open_ioctx(self.MGR_POOL_NAME)
311 except rados.ObjectNotFound:
312 return True
313 if not ioctx:
314 return True
315
316 done = False
317 with ioctx, self._db_lock, self.db:
318 count = 0
319 for obj in ioctx.list_objects():
320 try:
321 if self._load_legacy_object(ioctx, obj.key):
322 count += 1
323 except json.decoder.JSONDecodeError:
324 pass
325 if count >= 10:
326 break
327 done = count < 10
328 self.log.debug(f"finished reading legacy pool, complete = {done}")
329 return done
330
331 def serve(self) -> None:
332 self.log.info("Starting")
333 self.config_notify()
334
335 last_scrape = None
336 finished_loading_legacy = False
337 while self.run:
338 if self.db_ready() and self.enable_monitoring:
339 self.log.debug('Running')
340
341 if not finished_loading_legacy:
342 finished_loading_legacy = self.check_legacy_pool()
343
344 if last_scrape is None:
345 ls = self.get_kv('last_scrape')
346 if ls:
347 try:
348 last_scrape = datetime.strptime(ls, TIME_FORMAT)
349 except ValueError:
350 pass
351 self.log.debug('Last scrape %s', last_scrape)
352
353 self.check_health()
354
355 now = datetime.utcnow()
356 if not last_scrape:
357 next_scrape = now
358 else:
359 # align to scrape interval
360 scrape_frequency = self.scrape_frequency or 86400
361 seconds = (last_scrape - datetime.utcfromtimestamp(0)).total_seconds()
362 seconds -= seconds % scrape_frequency
363 seconds += scrape_frequency
364 next_scrape = datetime.utcfromtimestamp(seconds)
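# Worked example of the alignment above (illustrative): with
# scrape_frequency=86400 and last_scrape=2024-01-01 03:17:00 UTC,
# 'seconds' is floored to 2024-01-01 00:00:00 and then advanced one
# interval, so next_scrape becomes 2024-01-02 00:00:00 UTC.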
365 if last_scrape:
366 self.log.debug('Last scrape %s, next scrape due %s',
367 last_scrape.strftime(TIME_FORMAT),
368 next_scrape.strftime(TIME_FORMAT))
369 else:
370 self.log.debug('Last scrape never, next scrape due %s',
371 next_scrape.strftime(TIME_FORMAT))
372 if now >= next_scrape:
373 self.scrape_all()
374 self.predict_all_devices()
375 last_scrape = now
376 self.set_kv('last_scrape', last_scrape.strftime(TIME_FORMAT))
377
378 # sleep
379 sleep_interval = self.sleep_interval or 60
380 if not finished_loading_legacy:
381 sleep_interval = 2
382 self.log.debug('Sleeping for %d seconds', sleep_interval)
383 self.event.wait(sleep_interval)
384 self.event.clear()
385
386 def shutdown(self) -> None:
387 self.log.info('Stopping')
388 self.run = False
389 self.event.set()
390
391 def scrape_daemon(self, daemon_type: str, daemon_id: str) -> Tuple[int, str, str]:
392 if not self.db_ready():
393 return -errno.EAGAIN, "", "mgr db not yet available"
394 raw_smart_data = self.do_scrape_daemon(daemon_type, daemon_id)
395 if raw_smart_data:
396 for device, raw_data in raw_smart_data.items():
397 data = self.extract_smart_features(raw_data)
398 if device and data:
399 self.put_device_metrics(device, data)
400 return 0, "", ""
401
402 def scrape_all(self) -> Tuple[int, str, str]:
403 if not self.db_ready():
404 return -errno.EAGAIN, "", "mgr db not yet available"
405 osdmap = self.get("osd_map")
406 assert osdmap is not None
407 did_device = {}
408 ids = []
409 for osd in osdmap['osds']:
410 ids.append(('osd', str(osd['osd'])))
411 monmap = self.get("mon_map")
412 for mon in monmap['mons']:
413 ids.append(('mon', mon['name']))
414 for daemon_type, daemon_id in ids:
415 raw_smart_data = self.do_scrape_daemon(daemon_type, daemon_id)
416 if not raw_smart_data:
417 continue
418 for device, raw_data in raw_smart_data.items():
419 if device in did_device:
420 self.log.debug('skipping duplicate %s' % device)
421 continue
422 did_device[device] = 1
423 data = self.extract_smart_features(raw_data)
424 if device and data:
425 self.put_device_metrics(device, data)
426 return 0, "", ""
427
428 def scrape_device(self, devid: str) -> Tuple[int, str, str]:
429 if not self.db_ready():
430 return -errno.EAGAIN, "", "mgr db not yet available"
431 r = self.get("device " + devid)
432 if not r or 'device' not in r.keys():
433 return -errno.ENOENT, '', 'device ' + devid + ' not found'
434 daemons = r['device'].get('daemons', [])
435 if not daemons:
436 return (-errno.EAGAIN, '',
437 'device ' + devid + ' not claimed by any active daemons')
438 (daemon_type, daemon_id) = daemons[0].split('.')
439 raw_smart_data = self.do_scrape_daemon(daemon_type, daemon_id,
440 devid=devid)
441 if raw_smart_data:
442 for device, raw_data in raw_smart_data.items():
443 data = self.extract_smart_features(raw_data)
444 if device and data:
445 self.put_device_metrics(device, data)
446 return 0, "", ""
447
448 def do_scrape_daemon(self,
449 daemon_type: str,
450 daemon_id: str,
451 devid: str = '') -> Optional[Dict[str, Any]]:
452 """
453 :return: a dict, or None if the scrape failed.
454 """
455 self.log.debug('do_scrape_daemon %s.%s' % (daemon_type, daemon_id))
456 result = CommandResult('')
457 self.send_command(result, daemon_type, daemon_id, json.dumps({
458 'prefix': 'smart',
459 'format': 'json',
460 'devid': devid,
461 }), '')
462 r, outb, outs = result.wait()
463
464 try:
465 return json.loads(outb)
466 except (IndexError, ValueError):
467 self.log.error(
468 "Fail to parse JSON result from daemon {0}.{1} ({2})".format(
469 daemon_type, daemon_id, outb))
470 return None
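# Illustrative note: on success the 'smart' command returns a JSON object
# keyed by device id, with the raw smartctl output as values -- roughly
# {"<devid>": {...smartctl json...}, ...} -- which is why the callers above
# iterate raw_smart_data.items().  (Shape inferred from usage, not a spec.)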
471
472 def _prune_device_metrics(self) -> None:
473 SQL = """
474 DELETE FROM DeviceHealthMetrics
475 WHERE time < (strftime('%s', 'now') - ?);
476 """
477
478 cursor = self.db.execute(SQL, (self.retention_period,))
479 if cursor.rowcount >= 1:
480 self.log.info(f"pruned {cursor.rowcount} metrics")
481
482 def _create_device(self, devid: str) -> None:
483 SQL = """
484 INSERT OR IGNORE INTO Device VALUES (?);
485 """
486
487 cursor = self.db.execute(SQL, (devid,))
488 if cursor.rowcount >= 1:
489 self.log.info(f"created device {devid}")
490 else:
491 self.log.debug(f"device {devid} already exists")
492
493 def put_device_metrics(self, devid: str, data: Any) -> None:
494 SQL = """
495 INSERT INTO DeviceHealthMetrics (devid, raw_smart)
496 VALUES (?, ?);
497 """
498
499 with self._db_lock, self.db:
500 self._create_device(devid)
501 self.db.execute(SQL, (devid, json.dumps(data)))
502 self._prune_device_metrics()
503
504 # extract wear level?
505 wear_level = get_ata_wear_level(data)
506 if wear_level is None:
507 wear_level = get_nvme_wear_level(data)
508 dev_data = self.get(f"device {devid}") or {}
509 if wear_level is not None:
510 if dev_data.get("wear_level") != str(wear_level):
511 dev_data["wear_level"] = str(wear_level)
512 self.log.debug(f"updating {devid} wear level to {wear_level}")
513 self.set_device_wear_level(devid, wear_level)
514 else:
515 if "wear_level" in dev_data:
516 del dev_data["wear_level"]
517 self.log.debug(f"removing {devid} wear level")
518 self.set_device_wear_level(devid, -1.0)
519
520 def _t2epoch(self, t: Optional[str]) -> int:
521 if not t:
522 return 0
523 else:
524 return int(datetime.strptime(t, TIME_FORMAT).strftime("%s"))
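# Example (illustrative): _t2epoch('20220101-000000') converts a TIME_FORMAT
# timestamp to epoch seconds.  Note that strftime('%s') is a platform-dependent
# (glibc) extension and typically interprets the naive datetime as local time.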
525
526 def _get_device_metrics(self, devid: str,
527 sample: Optional[str] = None,
528 min_sample: Optional[str] = None) -> Dict[str, Dict[str, Any]]:
529 res = {}
530
531 SQL_EXACT = """
532 SELECT time, raw_smart
533 FROM DeviceHealthMetrics
534 WHERE devid = ? AND time = ?
535 ORDER BY time DESC;
536 """
537 SQL_MIN = """
538 SELECT time, raw_smart
539 FROM DeviceHealthMetrics
540 WHERE devid = ? AND ? <= time
541 ORDER BY time DESC;
542 """
543
544 isample = None
545 imin_sample = None
546 if sample:
547 isample = self._t2epoch(sample)
548 else:
549 imin_sample = self._t2epoch(min_sample)
550
551 self.log.debug(f"_get_device_metrics: {devid} {sample} {min_sample}")
552
553 with self._db_lock, self.db:
554 if isample:
555 cursor = self.db.execute(SQL_EXACT, (devid, isample))
556 else:
557 cursor = self.db.execute(SQL_MIN, (devid, imin_sample))
558 for row in cursor:
559 t = row['time']
560 dt = datetime.utcfromtimestamp(t).strftime(TIME_FORMAT)
561 try:
562 res[dt] = json.loads(row['raw_smart'])
563 except (ValueError, IndexError):
564 self.log.debug(f"unable to parse value for {devid}:{t}")
565 pass
566 return res
567
568 def show_device_metrics(self, devid: str, sample: Optional[str]) -> Tuple[int, str, str]:
569 # verify device exists
570 r = self.get("device " + devid)
571 if not r or 'device' not in r.keys():
572 return -errno.ENOENT, '', 'device ' + devid + ' not found'
573 # fetch metrics
574 res = self._get_device_metrics(devid, sample=sample)
575 return 0, json.dumps(res, indent=4, sort_keys=True), ''
576
577 def check_health(self) -> Tuple[int, str, str]:
578 self.log.info('Check health')
579 config = self.get('config')
580 min_in_ratio = float(config.get('mon_osd_min_in_ratio'))
581 mark_out_threshold_td = timedelta(seconds=self.mark_out_threshold)
582 warn_threshold_td = timedelta(seconds=self.warn_threshold)
583 checks: Dict[str, Dict[str, Union[int, str, Sequence[str]]]] = {}
584 health_warnings: Dict[str, List[str]] = {
585 DEVICE_HEALTH: [],
586 DEVICE_HEALTH_IN_USE: [],
587 }
588 devs = self.get("devices")
589 osds_in = {}
590 osds_out = {}
591 now = datetime.now(timezone.utc) # e.g. '2021-09-22 13:18:45.021712+00:00'
592 osdmap = self.get("osd_map")
593 assert osdmap is not None
594 for dev in devs['devices']:
595 if 'life_expectancy_max' not in dev:
596 continue
597 # ignore devices that are not consumed by any daemons
598 if not dev['daemons']:
599 continue
600 if not dev['life_expectancy_max'] or \
601 dev['life_expectancy_max'] == '0.000000':
602 continue
603 # life_expectancy_(min/max) is in the format of:
604 # '%Y-%m-%dT%H:%M:%S.%f%z', e.g.:
605 # '2019-01-20 21:12:12.000000+00:00'
606 life_expectancy_max = datetime.strptime(
607 dev['life_expectancy_max'],
608 '%Y-%m-%dT%H:%M:%S.%f%z')
609 self.log.debug('device %s expectancy max %s', dev,
610 life_expectancy_max)
611
612 if life_expectancy_max - now <= mark_out_threshold_td:
613 if self.self_heal:
614 # dev['daemons'] == ["osd.0","osd.1","osd.2"]
615 if dev['daemons']:
616 osds = [x for x in dev['daemons']
617 if x.startswith('osd.')]
618 osd_ids = map(lambda x: x[4:], osds)
619 for _id in osd_ids:
620 if self.is_osd_in(osdmap, _id):
621 osds_in[_id] = life_expectancy_max
622 else:
623 osds_out[_id] = 1
624
625 if life_expectancy_max - now <= warn_threshold_td:
626 # device can appear in more than one location in case
627 # of SCSI multipath
628 device_locations = map(lambda x: x['host'] + ':' + x['dev'],
629 dev['location'])
630 health_warnings[DEVICE_HEALTH].append(
631 '%s (%s); daemons %s; life expectancy between %s and %s'
632 % (dev['devid'],
633 ','.join(device_locations),
634 ','.join(dev.get('daemons', ['none'])),
635 dev['life_expectancy_max'],
636 dev.get('life_expectancy_min', 'unknown')))
637
638 # OSD might be marked 'out' (which means it has no
639 # data), however PGs are still attached to it.
640 for _id in osds_out:
641 num_pgs = self.get_osd_num_pgs(_id)
642 if num_pgs > 0:
643 health_warnings[DEVICE_HEALTH_IN_USE].append(
644 'osd.%s is marked out '
645 'but still has %s PG(s)' %
646 (_id, num_pgs))
647 if osds_in:
648 self.log.debug('osds_in %s' % osds_in)
649 # calculate target in ratio
650 num_osds = len(osdmap['osds'])
651 num_in = len([x for x in osdmap['osds'] if x['in']])
652 num_bad = len(osds_in)
653 # sort with next-to-fail first
654 bad_osds = sorted(osds_in.items(), key=operator.itemgetter(1))
655 did = 0
656 to_mark_out = []
657 for osd_id, when in bad_osds:
658 ratio = float(num_in - did - 1) / float(num_osds)
659 if ratio < min_in_ratio:
660 final_ratio = float(num_in - num_bad) / float(num_osds)
661 checks[DEVICE_HEALTH_TOOMANY] = {
662 'severity': 'warning',
663 'summary': HEALTH_MESSAGES[DEVICE_HEALTH_TOOMANY],
664 'detail': [
665 '%d OSDs with failing device(s) would bring "in" ratio to %f < mon_osd_min_in_ratio %f' % (
666 num_bad - did, final_ratio, min_in_ratio)
667 ]
668 }
669 break
670 to_mark_out.append(osd_id)
671 did += 1
672 if to_mark_out:
673 self.mark_out_etc(to_mark_out)
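# Worked example of the guard above (illustrative): with num_osds=10,
# num_in=10 and mon_osd_min_in_ratio=0.75, the first two failing OSDs can be
# marked out (ratios 0.9 and 0.8), but a third would drop the "in" ratio to
# 0.7 < 0.75, so DEVICE_HEALTH_TOOMANY is raised instead.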
674 for warning, ls in health_warnings.items():
675 n = len(ls)
676 if n:
677 checks[warning] = {
678 'severity': 'warning',
679 'summary': HEALTH_MESSAGES[warning] % n,
680 'count': len(ls),
681 'detail': ls,
682 }
683 self.set_health_checks(checks)
684 return 0, "", ""
685
686 def is_osd_in(self, osdmap: Dict[str, Any], osd_id: str) -> bool:
687 for osd in osdmap['osds']:
688 if osd_id == str(osd['osd']):
689 return bool(osd['in'])
690 return False
691
692 def get_osd_num_pgs(self, osd_id: str) -> int:
693 stats = self.get('osd_stats')
694 assert stats is not None
695 for stat in stats['osd_stats']:
696 if osd_id == str(stat['osd']):
697 return stat['num_pgs']
698 return -1
699
700 def mark_out_etc(self, osd_ids: List[str]) -> None:
701 self.log.info('Marking out OSDs: %s' % osd_ids)
702 result = CommandResult('')
703 self.send_command(result, 'mon', '', json.dumps({
704 'prefix': 'osd out',
705 'format': 'json',
706 'ids': osd_ids,
707 }), '')
708 r, outb, outs = result.wait()
709 if r != 0:
710 self.log.warning('Could not mark OSD %s out. r: [%s], outb: [%s], outs: [%s]',
711 osd_ids, r, outb, outs)
712 for osd_id in osd_ids:
713 result = CommandResult('')
714 self.send_command(result, 'mon', '', json.dumps({
715 'prefix': 'osd primary-affinity',
716 'format': 'json',
717 'id': int(osd_id),
718 'weight': 0.0,
719 }), '')
720 r, outb, outs = result.wait()
721 if r != 0:
722 self.log.warning('Could not set osd.%s primary-affinity, '
723 'r: [%s], outb: [%s], outs: [%s]',
724 osd_id, r, outb, outs)
725
726 def extract_smart_features(self, raw: Any) -> Any:
727 # FIXME: extract and normalize raw smartctl --json output and
728 # generate a dict of the fields we care about.
729 return raw
730
731 def predict_life_expectancy(self, devid: str) -> Tuple[int, str, str]:
732 plugin_name = ''
733 model = self.get_ceph_option('device_failure_prediction_mode')
734 if cast(str, model).lower() == 'local':
735 plugin_name = 'diskprediction_local'
736 else:
737 return -1, '', 'unable to enable any disk prediction model [local/cloud]'
738 try:
739 can_run, _ = self.remote(plugin_name, 'can_run')
740 if can_run:
741 return self.remote(plugin_name, 'predict_life_expectancy', devid=devid)
742 else:
743 return -1, '', f'{plugin_name} is not available'
744 except Exception:
745 return -1, '', 'unable to invoke diskprediction local or remote plugin'
746
747 def predict_all_devices(self) -> Tuple[int, str, str]:
748 plugin_name = ''
749 model = self.get_ceph_option('device_failure_prediction_mode')
750 if cast(str, model).lower() == 'local':
751 plugin_name = 'diskprediction_local'
752 else:
753 return -1, '', 'unable to enable any disk prediction model [local/cloud]'
754 try:
755 can_run, _ = self.remote(plugin_name, 'can_run')
756 if can_run:
757 return self.remote(plugin_name, 'predict_all_devices')
758 else:
759 return -1, '', f'{plugin_name} is not available'
760 except Exception:
761 return -1, '', 'unable to invoke diskprediction local or remote plugin'
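# Illustrative note: which predictor is used is controlled by the global
# option device_failure_prediction_mode; e.g. (assumed invocation)
#   ceph config set global device_failure_prediction_mode local
# enables the diskprediction_local mgr module path used above.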
762
763 def get_recent_device_metrics(self, devid: str, min_sample: str) -> Dict[str, Dict[str, Any]]:
764 return self._get_device_metrics(devid, min_sample=min_sample)
765
766 def get_time_format(self) -> str:
767 return TIME_FORMAT