# Extraction artifact: the two lines above this point in the scrape were a
# stray "]>" and a gitweb header URL; the file is
# ceph/src/pybind/mgr/devicehealth/module.py (Device health monitoring module).
2 Device health monitoring
7 from mgr_module
import MgrModule
, CommandResult
10 from threading
import Event
11 from datetime
import datetime
, timedelta
, date
, time
, timezone
12 from six
import iteritems
14 TIME_FORMAT
= '%Y%m%d-%H%M%S'
16 DEVICE_HEALTH
= 'DEVICE_HEALTH'
17 DEVICE_HEALTH_IN_USE
= 'DEVICE_HEALTH_IN_USE'
18 DEVICE_HEALTH_TOOMANY
= 'DEVICE_HEALTH_TOOMANY'
20 DEVICE_HEALTH
: '%d device(s) expected to fail soon',
21 DEVICE_HEALTH_IN_USE
: '%d daemon(s) expected to fail soon and still contain data',
22 DEVICE_HEALTH_TOOMANY
: 'Too many daemons are expected to fail soon',
# NOTE(review): garbled extraction of the Module class header.  What remains
# are fragments of two class-level attributes:
#   - MODULE_OPTIONS: option descriptors (each a dict with 'name', 'default',
#     'desc'; the list brackets and 'type'/'runtime' keys were lost).
#   - COMMANDS: CLI command descriptors ('cmd'/'desc'; 'perm' keys lost).
# Restore from upstream ceph src/pybind/mgr/devicehealth/module.py rather
# than editing these fragments by hand.
28 class Module(MgrModule
):
# --- MODULE_OPTIONS fragments (name / default / desc per option) ---
31 'name': 'enable_monitoring',
34 'desc': 'monitor device health metrics',
38 'name': 'scrape_frequency',
41 'desc': 'how frequently to scrape device health metrics',
46 'default': 'device_health_metrics',
48 'desc': 'name of pool in which to store device health metrics',
52 'name': 'retention_period',
53 'default': (86400 * 180),
55 'desc': 'how long to retain device health metrics',
59 'name': 'mark_out_threshold',
60 'default': (86400 * 14 * 2),
62 'desc': 'automatically mark OSD if it may fail before this long',
66 'name': 'warn_threshold',
67 'default': (86400 * 14 * 6),
69 'desc': 'raise health warning if OSD may fail before this long',
76 'desc': 'preemptively heal cluster around devices that may fail',
80 'name': 'sleep_interval',
83 'desc': 'how frequently to wake up and check device health',
# --- COMMANDS fragments (CLI commands dispatched via handle_command) ---
90 "cmd": "device query-daemon-health-metrics "
91 "name=who,type=CephString",
92 "desc": "Get device health metrics for a given daemon",
96 "cmd": "device scrape-daemon-health-metrics "
97 "name=who,type=CephString",
98 "desc": "Scrape and store device health metrics "
103 "cmd": "device scrape-health-metrics "
104 "name=devid,type=CephString,req=False",
105 "desc": "Scrape and store health metrics",
109 "cmd": "device get-health-metrics "
110 "name=devid,type=CephString "
111 "name=sample,type=CephString,req=False",
112 "desc": "Show stored device metrics for the device",
116 "cmd": "device check-health",
117 "desc": "Check life expectancy of devices",
121 "cmd": "device monitoring on",
122 "desc": "Enable device health monitoring",
126 "cmd": "device monitoring off",
127 "desc": "Disable device health monitoring",
131 'cmd': 'device predict-life-expectancy '
132 'name=devid,type=CephString,req=true',
133 'desc': 'Predict life expectancy with local predictor',
def __init__(self, *args, **kwargs):
    """Initialize the devicehealth mgr module.

    Seeds every MODULE_OPTIONS entry as an instance attribute from its
    declared default so option values are usable before serve() invokes
    config_notify(), and prepares the state used by the serve() loop.

    NOTE(review): reconstructed from a garbled extraction; interior lines
    were lost -- confirm the self.run / self.event initialization against
    upstream before relying on this.
    """
    super(Module, self).__init__(*args, **kwargs)

    # populate options (just until serve() runs)
    for opt in self.MODULE_OPTIONS:
        setattr(self, opt['name'], opt['default'])

    # serve() loop control; 'device monitoring on' sets the event to wake
    # the loop early (see serve(), which waits on self.event).
    self.run = True
    self.event = Event()

    # becomes True once the metrics pool is known to exist (see notify()).
    self.has_device_pool = False
def is_valid_daemon_name(self, who):
    """Return True iff *who* looks like '<type>.<id>' where <type> is
    'osd' or 'mon' (the only daemon types that implement 'smart')."""
    parts = who.split('.')
    if len(parts) != 2:
        return False
    if parts[0] not in ('osd', 'mon'):
        return False
    return True
def handle_command(self, _, cmd):
    """Dispatch a `device ...` CLI command to the matching handler.

    :param _: inbuf (unused).
    :param cmd: parsed command dict; dispatch is on cmd['prefix'].
    :return: (retval, stdout, stderr) tuple.

    NOTE(review): reconstructed from a garbled extraction; confirm the
    'query-daemon-health-metrics' and 'monitoring on/off' interiors
    against upstream.  The unconditional error-level log below is an
    upstream debugging leftover, kept to preserve behavior.
    """
    self.log.error("handle_command")

    if cmd['prefix'] == 'device query-daemon-health-metrics':
        who = cmd.get('who', '')
        if not self.is_valid_daemon_name(who):
            return -errno.EINVAL, '', 'not a valid mon or osd daemon name'
        (daemon_type, daemon_id) = cmd.get('who', '').split('.')
        # pass through the daemon's raw `smart` output
        result = CommandResult('')
        self.send_command(result, daemon_type, daemon_id, json.dumps({
            'prefix': 'smart',
            'format': 'json',
        }), '')
        r, outb, outs = result.wait()
        return r, outb, outs
    elif cmd['prefix'] == 'device scrape-daemon-health-metrics':
        who = cmd.get('who', '')
        if not self.is_valid_daemon_name(who):
            return -errno.EINVAL, '', 'not a valid mon or osd daemon name'
        (daemon_type, daemon_id) = cmd.get('who', '').split('.')
        return self.scrape_daemon(daemon_type, daemon_id)
    elif cmd['prefix'] == 'device scrape-health-metrics':
        if 'devid' in cmd:
            return self.scrape_device(cmd['devid'])
        return self.scrape_all()
    elif cmd['prefix'] == 'device get-health-metrics':
        return self.show_device_metrics(cmd['devid'], cmd.get('sample'))
    elif cmd['prefix'] == 'device check-health':
        return self.check_health()
    elif cmd['prefix'] == 'device monitoring on':
        self.set_module_option('enable_monitoring', True)
        # wake the serve() loop so monitoring starts immediately
        self.event.set()
        return 0, '', ''
    elif cmd['prefix'] == 'device monitoring off':
        self.set_module_option('enable_monitoring', False)
        self.set_health_checks({})  # avoid stuck health alerts
        return 0, '', ''
    elif cmd['prefix'] == 'device predict-life-expectancy':
        return self.predict_lift_expectancy(cmd['devid'])

    # mgr should respect our self.COMMANDS and not call us for
    # any prefix we don't advertise
    raise NotImplementedError(cmd['prefix'])
# NOTE(review): fragment of Module.self_test(); its `def` line and several
# interior lines (the `if devs:` guard and the `assert r == 0` checks) were
# lost in extraction.  The visible logic: pick the first OSD's first device
# id, snapshot its stored metrics, scrape once, and assert the stored
# metrics changed.
204 osdmap
= self
.get('osd_map')
205 osd_id
= osdmap
['osds'][0]['osd']
206 osdmeta
= self
.get('osd_metadata')
# device_ids metadata looks like 'name=id ...'; the first '=' split below
# extracts the id portion.
207 devs
= osdmeta
.get(str(osd_id
), {}).get('device_ids')
209 devid
= devs
.split()[0].split('=')[1]
210 (r
, before
, err
) = self
.show_device_metrics(devid
, '')
212 (r
, out
, err
) = self
.scrape_device(devid
)
214 (r
, after
, err
) = self
.show_device_metrics(devid
, '')
# the scrape must have stored a new sample
216 assert before
!= after
def config_notify(self):
    """Refresh all MODULE_OPTIONS attributes from the mgr configuration.

    Called by the mgr whenever module config changes (and once from
    serve()); mirrors each declared option onto `self.<name>`.
    """
    for opt in self.MODULE_OPTIONS:
        setattr(self,
                opt['name'],
                self.get_module_option(opt['name']))
        self.log.debug(' %s = %s', opt['name'], getattr(self, opt['name']))
def notify(self, notify_type, notify_id):
    """React to mgr notifications.

    On an osd_map change, lazily create the device_health_metrics pool
    the first time (only when monitoring is enabled).
    """
    # create device_health_metrics pool if it doesn't exist
    if notify_type == "osd_map" and self.enable_monitoring:
        if not self.has_device_pool:
            self.create_device_pool()
            self.has_device_pool = True
def create_device_pool(self):
    """Create the pool used to store device health metrics.

    Issues `osd pool create` then `osd pool application enable` against
    the monitors.

    NOTE(review): reconstructed from a garbled extraction; the exact
    pool-create arguments (pg_num etc.) were lost -- confirm against
    upstream before changing.
    """
    self.log.debug('create %s pool' % self.pool_name)
    # create pool
    result = CommandResult('')
    self.send_command(result, 'mon', '', json.dumps({
        'prefix': 'osd pool create',
        'format': 'json',
        'pool': self.pool_name,
        'pg_num': 1,
    }), '')
    r, outb, outs = result.wait()
    assert r == 0

    # set pool application
    result = CommandResult('')
    self.send_command(result, 'mon', '', json.dumps({
        'prefix': 'osd pool application enable',
        'format': 'json',
        'pool': self.pool_name,
        'app': 'mgr_devicehealth',
    }), '')
    r, outb, outs = result.wait()
    assert r == 0
# NOTE(review): fragment of Module.serve(), the module's main loop; its
# `def` line, the `while self.run:` header, the scrape call and several
# branch/except bodies were lost in extraction.  Visible logic: restore
# last_scrape from the mgr store, then loop -- when monitoring is enabled,
# align the next scrape to scrape_frequency, scrape/predict when due,
# persist last_scrape, and sleep on self.event for sleep_interval.
257 self
.log
.info("Starting")
# restore persisted scrape timestamp (TIME_FORMAT-encoded)
261 ls
= self
.get_store('last_scrape')
264 last_scrape
= datetime
.strptime(ls
, TIME_FORMAT
)
# a corrupt stored value is ignored rather than fatal
265 except ValueError as e
:
267 self
.log
.debug('Last scrape %s', last_scrape
)
270 if self
.enable_monitoring
:
271 self
.log
.debug('Running')
274 now
= datetime
.utcnow()
278 # align to scrape interval
279 scrape_frequency
= int(self
.scrape_frequency
) or 86400
280 seconds
= (last_scrape
- datetime
.utcfromtimestamp(0)).total_seconds()
281 seconds
-= seconds
% scrape_frequency
282 seconds
+= scrape_frequency
283 next_scrape
= datetime
.utcfromtimestamp(seconds
)
285 self
.log
.debug('Last scrape %s, next scrape due %s',
286 last_scrape
.strftime(TIME_FORMAT
),
287 next_scrape
.strftime(TIME_FORMAT
))
289 self
.log
.debug('Last scrape never, next scrape due %s',
290 next_scrape
.strftime(TIME_FORMAT
))
291 if now
>= next_scrape
:
293 self
.predict_all_devices()
295 self
.set_store('last_scrape', last_scrape
.strftime(TIME_FORMAT
))
# event.wait() doubles as interruptible sleep; 'device monitoring on'
# sets the event to wake the loop early
298 sleep_interval
= int(self
.sleep_interval
) or 60
299 self
.log
.debug('Sleeping for %d seconds', sleep_interval
)
300 ret
= self
.event
.wait(sleep_interval
)
304 self
.log
.info('Stopping')
def open_connection(self, create_if_missing=True):
    """Open a rados ioctx on the metrics pool.

    :param create_if_missing: when True and monitoring is enabled, create
        the pool if it is not known to exist yet.
    :return: an open ioctx, or None if there are no OSDs or the pool is
        missing and must not be created.
    """
    osdmap = self.get("osd_map")
    assert osdmap is not None
    if len(osdmap['osds']) == 0:
        # no OSDs yet: nowhere to store metrics
        return None
    if not self.has_device_pool:
        if not create_if_missing:
            return None
        if self.enable_monitoring:
            self.create_device_pool()
            self.has_device_pool = True
    ioctx = self.rados.open_ioctx(self.pool_name)
    return ioctx
def scrape_daemon(self, daemon_type, daemon_id):
    """Scrape SMART metrics from one daemon and persist them.

    :return: (0, "", "") CLI-style tuple.

    NOTE(review): reconstructed from a garbled extraction; the exact
    guard around a None ioctx was lost -- confirm against upstream.
    """
    ioctx = self.open_connection()
    if ioctx:
        raw_smart_data = self.do_scrape_daemon(daemon_type, daemon_id)
        if raw_smart_data:
            for device, raw_data in raw_smart_data.items():
                data = self.extract_smart_features(raw_data)
                self.put_device_metrics(ioctx, device, data)
        ioctx.close()
    return 0, "", ""
def scrape_all(self):
    """Scrape SMART metrics from every OSD and mon daemon and persist them.

    Deduplicates devices reported by more than one daemon (a device is
    stored only once per scrape pass).

    :return: (0, "", "") CLI-style tuple.

    NOTE(review): reconstructed from a garbled extraction; the guard for a
    None ioctx was lost -- confirm against upstream.
    """
    osdmap = self.get("osd_map")
    assert osdmap is not None
    ioctx = self.open_connection()
    if not ioctx:
        return 0, "", ""
    did_device = {}
    ids = []
    for osd in osdmap['osds']:
        ids.append(('osd', str(osd['osd'])))
    monmap = self.get("mon_map")
    for mon in monmap['mons']:
        ids.append(('mon', mon['name']))
    for daemon_type, daemon_id in ids:
        raw_smart_data = self.do_scrape_daemon(daemon_type, daemon_id)
        if not raw_smart_data:
            continue
        for device, raw_data in raw_smart_data.items():
            if device in did_device:
                self.log.debug('skipping duplicate %s' % device)
                continue
            did_device[device] = 1
            data = self.extract_smart_features(raw_data)
            self.put_device_metrics(ioctx, device, data)
    ioctx.close()
    return 0, "", ""
def scrape_device(self, devid):
    """Scrape SMART metrics for a single device via one of its daemons.

    :return: CLI-style tuple; -ENOENT if the device is unknown, -EAGAIN
        if no active daemon claims it, else (0, "", "").
    """
    r = self.get("device " + devid)
    if not r or 'device' not in r.keys():
        return -errno.ENOENT, '', 'device ' + devid + ' not found'
    daemons = r['device'].get('daemons', [])
    if not daemons:
        return (-errno.EAGAIN, '',
                'device ' + devid + ' not claimed by any active daemons')
    # any claiming daemon can report the device; use the first
    (daemon_type, daemon_id) = daemons[0].split('.')
    ioctx = self.open_connection()
    if ioctx:
        raw_smart_data = self.do_scrape_daemon(daemon_type, daemon_id,
                                               devid=devid)
        if raw_smart_data:
            for device, raw_data in raw_smart_data.items():
                data = self.extract_smart_features(raw_data)
                self.put_device_metrics(ioctx, device, data)
        ioctx.close()
    return 0, "", ""
def do_scrape_daemon(self, daemon_type, daemon_id, devid=''):
    """Ask one daemon for its SMART data via the `smart` admin command.

    :param devid: optionally restrict the scrape to a single device id.
    :return: a dict, or None if the scrape failed.
    """
    self.log.debug('do_scrape_daemon %s.%s' % (daemon_type, daemon_id))
    result = CommandResult('')
    self.send_command(result, daemon_type, daemon_id, json.dumps({
        'prefix': 'smart',
        'format': 'json',
        'devid': devid,
    }), '')
    r, outb, outs = result.wait()
    try:
        return json.loads(outb)
    except (IndexError, ValueError):
        self.log.error(
            "Fail to parse JSON result from daemon {0}.{1} ({2})".format(
                daemon_type, daemon_id, outb))
        # explicit None on parse failure (was implicit fall-through)
        return None
def put_device_metrics(self, ioctx, devid, data):
    """Append one metrics sample for *devid* and prune expired samples.

    Samples are stored as omap entries on the object named *devid*, keyed
    by a UTC timestamp (TIME_FORMAT); keys older than retention_period
    are removed in the same write op.

    NOTE(review): reconstructed from a garbled extraction -- confirm the
    pruning loop body and the `if erase:` guard against upstream.
    """
    old_key = datetime.utcnow() - timedelta(
        seconds=int(self.retention_period))
    prune = old_key.strftime(TIME_FORMAT)
    self.log.debug('put_device_metrics device %s prune %s' %
                   (devid, prune))
    erase = []
    try:
        with rados.ReadOpCtx() as op:
            omap_iter, ret = ioctx.get_omap_keys(op, "", MAX_SAMPLES)  # fixme
            assert ret == 0
            ioctx.operate_read_op(op, devid)
            # keys sort chronologically; collect those older than `prune`
            for key, _ in list(omap_iter):
                if key >= prune:
                    break
                erase.append(key)
    except rados.ObjectNotFound:
        # The object doesn't already exist, no problem.
        pass
    except rados.Error as e:
        # Do not proceed with writes if something unexpected
        # went wrong with the reads.
        self.log.exception("Error reading OMAP: {0}".format(e))
        return

    key = datetime.utcnow().strftime(TIME_FORMAT)
    self.log.debug('put_device_metrics device %s key %s = %s, erase %s' %
                   (devid, key, data, erase))
    with rados.WriteOpCtx() as op:
        ioctx.set_omap(op, (key,), (str(json.dumps(data)),))
        if erase:
            ioctx.remove_omap_keys(op, tuple(erase))
        ioctx.operate_write_op(op, devid)
# NOTE(review): garbled extraction of Module._get_device_metrics(); the
# result-dict initialization, the None-ioctx guard, the MAX_SAMPLES argument
# continuation, the `res[key] = v` store and the `return res` were lost.
# Visible logic: read up to MAX_SAMPLES omap values from object *devid*
# (optionally a single `sample` key, or keys >= `min_sample`), JSON-decode
# each, tolerate a missing object, and log any other RADOS error.
440 def _get_device_metrics(self
, devid
, sample
=None, min_sample
=None):
# open read-only; do not create the pool just to read metrics
442 ioctx
= self
.open_connection(create_if_missing
=False)
446 with rados
.ReadOpCtx() as op
:
447 omap_iter
, ret
= ioctx
.get_omap_vals(op
, min_sample
or '', sample
or '',
451 ioctx
.operate_read_op(op
, devid
)
452 for key
, value
in list(omap_iter
):
# when a specific sample was requested, stop at the first mismatch
453 if sample
and key
!= sample
:
455 if min_sample
and key
< min_sample
:
458 v
= json
.loads(value
)
# malformed stored values are skipped, not fatal
459 except (ValueError, IndexError):
460 self
.log
.debug('unable to parse value for %s: "%s"' %
# missing object means no metrics stored yet
464 except rados
.ObjectNotFound
:
466 except rados
.Error
as e
:
467 self
.log
.exception("RADOS error reading omap: {0}".format(e
))
def show_device_metrics(self, devid, sample):
    """Return stored metrics for *devid* as pretty-printed JSON.

    :param sample: optional single sample key to fetch; None/'' for all.
    :return: CLI-style tuple; -ENOENT if the device is unknown.
    """
    # verify device exists
    r = self.get("device " + devid)
    if not r or 'device' not in r.keys():
        return -errno.ENOENT, '', 'device ' + devid + ' not found'
    # fetch metrics
    res = self._get_device_metrics(devid, sample=sample)
    return 0, json.dumps(res, indent=4, sort_keys=True), ''
# NOTE(review): garbled extraction of Module.check_health(); many interior
# lines (dict/list initializations, `continue` statements, the self_heal
# branch, several closing brackets) were lost.  Visible logic: for every
# device with a usable life_expectancy_max, collect OSDs due to fail within
# mark_out_threshold (candidates for mark-out), raise DEVICE_HEALTH warnings
# within warn_threshold, warn about out-but-not-empty OSDs, refuse to mark
# out so many OSDs that the 'in' ratio would drop below
# mon_osd_min_in_ratio, then publish the accumulated health checks.
480 def check_health(self
):
481 self
.log
.info('Check health')
482 config
= self
.get('config')
483 min_in_ratio
= float(config
.get('mon_osd_min_in_ratio'))
484 mark_out_threshold_td
= timedelta(seconds
=int(self
.mark_out_threshold
))
485 warn_threshold_td
= timedelta(seconds
=int(self
.warn_threshold
))
489 DEVICE_HEALTH_IN_USE
: [],
491 devs
= self
.get("devices")
# timezone-aware "now" to compare against the %z-parsed expectancy stamps
494 now
= datetime
.now(timezone
.utc
) # e.g. '2021-09-22 13:18:45.021712+00:00'
495 osdmap
= self
.get("osd_map")
496 assert osdmap
is not None
497 for dev
in devs
['devices']:
499 if 'life_expectancy_max' not in dev
:
501 # ignore devices that are not consumed by any daemons
502 if not dev
['daemons']:
504 if not dev
['life_expectancy_max'] or \
505 dev
['life_expectancy_max'] == '0.000000':
507 # life_expectancy_(min/max) is in the format of:
508 # '%Y-%m-%dT%H:%M:%S.%f%z', e.g.:
509 # '2019-01-20 21:12:12.000000+00:00'
510 life_expectancy_max
= datetime
.strptime(
511 dev
['life_expectancy_max'],
512 '%Y-%m-%dT%H:%M:%S.%f%z')
513 self
.log
.debug('device %s expectancy max %s', dev
,
516 if life_expectancy_max
- now
<= mark_out_threshold_td
:
518 # dev['daemons'] == ["osd.0","osd.1","osd.2"]
520 osds
= [x
for x
in dev
['daemons']
521 if x
.startswith('osd.')]
# strip the 'osd.' prefix to get numeric ids
522 osd_ids
= map(lambda x
: x
[4:], osds
)
524 if self
.is_osd_in(osdmap
, _id
):
525 osds_in
[_id
] = life_expectancy_max
529 if life_expectancy_max
- now
<= warn_threshold_td
:
530 # device can appear in more than one location in case
532 device_locations
= map(lambda x
: x
['host'] + ':' + x
['dev'],
534 health_warnings
[DEVICE_HEALTH
].append(
535 '%s (%s); daemons %s; life expectancy between %s and %s'
537 ','.join(device_locations
),
538 ','.join(dev
.get('daemons', ['none'])),
539 dev
['life_expectancy_max'],
540 dev
.get('life_expectancy_max', 'unknown')))
# NOTE(review): both "between %s and %s" arguments above use
# life_expectancy_max; the pair was presumably meant to be min and max
# -- confirm against upstream before changing.
542 # OSD might be marked 'out' (which means it has no
543 # data), however PGs are still attached to it.
545 num_pgs
= self
.get_osd_num_pgs(_id
)
547 health_warnings
[DEVICE_HEALTH_IN_USE
].append(
548 'osd.%s is marked out '
549 'but still has %s PG(s)' %
552 self
.log
.debug('osds_in %s' % osds_in
)
553 # calculate target in ratio
554 num_osds
= len(osdmap
['osds'])
555 num_in
= len([x
for x
in osdmap
['osds'] if x
['in']])
556 num_bad
= len(osds_in
)
557 # sort with next-to-fail first
558 bad_osds
= sorted(osds_in
.items(), key
=operator
.itemgetter(1))
561 for osd_id
, when
in bad_osds
:
562 ratio
= float(num_in
- did
- 1) / float(num_osds
)
563 if ratio
< min_in_ratio
:
564 final_ratio
= float(num_in
- num_bad
) / float(num_osds
)
565 checks
[DEVICE_HEALTH_TOOMANY
] = {
566 'severity': 'warning',
567 'summary': HEALTH_MESSAGES
[DEVICE_HEALTH_TOOMANY
],
569 '%d OSDs with failing device(s) would bring "in" ratio to %f < mon_osd_min_in_ratio %f' % (num_bad
- did
, final_ratio
, min_in_ratio
)
573 to_mark_out
.append(osd_id
)
576 self
.mark_out_etc(to_mark_out
)
# publish one health check per warning category that collected entries
577 for warning
, ls
in iteritems(health_warnings
):
581 'severity': 'warning',
582 'summary': HEALTH_MESSAGES
[warning
] % n
,
586 self
.set_health_checks(checks
)
def is_osd_in(self, osdmap, osd_id):
    """Return True iff *osd_id* is currently 'in' according to *osdmap*.

    Ids are compared as strings so int and str ids both match.
    NOTE(review): the not-found return was lost in extraction; a falsy
    False is returned here -- confirm against upstream.
    """
    for osd in osdmap['osds']:
        if str(osd_id) == str(osd['osd']):
            return bool(osd['in'])
    return False
def get_osd_num_pgs(self, osd_id):
    """Return the number of PGs currently mapped to *osd_id*.

    Ids are compared as strings so int and str ids both match.
    NOTE(review): the not-found return was lost in extraction; -1 is
    returned here -- confirm against upstream.
    """
    stats = self.get('osd_stats')
    assert stats is not None
    for stat in stats['osd_stats']:
        if str(osd_id) == str(stat['osd']):
            return stat['num_pgs']
    return -1
def mark_out_etc(self, osd_ids):
    """Mark the given OSDs `out` and zero their primary-affinity.

    Called by check_health() for OSDs whose devices are predicted to fail
    within mark_out_threshold.  Failures are logged, never raised.

    Fix: the primary-affinity warning format string had three %s
    placeholders but four arguments (osd_id, r, outb, outs), which would
    raise TypeError exactly when the warning was needed; an `outb`
    placeholder is added to match.
    """
    self.log.info('Marking out OSDs: %s' % osd_ids)
    result = CommandResult('')
    self.send_command(result, 'mon', '', json.dumps({
        'prefix': 'osd out',
        'format': 'json',
        'ids': osd_ids,
    }), '')
    r, outb, outs = result.wait()
    if r != 0:
        self.log.warning('Could not mark OSD %s out. r: [%s], outb: [%s], outs: [%s]' % (osd_ids, r, outb, outs))
    for osd_id in osd_ids:
        result = CommandResult('')
        self.send_command(result, 'mon', '', json.dumps({
            'prefix': 'osd primary-affinity',
            'format': 'json',
            'id': int(osd_id),
            'weight': 0.0,
        }), '')
        r, outb, outs = result.wait()
        if r != 0:
            self.log.warning('Could not set osd.%s primary-affinity, r: [%s], outb: [%s], outs: [%s]' % (osd_id, r, outb, outs))
def extract_smart_features(self, raw):
    """Return the metrics dict to store for one device.

    # FIXME: extract and normalize raw smartctl --json output and
    # generate a dict of the fields we care about.
    Currently a passthrough of the raw daemon output.
    """
    return raw
def predict_lift_expectancy(self, devid):
    """Predict life expectancy of *devid* via a diskprediction plugin.

    (The name keeps the historical 'lift' typo: handle_command dispatches
    on it, so renaming would break callers.)

    :return: the plugin's (r, out, err) tuple; an error tuple when no
        prediction model is configured or the plugin call fails; None
        when the plugin reports it cannot run (pre-existing behavior).

    Fix: the bare `except:` is narrowed to `except Exception:` so
    SystemExit/KeyboardInterrupt are no longer swallowed.
    """
    plugin_name = ''
    model = self.get_ceph_option('device_failure_prediction_mode')
    if model and model.lower() == 'cloud':
        plugin_name = 'diskprediction_cloud'
    elif model and model.lower() == 'local':
        plugin_name = 'diskprediction_local'
    else:
        return -1, '', 'unable to enable any disk prediction model[local/cloud]'
    try:
        can_run, _ = self.remote(plugin_name, 'can_run')
        if can_run:
            return self.remote(plugin_name, 'predict_life_expectancy',
                               devid=devid)
    except Exception:
        return -1, '', 'unable to invoke diskprediction local or remote plugin'
def predict_all_devices(self):
    """Run life-expectancy prediction for all devices via a plugin.

    Same plugin-selection and error contract as predict_lift_expectancy().

    Fix: the bare `except:` is narrowed to `except Exception:` so
    SystemExit/KeyboardInterrupt are no longer swallowed.
    """
    plugin_name = ''
    model = self.get_ceph_option('device_failure_prediction_mode')
    if model and model.lower() == 'cloud':
        plugin_name = 'diskprediction_cloud'
    elif model and model.lower() == 'local':
        plugin_name = 'diskprediction_local'
    else:
        return -1, '', 'unable to enable any disk prediction model[local/cloud]'
    try:
        can_run, _ = self.remote(plugin_name, 'can_run')
        if can_run:
            return self.remote(plugin_name, 'predict_all_devices')
    except Exception:
        return -1, '', 'unable to invoke diskprediction local or remote plugin'
def get_recent_device_metrics(self, devid, min_sample):
    """Return stored metrics for *devid* no older than *min_sample*.

    Thin public wrapper over _get_device_metrics for remote callers
    (e.g. prediction plugins).
    """
    return self._get_device_metrics(devid, min_sample=min_sample)
666 def get_time_format(self
):