# ceph/src/pybind/mgr/devicehealth/module.py
"""
Device health monitoring
"""

import errno
import json
import operator

import rados
from mgr_module import MgrModule, CommandResult
from threading import Event
from datetime import datetime, timedelta, date, time
from six import iteritems

TIME_FORMAT = '%Y%m%d-%H%M%S'

DEVICE_HEALTH = 'DEVICE_HEALTH'
DEVICE_HEALTH_IN_USE = 'DEVICE_HEALTH_IN_USE'
DEVICE_HEALTH_TOOMANY = 'DEVICE_HEALTH_TOOMANY'
HEALTH_MESSAGES = {
    DEVICE_HEALTH: '%d device(s) expected to fail soon',
    DEVICE_HEALTH_IN_USE: '%d daemon(s) expected to fail soon and still contain data',
    DEVICE_HEALTH_TOOMANY: 'Too many daemons are expected to fail soon',
}
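
# For reference, the payload that check_health() below ultimately hands to
# set_health_checks() is a dict keyed by check name (values here are
# illustrative only):
#
#   {
#       DEVICE_HEALTH: {
#           'severity': 'warning',
#           'summary': HEALTH_MESSAGES[DEVICE_HEALTH] % 1,
#           'detail': ['<devid> (host:dev); daemons osd.0; ...'],
#       },
#   }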


class Module(MgrModule):
    MODULE_OPTIONS = [
        {
            'name': 'enable_monitoring',
            'default': False,
            'desc': 'monitor device health metrics',
        },
        {
            'name': 'scrape_frequency',
            'default': 86400,
            'desc': 'how frequently to scrape device health metrics',
        },
        {
            'name': 'pool_name',
            'default': 'device_health_metrics',
            'desc': 'name of pool in which to store device health metrics',
        },
        {
            'name': 'retention_period',
            'default': (86400 * 180),
            'desc': 'how long to retain device health metrics',
        },
        {
            'name': 'mark_out_threshold',
            'default': (86400 * 14 * 2),
            'desc': 'automatically mark OSD if it may fail before this long',
        },
        {
            'name': 'warn_threshold',
            'default': (86400 * 14 * 6),
            'desc': 'raise health warning if OSD may fail before this long',
        },
        {
            'name': 'self_heal',
            'default': True,
            'desc': 'preemptively heal cluster around devices that may fail',
        },
        {
            'name': 'sleep_interval',
            'default': 600,
            'desc': 'how frequently to wake up and check device health',
        },
    ]
88 "cmd": "device query-daemon-health-metrics "
89 "name=who,type=CephString",
90 "desc": "Get device health metrics for a given daemon",
94 "cmd": "device scrape-daemon-health-metrics "
95 "name=who,type=CephString",
96 "desc": "Scrape and store device health metrics "
101 "cmd": "device scrape-health-metrics "
102 "name=devid,type=CephString,req=False",
103 "desc": "Scrape and store health metrics",
107 "cmd": "device get-health-metrics "
108 "name=devid,type=CephString "
109 "name=sample,type=CephString,req=False",
110 "desc": "Show stored device metrics for the device",
114 "cmd": "device check-health",
115 "desc": "Check life expectancy of devices",
119 "cmd": "device monitoring on",
120 "desc": "Enable device health monitoring",
124 "cmd": "device monitoring off",
125 "desc": "Disable device health monitoring",
129 'cmd': 'device predict-life-expectancy '
130 'name=devid,type=CephString,req=true',
131 'desc': 'Predict life expectancy with local predictor',
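
    # These descriptors are surfaced through the `ceph` CLI; for example
    # (illustrative invocations, the device id is hypothetical):
    #
    #   ceph device scrape-health-metrics
    #   ceph device get-health-metrics ST2000NX0253_S4611234
    #   ceph device monitoring on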

    def __init__(self, *args, **kwargs):
        super(Module, self).__init__(*args, **kwargs)

        # populate options (just until serve() runs)
        for opt in self.MODULE_OPTIONS:
            setattr(self, opt['name'], opt['default'])

        # other
        self.run = True
        self.event = Event()

    def is_valid_daemon_name(self, who):
        l = who.split('.')
        if len(l) != 2:
            return False
        if l[0] not in ('osd', 'mon'):
            return False
        return True
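
    # For instance: is_valid_daemon_name('osd.12') and
    # is_valid_daemon_name('mon.a') are accepted, while 'mds.a' or a bare
    # 'osd' are rejected.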

    def handle_command(self, _, cmd):
        self.log.error("handle_command")

        if cmd['prefix'] == 'device query-daemon-health-metrics':
            who = cmd.get('who', '')
            if not self.is_valid_daemon_name(who):
                return -errno.EINVAL, '', 'not a valid mon or osd daemon name'
            (daemon_type, daemon_id) = cmd.get('who', '').split('.')
            result = CommandResult('')
            self.send_command(result, daemon_type, daemon_id, json.dumps({
                'prefix': 'smart',
                'format': 'json',
            }), '')
            r, outb, outs = result.wait()
            return r, outb, outs
        elif cmd['prefix'] == 'device scrape-daemon-health-metrics':
            who = cmd.get('who', '')
            if not self.is_valid_daemon_name(who):
                return -errno.EINVAL, '', 'not a valid mon or osd daemon name'
            (daemon_type, daemon_id) = cmd.get('who', '').split('.')
            return self.scrape_daemon(daemon_type, daemon_id)
        elif cmd['prefix'] == 'device scrape-health-metrics':
            if 'devid' in cmd:
                return self.scrape_device(cmd['devid'])
            return self.scrape_all()
        elif cmd['prefix'] == 'device get-health-metrics':
            return self.show_device_metrics(cmd['devid'], cmd.get('sample'))
        elif cmd['prefix'] == 'device check-health':
            return self.check_health()
        elif cmd['prefix'] == 'device monitoring on':
            self.set_module_option('enable_monitoring', True)
            self.event.set()
            return 0, '', ''
        elif cmd['prefix'] == 'device monitoring off':
            self.set_module_option('enable_monitoring', False)
            self.set_health_checks({})  # avoid stuck health alerts
            return 0, '', ''
        elif cmd['prefix'] == 'device predict-life-expectancy':
            return self.predict_life_expectancy(cmd['devid'])
        else:
            # mgr should respect our self.COMMANDS and not call us for
            # any prefix we don't advertise
            raise NotImplementedError(cmd['prefix'])

    def self_test(self):
        osdmap = self.get('osd_map')
        osd_id = osdmap['osds'][0]['osd']
        osdmeta = self.get('osd_metadata')
        devs = osdmeta.get(str(osd_id), {}).get('device_ids')
        if devs:
            devid = devs.split()[0].split('=')[1]
            (r, before, err) = self.show_device_metrics(devid, '')
            assert r == 0
            (r, out, err) = self.scrape_device(devid)
            assert r == 0
            (r, after, err) = self.show_device_metrics(devid, '')
            assert r == 0
            assert before != after

    def config_notify(self):
        for opt in self.MODULE_OPTIONS:
            setattr(self,
                    opt['name'],
                    self.get_module_option(opt['name']))
            self.log.debug(' %s = %s', opt['name'], getattr(self, opt['name']))

    def serve(self):
        self.log.info("Starting")
        self.config_notify()

        last_scrape = None
        ls = self.get_store('last_scrape')
        if ls:
            try:
                last_scrape = datetime.strptime(ls, TIME_FORMAT)
            except ValueError:
                pass
        self.log.debug('Last scrape %s', last_scrape)

        while self.run:
            if self.enable_monitoring:
                self.log.debug('Running')
                self.check_health()

                now = datetime.utcnow()
                if not last_scrape:
                    next_scrape = now
                else:
                    # align to scrape interval
                    scrape_frequency = int(self.scrape_frequency) or 86400
                    seconds = (last_scrape - datetime.utcfromtimestamp(0)).total_seconds()
                    seconds -= seconds % scrape_frequency
                    seconds += scrape_frequency
                    next_scrape = datetime.utcfromtimestamp(seconds)
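                    # For example (hypothetical timestamps): with
                    # scrape_frequency=86400 and last_scrape at epoch
                    # 1546398245 (2019-01-02 03:04:05 UTC),
                    #   1546398245 - (1546398245 % 86400) = 1546387200,
                    #   + 86400 -> 1546473600 = 2019-01-03 00:00:00 UTC,
                    # i.e. scrapes stay aligned to interval boundaries.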
                if last_scrape:
                    self.log.debug('Last scrape %s, next scrape due %s',
                                   last_scrape.strftime(TIME_FORMAT),
                                   next_scrape.strftime(TIME_FORMAT))
                else:
                    self.log.debug('Last scrape never, next scrape due %s',
                                   next_scrape.strftime(TIME_FORMAT))
                if now >= next_scrape:
                    self.scrape_all()
                    self.predict_all_devices()
                    last_scrape = now
                    self.set_store('last_scrape',
                                   last_scrape.strftime(TIME_FORMAT))

            # sleep
            sleep_interval = int(self.sleep_interval) or 60
            self.log.debug('Sleeping for %d seconds', sleep_interval)
            self.event.wait(sleep_interval)
            self.event.clear()

    def shutdown(self):
        self.log.info('Stopping')
        self.run = False
        self.event.set()

    def open_connection(self, create_if_missing=True):
        pools = self.rados.list_pools()
        is_pool = False
        for pool in pools:
            if pool == self.pool_name:
                is_pool = True
                break
        if not is_pool:
            if not create_if_missing:
                return None
            self.log.debug('create %s pool' % self.pool_name)
            # create pool
            result = CommandResult('')
            self.send_command(result, 'mon', '', json.dumps({
                'prefix': 'osd pool create',
                'format': 'json',
                'pool': self.pool_name,
                'pg_num': 1,
            }), '')
            r, outb, outs = result.wait()
            assert r == 0

            # set pool application
            result = CommandResult('')
            self.send_command(result, 'mon', '', json.dumps({
                'prefix': 'osd pool application enable',
                'format': 'json',
                'pool': self.pool_name,
                'app': 'mgr_devicehealth',
            }), '')
            r, outb, outs = result.wait()
            assert r == 0

        ioctx = self.rados.open_ioctx(self.pool_name)
        return ioctx

    def scrape_daemon(self, daemon_type, daemon_id):
        ioctx = self.open_connection()
        raw_smart_data = self.do_scrape_daemon(daemon_type, daemon_id)
        if raw_smart_data:
            for device, raw_data in raw_smart_data.items():
                data = self.extract_smart_features(raw_data)
                self.put_device_metrics(ioctx, device, data)
        ioctx.close()
        return 0, "", ""

    def scrape_all(self):
        osdmap = self.get("osd_map")
        assert osdmap is not None
        ioctx = self.open_connection()
        did_device = {}
        ids = []
        for osd in osdmap['osds']:
            ids.append(('osd', str(osd['osd'])))
        monmap = self.get("mon_map")
        for mon in monmap['mons']:
            ids.append(('mon', mon['name']))
        for daemon_type, daemon_id in ids:
            raw_smart_data = self.do_scrape_daemon(daemon_type, daemon_id)
            if not raw_smart_data:
                continue
            for device, raw_data in raw_smart_data.items():
                if device in did_device:
                    self.log.debug('skipping duplicate %s' % device)
                    continue
                did_device[device] = 1
                data = self.extract_smart_features(raw_data)
                self.put_device_metrics(ioctx, device, data)
        ioctx.close()
        return 0, "", ""

    def scrape_device(self, devid):
        r = self.get("device " + devid)
        if not r or 'device' not in r.keys():
            return -errno.ENOENT, '', 'device ' + devid + ' not found'
        daemons = r['device'].get('daemons', [])
        if not daemons:
            return (-errno.EAGAIN, '',
                    'device ' + devid + ' not claimed by any active daemons')
        (daemon_type, daemon_id) = daemons[0].split('.')
        ioctx = self.open_connection()
        raw_smart_data = self.do_scrape_daemon(daemon_type, daemon_id,
                                               devid=devid)
        if raw_smart_data:
            for device, raw_data in raw_smart_data.items():
                data = self.extract_smart_features(raw_data)
                self.put_device_metrics(ioctx, device, data)
        ioctx.close()
        return 0, "", ""

    def do_scrape_daemon(self, daemon_type, daemon_id, devid=''):
        """
        :return: a dict, or None if the scrape failed.
        """
        self.log.debug('do_scrape_daemon %s.%s' % (daemon_type, daemon_id))
        result = CommandResult('')
        self.send_command(result, daemon_type, daemon_id, json.dumps({
            'prefix': 'smart',
            'format': 'json',
            'devid': devid,
        }), '')
        r, outb, outs = result.wait()

        try:
            return json.loads(outb)
        except (IndexError, ValueError):
            self.log.error(
                "Failed to parse JSON result from daemon {0}.{1} ({2})".format(
                    daemon_type, daemon_id, outb))

    def put_device_metrics(self, ioctx, devid, data):
        old_key = datetime.utcnow() - timedelta(
            seconds=int(self.retention_period))
        prune = old_key.strftime(TIME_FORMAT)
        self.log.debug('put_device_metrics device %s prune %s' %
                       (devid, prune))
        erase = []
        try:
            with rados.ReadOpCtx() as op:
                omap_iter, ret = ioctx.get_omap_keys(op, "", 500)  # fixme
                assert ret == 0
                ioctx.operate_read_op(op, devid)
                for key, _ in list(omap_iter):
                    if key >= prune:
                        break
                    erase.append(key)
        except rados.ObjectNotFound:
            # The object doesn't already exist, no problem.
            pass
        except rados.Error as e:
            # Do not proceed with writes if something unexpected
            # went wrong with the reads.
            self.log.exception("Error reading OMAP: {0}".format(e))
            return

        key = datetime.utcnow().strftime(TIME_FORMAT)
        self.log.debug('put_device_metrics device %s key %s = %s, erase %s' %
                       (devid, key, data, erase))
        with rados.WriteOpCtx() as op:
            ioctx.set_omap(op, (key,), (str(json.dumps(data)),))
            if len(erase):
                ioctx.remove_omap_keys(op, tuple(erase))
            ioctx.operate_write_op(op, devid)
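
    # Storage layout written above (one RADOS object per device id in
    # self.pool_name, one omap entry per scrape):
    #
    #   object name : <devid>
    #   omap key    : scrape time in TIME_FORMAT, e.g. '20190102-030405'
    #   omap value  : JSON dump of the extracted SMART data
    #
    # TIME_FORMAT keys sort lexicographically in chronological order, which
    # is what lets the pruning loop above stop at the first key >= prune.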

    def show_device_metrics(self, devid, sample):
        # verify device exists
        r = self.get("device " + devid)
        if not r or 'device' not in r.keys():
            return -errno.ENOENT, '', 'device ' + devid + ' not found'
        # fetch metrics
        res = {}
        ioctx = self.open_connection(create_if_missing=False)
        if not ioctx:
            return 0, json.dumps(res, indent=4), ''
        with rados.ReadOpCtx() as op:
            omap_iter, ret = ioctx.get_omap_vals(op, "", sample or '', 500)  # fixme
            assert ret == 0
            try:
                ioctx.operate_read_op(op, devid)
                for key, value in list(omap_iter):
                    if sample and key != sample:
                        break
                    try:
                        v = json.loads(value)
                    except (ValueError, IndexError):
                        self.log.debug('unable to parse value for %s: "%s"' %
                                       (key, value))
                        continue
                    res[key] = v
            except rados.ObjectNotFound:
                pass
            except rados.Error as e:
                self.log.exception("RADOS error reading omap: {0}".format(e))

        return 0, json.dumps(res, indent=4), ''
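
    # The returned blob maps scrape timestamps to stored samples, e.g.
    # (shape only; the fields depend on extract_smart_features()):
    #
    #   {
    #       "20190102-030405": { ...smart fields... },
    #       "20190103-030405": { ...smart fields... }
    #   }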

    def check_health(self):
        self.log.info('Check health')
        config = self.get('config')
        min_in_ratio = float(config.get('mon_osd_min_in_ratio'))
        mark_out_threshold_td = timedelta(seconds=int(self.mark_out_threshold))
        warn_threshold_td = timedelta(seconds=int(self.warn_threshold))
        checks = {}
        health_warnings = {
            DEVICE_HEALTH: [],
            DEVICE_HEALTH_IN_USE: [],
        }
        devs = self.get("devices")
        osds_in = {}
        osds_out = {}
        now = datetime.utcnow()
        osdmap = self.get("osd_map")
        assert osdmap is not None
        for dev in devs['devices']:
            if 'life_expectancy_max' not in dev:
                continue
            # ignore devices that are not consumed by any daemons
            if not dev['daemons']:
                continue
            if not dev['life_expectancy_max'] or \
               dev['life_expectancy_max'] == '0.000000':
                continue
            # life_expectancy_(min/max) is in the format of:
            # '%Y-%m-%d %H:%M:%S.%f', e.g.:
            # '2019-01-20 21:12:12.000000'
            life_expectancy_max = datetime.strptime(
                dev['life_expectancy_max'],
                '%Y-%m-%d %H:%M:%S.%f')
            self.log.debug('device %s expectancy max %s', dev,
                           life_expectancy_max)

            if life_expectancy_max - now <= mark_out_threshold_td:
                if self.self_heal:
                    # dev['daemons'] == ["osd.0","osd.1","osd.2"]
                    osds = [x for x in dev['daemons']
                            if x.startswith('osd.')]
                    osd_ids = map(lambda x: x[4:], osds)
                    for _id in osd_ids:
                        if self.is_osd_in(osdmap, _id):
                            osds_in[_id] = life_expectancy_max
                        else:
                            osds_out[_id] = 1

            if life_expectancy_max - now <= warn_threshold_td:
                # device can appear in more than one location in case
                # of SCSI multipath
                device_locations = map(lambda x: x['host'] + ':' + x['dev'],
                                       dev['location'])
                health_warnings[DEVICE_HEALTH].append(
                    '%s (%s); daemons %s; life expectancy between %s and %s'
                    % (dev['devid'],
                       ','.join(device_locations),
                       ','.join(dev.get('daemons', ['none'])),
                       dev['life_expectancy_max'],
                       dev.get('life_expectancy_min', 'unknown')))

        # OSD might be marked 'out' (which means it has no
        # data), however PGs are still attached to it.
        for _id in osds_out:
            num_pgs = self.get_osd_num_pgs(_id)
            if num_pgs > 0:
                health_warnings[DEVICE_HEALTH_IN_USE].append(
                    'osd.%s is marked out '
                    'but still has %s PG(s)' %
                    (_id, num_pgs))
        if osds_in:
            self.log.debug('osds_in %s' % osds_in)
            # calculate target in ratio
            num_osds = len(osdmap['osds'])
            num_in = len([x for x in osdmap['osds'] if x['in']])
            num_bad = len(osds_in)
            # sort with next-to-fail first
            bad_osds = sorted(osds_in.items(), key=operator.itemgetter(1))
            did = 0
            to_mark_out = []
            for osd_id, when in bad_osds:
                ratio = float(num_in - did - 1) / float(num_osds)
                if ratio < min_in_ratio:
                    final_ratio = float(num_in - num_bad) / float(num_osds)
                    checks[DEVICE_HEALTH_TOOMANY] = {
                        'severity': 'warning',
                        'summary': HEALTH_MESSAGES[DEVICE_HEALTH_TOOMANY],
                        'detail': [
                            '%d OSDs with failing device(s) would bring "in" '
                            'ratio to %f < mon_osd_min_in_ratio %f'
                            % (num_bad - did, final_ratio, min_in_ratio)
                        ]
                    }
                    break
                to_mark_out.append(osd_id)
                did += 1
            if to_mark_out:
                self.mark_out_etc(to_mark_out)
        for warning, ls in iteritems(health_warnings):
            n = len(ls)
            if n:
                checks[warning] = {
                    'severity': 'warning',
                    'summary': HEALTH_MESSAGES[warning] % n,
                    'detail': ls,
                }
        self.set_health_checks(checks)
        return 0, "", ""
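
    # Worked example for the in-ratio guard above (hypothetical numbers):
    # with num_osds=10, num_in=10, min_in_ratio=0.75 and 3 failing OSDs,
    # the first two iterations compute ratios 9/10 and 8/10 and queue those
    # OSDs for mark-out; the third computes (10 - 2 - 1)/10 = 0.7 < 0.75,
    # so the remaining OSD stays in and DEVICE_HEALTH_TOOMANY is raised.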

    def is_osd_in(self, osdmap, osd_id):
        for osd in osdmap['osds']:
            if str(osd_id) == str(osd['osd']):
                return bool(osd['in'])
        return False

    def get_osd_num_pgs(self, osd_id):
        stats = self.get('osd_stats')
        assert stats is not None
        for stat in stats['osd_stats']:
            if str(osd_id) == str(stat['osd']):
                return stat['num_pgs']
        return -1

    def mark_out_etc(self, osd_ids):
        self.log.info('Marking out OSDs: %s' % osd_ids)
        result = CommandResult('')
        self.send_command(result, 'mon', '', json.dumps({
            'prefix': 'osd out',
            'format': 'json',
            'ids': osd_ids,
        }), '')
        r, outb, outs = result.wait()
        if r != 0:
            self.log.warn('Could not mark OSD %s out. r: [%s], outb: [%s], '
                          'outs: [%s]' % (osd_ids, r, outb, outs))
        for osd_id in osd_ids:
            result = CommandResult('')
            self.send_command(result, 'mon', '', json.dumps({
                'prefix': 'osd primary-affinity',
                'format': 'json',
                'id': int(osd_id),
                'weight': 0.0,
            }), '')
            r, outb, outs = result.wait()
            if r != 0:
                self.log.warn('Could not set osd.%s primary-affinity, '
                              'r: [%s], outb: [%s], outs: [%s]'
                              % (osd_id, r, outb, outs))
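
    # Setting primary-affinity to 0 complements marking the OSD out: it makes
    # the OSD unlikely to be selected as the primary for any PG, so client
    # reads are steered away from the failing device while its data drains.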

    def extract_smart_features(self, raw):
        # FIXME: extract and normalize the raw smartctl --json output and
        # generate a dict of the fields we care about.
        return raw

    def predict_life_expectancy(self, devid):
        plugin_name = ''
        model = self.get_ceph_option('device_failure_prediction_mode')
        if model and model.lower() == 'cloud':
            plugin_name = 'diskprediction_cloud'
        elif model and model.lower() == 'local':
            plugin_name = 'diskprediction_local'
        else:
            return -1, '', 'unable to enable any disk prediction model [local/cloud]'
        try:
            can_run, _ = self.remote(plugin_name, 'can_run')
            if can_run:
                return self.remote(plugin_name, 'predict_life_expectancy',
                                   devid=devid)
        except Exception:
            pass
        return -1, '', 'unable to invoke diskprediction local or remote plugin'

    def predict_all_devices(self):
        plugin_name = ''
        model = self.get_ceph_option('device_failure_prediction_mode')
        if model and model.lower() == 'cloud':
            plugin_name = 'diskprediction_cloud'
        elif model and model.lower() == 'local':
            plugin_name = 'diskprediction_local'
        else:
            return -1, '', 'unable to enable any disk prediction model [local/cloud]'
        try:
            can_run, _ = self.remote(plugin_name, 'can_run')
            if can_run:
                return self.remote(plugin_name, 'predict_all_devices')
        except Exception:
            pass
        return -1, '', 'unable to invoke diskprediction local or remote plugin'