]>
git.proxmox.com Git - ceph.git/blob - ceph/src/pybind/mgr/prometheus/module.py
2 from distutils
.version
import StrictVersion
10 from mgr_module
import MgrModule
, MgrStandbyModule
# Defaults for the Prometheus HTTP server. These can also be set in config-key.
# See https://github.com/prometheus/prometheus/wiki/Default-port-allocations
# for the Prometheus exporter port registry.
# When the CherryPy server in 3.2.2 (and later) starts, it attempts to verify
# that the ports it is listening on are in fact bound. When using the any
# address "::" it tries both ipv4 and ipv6, and in some environments (e.g.
# kubernetes) ipv6 isn't yet configured / supported and CherryPy throws an
# uncaught exception.
# For the affected CherryPy version range, replace
# servers.wait_for_occupied_port with a function that does nothing.
if cherrypy is not None:
    v = StrictVersion(cherrypy.__version__)
    # the issue was fixed in 3.2.3. it's present in 3.2.2 (current version on
    # centos:7) and back to at least 3.0.0.
    # NOTE(review): the range tested below starts at 3.1.2, not 3.0.0 --
    # confirm whether releases older than 3.1.2 are meant to be covered.
    if StrictVersion("3.1.2") <= v < StrictVersion("3.2.3"):
        # https://github.com/cherrypy/cherrypy/issues/1100
        from cherrypy.process import servers
        servers.wait_for_occupied_port = lambda host, port: None
33 # cherrypy likes to sys.exit on error. don't let it take us down too!
34 def os_exit_noop(*args
, **kwargs
):
38 os
._exit
= os_exit_noop
41 # to access things in class Module from subclass Root. Because
42 # it's a dict, the writer doesn't need to declare 'global' for access
44 _global_instance
= {'plugin': None}
def global_instance():
    """Return the object stored under ``_global_instance['plugin']``.

    The slot starts out as ``None``; this accessor asserts that it has been
    filled in before handing the stored object back to the caller.
    """
    plugin = _global_instance['plugin']
    assert plugin is not None
    return plugin
52 def health_status_to_number(status
):
54 if status
== 'HEALTH_OK':
56 elif status
== 'HEALTH_WARN':
58 elif status
== 'HEALTH_ERR':
# Keys read from df['stats']; each is exported as a 'cluster_<key>' metric.
DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_objects']

# Per-pool df keys; a 'pool_<key>' metric is created for each entry.
DF_POOL = ['max_avail', 'bytes_used', 'raw_bytes_used', 'objects', 'dirty',
           'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes']

# OSD map flags; each has an 'osd_flag_<flag>' metric that is set to 1 when
# the flag appears in the comma-separated osd_map['flags'] and 0 otherwise.
OSD_FLAGS = ('noup', 'nodown', 'noout', 'noin', 'nobackfill', 'norebalance',
             'norecover', 'noscrub', 'nodeep-scrub')

# presumably the label names of the 'fs_metadata' metric -- TODO confirm
FS_METADATA = ('data_pools', 'fs_id', 'metadata_pool', 'name')
103 MDS_METADATA
= ('ceph_daemon', 'fs_id', 'hostname', 'public_addr', 'rank',
106 MON_METADATA
= ('ceph_daemon', 'hostname', 'public_addr', 'rank', 'ceph_version')
108 OSD_METADATA
= ('back_iface', 'ceph_daemon', 'cluster_addr', 'device_class',
109 'front_iface', 'hostname', 'objectstore', 'public_addr',
112 OSD_STATUS
= ['weight', 'up', 'in']
114 OSD_STATS
= ['apply_latency_ms', 'commit_latency_ms']
116 POOL_METADATA
= ('pool_id', 'name')
118 RGW_METADATA
= ('ceph_daemon', 'hostname', 'ceph_version')
120 DISK_OCCUPATION
= ('ceph_daemon', 'device', 'db_device', 'wal_device', 'instance')
122 NUM_OBJECTS
= ['degraded', 'misplaced', 'unfound']
125 class Metric(object):
126 def __init__(self
, mtype
, name
, desc
, labels
=None):
130 self
.labelnames
= labels
# tuple if present
131 self
.value
= {} # indexed by label values
def set(self, value, labelvalues=None):
    """Record ``value`` for the given label values.

    ``labelvalues`` must be a tuple. A falsy argument (``None`` or an empty
    tuple) is stored under the default key ``('',)``. An existing entry
    with the same key is overwritten.
    """
    if labelvalues:
        key = labelvalues
    else:
        key = ('',)
    self.value[key] = value
141 def str_expfmt(self
):
143 def promethize(path
):
144 ''' replace illegal metric name characters '''
145 result
= path
.replace('.', '_').replace('+', '_plus').replace('::', '_')
147 # Hyphens usually turn into underscores, unless they are
149 if result
.endswith("-"):
150 result
= result
[0:-1] + "_minus"
152 result
= result
.replace("-", "_")
154 return "ceph_{0}".format(result
)
157 ''' represent as Go-compatible float '''
158 if value
== float('inf'):
160 if value
== float('-inf'):
162 if math
.isnan(value
):
164 return repr(float(value
))
166 name
= promethize(self
.name
)
169 # TYPE {name} {mtype}'''.format(
175 for labelvalues
, value
in self
.value
.items():
177 labels
= zip(self
.labelnames
, labelvalues
)
178 labels
= ','.join('%s="%s"' % (k
, v
) for k
, v
in labels
)
182 fmtstr
= '\n{name}{{{labels}}} {value}'
184 fmtstr
= '\n{name} {value}'
185 expfmt
+= fmtstr
.format(
188 value
=floatstr(value
),
193 class Module(MgrModule
):
196 "cmd": "prometheus self-test",
197 "desc": "Run a self test on the prometheus module",
203 {'name': 'server_addr'},
204 {'name': 'server_port'},
205 {'name': 'scrape_interval'},
def __init__(self, *args, **kwargs):
    """Set up metrics, synchronisation and cache state, then register self.

    All positional and keyword arguments are forwarded unchanged to the
    parent class initialiser.
    """
    super(Module, self).__init__(*args, **kwargs)
    self.metrics = self._setup_static_metrics()

    # Synchronisation primitives: the lock is held while a /metrics request
    # is served, the event is set to ask serve() to stop the engine.
    self.collect_lock = threading.RLock()
    self.shutdown_event = threading.Event()

    # Cache of the last collected output. It is reused while it is younger
    # than collect_timeout seconds; serve() replaces the 5.0 default with
    # the 'scrape_interval' configuration value.
    self.collect_cache = None
    self.collect_time = 0
    self.collect_timeout = 5.0

    # Expose this instance to code that calls global_instance().
    _global_instance['plugin'] = self
218 def _setup_static_metrics(self
):
220 metrics
['health_status'] = Metric(
223 'Cluster health status'
225 metrics
['mon_quorum_status'] = Metric(
228 'Monitors in quorum',
231 metrics
['fs_metadata'] = Metric(
237 metrics
['mds_metadata'] = Metric(
243 metrics
['mon_metadata'] = Metric(
249 metrics
['osd_metadata'] = Metric(
256 # The reason for having this separate to OSD_METADATA is
257 # so that we can stably use the same tag names that
258 # the Prometheus node_exporter does
259 metrics
['disk_occupation'] = Metric(
262 'Associate Ceph daemon with disk used',
266 metrics
['pool_metadata'] = Metric(
273 metrics
['rgw_metadata'] = Metric(
280 metrics
['pg_total'] = Metric(
286 for flag
in OSD_FLAGS
:
287 path
= 'osd_flag_{}'.format(flag
)
288 metrics
[path
] = Metric(
291 'OSD Flag {}'.format(flag
)
293 for state
in OSD_STATUS
:
294 path
= 'osd_{}'.format(state
)
295 metrics
[path
] = Metric(
298 'OSD status {}'.format(state
),
301 for stat
in OSD_STATS
:
302 path
= 'osd_{}'.format(stat
)
303 metrics
[path
] = Metric(
306 'OSD stat {}'.format(stat
),
309 for state
in PG_STATES
:
310 path
= 'pg_{}'.format(state
)
311 metrics
[path
] = Metric(
314 'PG {}'.format(state
),
316 for state
in DF_CLUSTER
:
317 path
= 'cluster_{}'.format(state
)
318 metrics
[path
] = Metric(
321 'DF {}'.format(state
),
323 for state
in DF_POOL
:
324 path
= 'pool_{}'.format(state
)
325 metrics
[path
] = Metric(
328 'DF pool {}'.format(state
),
331 for state
in NUM_OBJECTS
:
332 path
= 'num_objects_{}'.format(state
)
333 metrics
[path
] = Metric(
336 'Number of {} objects'.format(state
),
341 def get_health(self
):
342 health
= json
.loads(self
.get('health')['json'])
343 self
.metrics
['health_status'].set(
344 health_status_to_number(health
['status'])
348 # maybe get the to-be-exported metrics from a config?
350 for stat
in DF_CLUSTER
:
351 self
.metrics
['cluster_{}'.format(stat
)].set(df
['stats'][stat
])
353 for pool
in df
['pools']:
355 self
.metrics
['pool_{}'.format(stat
)].set(
361 fs_map
= self
.get('fs_map')
362 servers
= self
.get_service_list()
364 for fs
in fs_map
['filesystems']:
365 # collect fs metadata
366 data_pools
= ",".join([str(pool
) for pool
in fs
['mdsmap']['data_pools']])
367 self
.metrics
['fs_metadata'].set(1, (
370 fs
['mdsmap']['metadata_pool'],
371 fs
['mdsmap']['fs_name']
373 self
.log
.debug('mdsmap: {}'.format(fs
['mdsmap']))
374 for gid
, daemon
in fs
['mdsmap']['info'].items():
376 host_version
= servers
.get((id_
, 'mds'), ('',''))
377 self
.metrics
['mds_metadata'].set(1, (
378 'mds.{}'.format(id_
), fs
['id'],
379 host_version
[0], daemon
['addr'],
380 daemon
['rank'], host_version
[1]
383 def get_quorum_status(self
):
384 mon_status
= json
.loads(self
.get('mon_status')['json'])
385 servers
= self
.get_service_list()
386 for mon
in mon_status
['monmap']['mons']:
389 host_version
= servers
.get((id_
, 'mon'), ('',''))
390 self
.metrics
['mon_metadata'].set(1, (
391 'mon.{}'.format(id_
), host_version
[0],
392 mon
['public_addr'].split(':')[0], rank
,
395 in_quorum
= int(rank
in mon_status
['quorum'])
396 self
.metrics
['mon_quorum_status'].set(in_quorum
, (
397 'mon.{}'.format(id_
),
400 def get_pg_status(self
):
401 # TODO add per pool status?
402 pg_status
= self
.get('pg_status')
404 # Set total count of PGs, first
405 self
.metrics
['pg_total'].set(pg_status
['num_pgs'])
408 for pg
in pg_status
['pgs_by_state']:
409 for state
in pg
['state_name'].split('+'):
410 reported_states
[state
] = reported_states
.get(state
, 0) + pg
['count']
412 for state
in reported_states
:
413 path
= 'pg_{}'.format(state
)
415 self
.metrics
[path
].set(reported_states
[state
])
417 self
.log
.warn("skipping pg in unknown state {}".format(state
))
419 for state
in PG_STATES
:
420 if state
not in reported_states
:
422 self
.metrics
['pg_{}'.format(state
)].set(0)
424 self
.log
.warn("skipping pg in unknown state {}".format(state
))
426 def get_osd_stats(self
):
427 osd_stats
= self
.get('osd_stats')
428 for osd
in osd_stats
['osd_stats']:
430 for stat
in OSD_STATS
:
431 val
= osd
['perf_stat'][stat
]
432 self
.metrics
['osd_{}'.format(stat
)].set(val
, (
433 'osd.{}'.format(id_
),
436 def get_service_list(self
):
438 for server
in self
.list_servers():
439 version
= server
.get('ceph_version', '')
440 host
= server
.get('hostname', '')
441 for service
in server
.get('services', []):
442 ret
.update({(service
['id'], service
['type']): (host
, version
)})
445 def get_metadata_and_osd_status(self
):
446 osd_map
= self
.get('osd_map')
447 osd_flags
= osd_map
['flags'].split(',')
448 for flag
in OSD_FLAGS
:
449 self
.metrics
['osd_flag_{}'.format(flag
)].set(
450 int(flag
in osd_flags
)
453 osd_devices
= self
.get('osd_map_crush')['devices']
454 servers
= self
.get_service_list()
455 for osd
in osd_map
['osds']:
456 # id can be used to link osd metrics and metadata
458 # collect osd metadata
459 p_addr
= osd
['public_addr'].split(':')[0]
460 c_addr
= osd
['cluster_addr'].split(':')[0]
461 if p_addr
== "-" or c_addr
== "-":
463 "Missing address metadata for osd {0}, skipping occupation"
464 " and metadata records for this osd".format(id_
)
469 for osd_device
in osd_devices
:
470 if osd_device
['id'] == id_
:
471 dev_class
= osd_device
.get('class', '')
474 if dev_class
is None:
476 "OSD {0} is missing from CRUSH map, skipping output".format(
480 host_version
= servers
.get((str(id_
), 'osd'), ('',''))
482 # collect disk occupation metadata
483 osd_metadata
= self
.get_metadata("osd", str(id_
))
484 if osd_metadata
is None:
487 obj_store
= osd_metadata
.get('osd_objectstore', '')
488 f_iface
= osd_metadata
.get('front_iface', '')
489 b_iface
= osd_metadata
.get('back_iface', '')
491 self
.metrics
['osd_metadata'].set(1, (
493 'osd.{}'.format(id_
),
504 for state
in OSD_STATUS
:
506 self
.metrics
['osd_{}'.format(state
)].set(status
, (
507 'osd.{}'.format(id_
),
510 if obj_store
== "filestore":
511 # collect filestore backend device
512 osd_dev_node
= osd_metadata
.get('backend_filestore_dev_node', None)
513 # collect filestore journal device
514 osd_wal_dev_node
= osd_metadata
.get('osd_journal', '')
516 elif obj_store
== "bluestore":
517 # collect bluestore backend device
518 osd_dev_node
= osd_metadata
.get('bluestore_bdev_dev_node', None)
519 # collect bluestore wal backend
520 osd_wal_dev_node
= osd_metadata
.get('bluefs_wal_dev_node', '')
521 # collect bluestore db backend
522 osd_db_dev_node
= osd_metadata
.get('bluefs_db_dev_node', '')
523 if osd_dev_node
and osd_dev_node
== "unknown":
526 osd_hostname
= osd_metadata
.get('hostname', None)
527 if osd_dev_node
and osd_hostname
:
528 self
.log
.debug("Got dev for osd {0}: {1}/{2}".format(
529 id_
, osd_hostname
, osd_dev_node
))
530 self
.metrics
['disk_occupation'].set(1, (
531 "osd.{0}".format(id_
),
538 self
.log
.info("Missing dev node metadata for osd {0}, skipping "
539 "occupation record for this osd".format(id_
))
542 for pool
in osd_map
['pools']:
543 self
.metrics
['pool_metadata'].set(1, (pool
['pool'], pool
['pool_name']))
545 # Populate rgw_metadata
546 for key
, value
in servers
.items():
547 service_id
, service_type
= key
548 if service_type
!= 'rgw':
550 hostname
, version
= value
551 self
.metrics
['rgw_metadata'].set(
553 ('{}.{}'.format(service_type
, service_id
), hostname
, version
)
def get_num_objects(self):
    """Copy the ``num_objects_*`` counters of the PG summary into metrics.

    For every entry of ``NUM_OBJECTS`` the value found at
    ``pg_summary['pg_stats_sum']['stat_sum']['num_objects_<entry>']`` is
    stored on the metric registered under that same name.
    """
    stat_sum = self.get('pg_summary')['pg_stats_sum']['stat_sum']
    names = ['num_objects_{}'.format(kind) for kind in NUM_OBJECTS]
    for name in names:
        self.metrics[name].set(stat_sum[name])
563 # Clear the metrics before scraping
564 for k
in self
.metrics
.keys():
565 self
.metrics
[k
].clear()
571 self
.get_quorum_status()
572 self
.get_metadata_and_osd_status()
574 self
.get_num_objects()
576 for daemon
, counters
in self
.get_all_perf_counters().items():
577 for path
, counter_info
in counters
.items():
578 # Skip histograms, they are represented by long running avgs
579 stattype
= self
._stattype
_to
_str
(counter_info
['type'])
580 if not stattype
or stattype
== 'histogram':
581 self
.log
.debug('ignoring %s, type %s' % (path
, stattype
))
584 # Get the value of the counter
585 value
= self
._perfvalue
_to
_value
(counter_info
['type'], counter_info
['value'])
587 # Represent the long running avgs as sum/count pairs
588 if counter_info
['type'] & self
.PERFCOUNTER_LONGRUNAVG
:
589 _path
= path
+ '_sum'
590 if _path
not in self
.metrics
:
591 self
.metrics
[_path
] = Metric(
594 counter_info
['description'] + ' Total',
597 self
.metrics
[_path
].set(value
, (daemon
,))
599 _path
= path
+ '_count'
600 if _path
not in self
.metrics
:
601 self
.metrics
[_path
] = Metric(
604 counter_info
['description'] + ' Count',
607 self
.metrics
[_path
].set(counter_info
['count'], (daemon
,))
609 if path
not in self
.metrics
:
610 self
.metrics
[path
] = Metric(
613 counter_info
['description'],
616 self
.metrics
[path
].set(value
, (daemon
,))
618 # Return formatted metrics and clear no longer used data
619 _metrics
= [m
.str_expfmt() for m
in self
.metrics
.values()]
620 for k
in self
.metrics
.keys():
621 self
.metrics
[k
].clear()
623 return ''.join(_metrics
) + '\n'
625 def handle_command(self
, cmd
):
626 if cmd
['prefix'] == 'prometheus self-test':
628 return 0, '', 'Self-test OK'
630 return (-errno
.EINVAL
, '',
631 "Command not found '{0}'".format(cmd
['prefix']))
637 # collapse everything to '/'
638 def _cp_dispatch(self
, vpath
):
639 cherrypy
.request
.path
= ''
644 return '''<!DOCTYPE html>
646 <head><title>Ceph Exporter</title></head>
648 <h1>Ceph Exporter</h1>
649 <p><a href='/metrics'>Metrics</a></p>
655 instance
= global_instance()
656 # Lock the function execution
658 instance
.collect_lock
.acquire()
659 return self
._metrics
(instance
)
661 instance
.collect_lock
.release()
663 def _metrics(self
, instance
):
664 # Return cached data if available and collected before the cache times out
665 if instance
.collect_cache
and time
.time() - instance
.collect_time
< instance
.collect_timeout
:
666 cherrypy
.response
.headers
['Content-Type'] = 'text/plain'
667 return instance
.collect_cache
669 if instance
.have_mon_connection():
670 instance
.collect_cache
= None
671 instance
.collect_time
= time
.time()
672 instance
.collect_cache
= instance
.collect()
673 cherrypy
.response
.headers
['Content-Type'] = 'text/plain'
674 return instance
.collect_cache
676 raise cherrypy
.HTTPError(503, 'No MON connection')
678 # Make the cache timeout for collecting configurable
679 self
.collect_timeout
= self
.get_localized_config('scrape_interval', 5.0)
681 server_addr
= self
.get_localized_config('server_addr', DEFAULT_ADDR
)
682 server_port
= self
.get_localized_config('server_port', DEFAULT_PORT
)
684 "server_addr: %s server_port: %s" %
685 (server_addr
, server_port
)
688 # Publish the URI that others may use to access the service we're
689 # about to start serving
690 self
.set_uri('http://{0}:{1}/'.format(
691 socket
.getfqdn() if server_addr
== '::' else server_addr
,
695 cherrypy
.config
.update({
696 'server.socket_host': server_addr
,
697 'server.socket_port': int(server_port
),
698 'engine.autoreload.on': False
700 cherrypy
.tree
.mount(Root(), "/")
701 self
.log
.info('Starting engine...')
702 cherrypy
.engine
.start()
703 self
.log
.info('Engine started.')
704 # wait for the shutdown event
705 self
.shutdown_event
.wait()
706 self
.shutdown_event
.clear()
707 cherrypy
.engine
.stop()
708 self
.log
.info('Engine stopped.')
711 self
.log
.info('Stopping engine...')
712 self
.shutdown_event
.set()
715 class StandbyModule(MgrStandbyModule
):
def __init__(self, *args, **kwargs):
    """Initialise the standby module and create its shutdown event.

    All arguments are forwarded unchanged to the parent class initialiser.
    """
    super(StandbyModule, self).__init__(*args, **kwargs)
    # Waited on after the CherryPy engine has been started; setting it
    # lets the waiting code proceed to stop the engine.
    self.shutdown_event = threading.Event()
721 server_addr
= self
.get_localized_config('server_addr', '::')
722 server_port
= self
.get_localized_config('server_port', DEFAULT_PORT
)
723 self
.log
.info("server_addr: %s server_port: %s" % (server_addr
, server_port
))
724 cherrypy
.config
.update({
725 'server.socket_host': server_addr
,
726 'server.socket_port': int(server_port
),
727 'engine.autoreload.on': False
736 active_uri
= module
.get_active_uri()
737 return '''<!DOCTYPE html>
739 <head><title>Ceph Exporter</title></head>
741 <h1>Ceph Exporter</h1>
742 <p><a href='{}metrics'>Metrics</a></p>
744 </html>'''.format(active_uri
)
748 cherrypy
.response
.headers
['Content-Type'] = 'text/plain'
751 cherrypy
.tree
.mount(Root(), '/', {})
752 self
.log
.info('Starting engine...')
753 cherrypy
.engine
.start()
754 self
.log
.info('Engine started.')
755 # Wait for shutdown event
756 self
.shutdown_event
.wait()
757 self
.shutdown_event
.clear()
758 cherrypy
.engine
.stop()
759 self
.log
.info('Engine stopped.')
762 self
.log
.info("Stopping engine...")
763 self
.shutdown_event
.set()
764 self
.log
.info("Stopped engine")