]>
git.proxmox.com Git - ceph.git/blob - ceph/src/pybind/mgr/prometheus/module.py
import errno
import json
import math
import os
from collections import OrderedDict

import cherrypy

from mgr_module import MgrModule
9 # Defaults for the Prometheus HTTP server. Can also set in config-key
10 # see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
11 # for Prometheus exporter port registry
# cherrypy likes to sys.exit on error. don't let it take us down too!
def os_exit_noop(*args, **kwargs):
    """No-op stand-in for os._exit so cherrypy error paths cannot kill
    the whole ceph-mgr process."""
    pass


# Neuter os._exit for this process (cherrypy may call it on error).
os._exit = os_exit_noop
# to access things in class Module from subclass Root. Because
# it's a dict, the writer doesn't need to declare 'global' for access
_global_instance = {'plugin': None}


def global_instance():
    """Return the registered Module instance.

    Raises AssertionError if Module.__init__ has not yet registered
    itself in _global_instance.
    """
    assert _global_instance['plugin'] is not None
    return _global_instance['plugin']
def health_status_to_number(status):
    """Map a Ceph health status string to a numeric metric value.

    HEALTH_OK -> 0, HEALTH_WARN -> 1, HEALTH_ERR -> 2.  Any other
    string falls through and returns None.

    NOTE(review): the return values were dropped by the fragment;
    reconstructed as the conventional 0/1/2 mapping — verify upstream.
    """
    if status == 'HEALTH_OK':
        return 0
    elif status == 'HEALTH_WARN':
        return 1
    elif status == 'HEALTH_ERR':
        return 2
# All PG states a pg_summary entry may report; one gauge is created per state.
PG_STATES = ['creating', 'active', 'clean', 'down', 'scrubbing', 'deep', 'degraded',
             'inconsistent', 'peering', 'repair', 'recovering', 'forced-recovery',
             'backfill', 'forced-backfill', 'wait-backfill', 'backfill-toofull',
             'incomplete', 'stale', 'remapped', 'undersized', 'peered']

# Cluster-wide 'df' statistics exported as cluster_* gauges.
DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_objects']

# Per-pool 'df' statistics exported as pool_* gauges (labelled by pool_id).
DF_POOL = ['max_avail', 'bytes_used', 'raw_bytes_used', 'objects', 'dirty',
           'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes']

# Label names for the osd_metadata metric.
OSD_METADATA = ('cluster_addr', 'device_class', 'id', 'public_addr')

# Per-OSD status fields read straight off the OSD map entries.
OSD_STATUS = ['weight', 'up', 'in']

# Per-OSD perf_stat latencies exported as osd_* gauges.
OSD_STATS = ['apply_latency_ms', 'commit_latency_ms']

# Label names for the pool_metadata metric.
POOL_METADATA = ('pool_id', 'name')

# Label names for disk_occupation; chosen to match node_exporter's tags.
DISK_OCCUPATION = ('instance', 'device', 'ceph_daemon')
def __init__(self, mtype, name, desc, labels=None):
    """A single exported metric: type, name, help text and optional labels.

    NOTE(review): the mtype/name/desc assignments were dropped by the
    fragment; reconstructed from how str_expfmt reads self.mtype /
    self.name / self.desc — verify upstream.
    """
    self.mtype = mtype
    self.name = name
    self.desc = desc
    self.labelnames = labels  # tuple if present
    self.value = dict()  # indexed by label values
def set(self, value, labelvalues=None):
    """Record *value* for the given label-value tuple.

    labelvalues must be a tuple; when omitted or falsy the value is
    stored under the single empty-string label key ('',).
    """
    labelvalues = labelvalues or ('',)
    self.value[labelvalues] = value
def promethize(path):
    ''' replace illegal metric name characters '''
    result = path.replace('.', '_').replace('+', '_plus').replace('::', '_')

    # Hyphens usually turn into underscores, unless they are
    # trailing, in which case the trailing hyphen becomes "_minus"
    if result.endswith("-"):
        result = result[0:-1] + "_minus"

    result = result.replace("-", "_")

    return "ceph_{0}".format(result)
def floatstr(value):
    ''' represent as Go-compatible float '''
    # NOTE(review): the return values were dropped by the fragment;
    # reconstructed as the Prometheus/Go spellings — verify upstream.
    if value == float('inf'):
        return '+Inf'
    if value == float('-inf'):
        return '-Inf'
    if math.isnan(value):
        return 'NaN'
    return repr(float(value))
def str_expfmt(self):
    """Render this metric in the Prometheus text exposition format:
    a # HELP / # TYPE header followed by one sample line per stored
    label-value tuple.

    NOTE(review): reconstructed from a gappy fragment (the HELP line,
    expfmt initialisation and the labels/no-labels branching were
    partially lost) — verify against upstream.
    """
    name = promethize(self.name)
    expfmt = '''
# HELP {name} {desc}
# TYPE {name} {mtype}'''.format(name=name, desc=self.desc, mtype=self.mtype)

    for labelvalues, value in self.value.items():
        if self.labelnames:
            labels = zip(self.labelnames, labelvalues)
            labels = ','.join('%s="%s"' % (k, v) for k, v in labels)
        else:
            labels = ''
        if labels:
            fmtstr = '\n{name}{{{labels}}} {value}'
        else:
            fmtstr = '\n{name} {value}'
        expfmt += fmtstr.format(
            name=name,
            labels=labels,
            value=floatstr(value),
        )
    return expfmt
class Module(MgrModule):
    """Prometheus exporter: collects cluster metrics and serves them
    over HTTP in the Prometheus exposition format."""

    # NOTE(review): list structure and 'perm' reconstructed from a
    # fragment that only preserved cmd/desc — verify upstream.
    COMMANDS = [
        {
            "cmd": "prometheus self-test",
            "desc": "Run a self test on the prometheus module",
            "perm": "rw"
        },
    ]
def __init__(self, *args, **kwargs):
    """Set up static metrics and register this instance for access
    via global_instance()."""
    super(Module, self).__init__(*args, **kwargs)
    self.notified = False
    # NOTE(review): one assignment here (original line 143) was lost in
    # the fragment; upstream has self.serving = False — verify.
    self.serving = False
    self.metrics = self._setup_static_metrics()
    self.schema = OrderedDict()
    # publish ourselves so the HTTP Root handler can reach collect()
    _global_instance['plugin'] = self
def _stattype_to_str(self, stattype):
    """Translate a perf-counter type bitmask into a Prometheus type name.

    NOTE(review): the return statements were dropped by the fragment;
    reconstructed (gauge/counter/counter/histogram, '' for unknown) —
    verify against upstream.
    """
    typeonly = stattype & self.PERFCOUNTER_TYPE_MASK
    if typeonly == 0:
        return 'gauge'
    if typeonly == self.PERFCOUNTER_LONGRUNAVG:
        # this lie matches the DaemonState decoding: only val, no counts
        return 'counter'
    if typeonly == self.PERFCOUNTER_COUNTER:
        return 'counter'
    if typeonly == self.PERFCOUNTER_HISTOGRAM:
        return 'histogram'

    return ''
def _setup_static_metrics(self):
    """Build the dict of statically-known metrics (health, quorum,
    OSD/pool metadata, per-state OSD/PG/DF gauges) that collect()
    will later populate.

    NOTE(review): every Metric(...) argument list was dropped by the
    fragment; mtypes and label tuples reconstructed — verify upstream.
    """
    metrics = {}
    metrics['health_status'] = Metric(
        'untyped',
        'health_status',
        'Cluster health status'
    )
    metrics['mon_quorum_count'] = Metric(
        'gauge',
        'mon_quorum_count',
        'Monitors in quorum'
    )
    metrics['osd_metadata'] = Metric(
        'untyped',
        'osd_metadata',
        'OSD Metadata',
        OSD_METADATA
    )

    # The reason for having this separate to OSD_METADATA is
    # so that we can stably use the same tag names that
    # the Prometheus node_exporter does
    metrics['disk_occupation'] = Metric(
        'untyped',
        'disk_occupation',
        'Associate Ceph daemon with disk used',
        DISK_OCCUPATION
    )

    metrics['pool_metadata'] = Metric(
        'untyped',
        'pool_metadata',
        'POOL Metadata',
        POOL_METADATA
    )

    for state in OSD_STATUS:
        path = 'osd_{}'.format(state)
        self.log.debug("init: creating {}".format(path))
        metrics[path] = Metric(
            'untyped',
            path,
            'OSD status {}'.format(state),
            ('ceph_daemon',)
        )
    for stat in OSD_STATS:
        path = 'osd_{}'.format(stat)
        self.log.debug("init: creating {}".format(path))
        metrics[path] = Metric(
            'gauge',
            path,
            'OSD stat {}'.format(stat),
            ('ceph_daemon',)
        )
    for state in PG_STATES:
        path = 'pg_{}'.format(state)
        self.log.debug("init: creating {}".format(path))
        metrics[path] = Metric(
            'gauge',
            path,
            'PG {}'.format(state)
        )
    for state in DF_CLUSTER:
        path = 'cluster_{}'.format(state)
        self.log.debug("init: creating {}".format(path))
        metrics[path] = Metric(
            'gauge',
            path,
            'DF {}'.format(state)
        )
    for state in DF_POOL:
        path = 'pool_{}'.format(state)
        self.log.debug("init: creating {}".format(path))
        metrics[path] = Metric(
            'gauge',
            path,
            'DF pool {}'.format(state),
            ('pool_id',)
        )

    return metrics
def get_health(self):
    """Fetch the cluster health report and record its overall status
    as the health_status metric."""
    health = json.loads(self.get('health')['json'])
    self.metrics['health_status'].set(
        health_status_to_number(health['status'])
    )
def get_df(self):
    """Record cluster-wide and per-pool 'df' statistics.

    NOTE(review): the df fetch and the inner DF_POOL loop header were
    lost in the fragment; reconstructed — verify upstream.
    """
    # maybe get the to-be-exported metrics from a config?
    df = self.get('df')
    for stat in DF_CLUSTER:
        path = 'cluster_{}'.format(stat)
        self.metrics[path].set(df['stats'][stat])

    for pool in df['pools']:
        for stat in DF_POOL:
            path = 'pool_{}'.format(stat)
            self.metrics[path].set(pool['stats'][stat], (pool['id'],))
def get_quorum_status(self):
    """Record how many monitors are currently in quorum."""
    mon_status = json.loads(self.get('mon_status')['json'])
    self.metrics['mon_quorum_count'].set(len(mon_status['quorum']))
def get_pg_status(self):
    """Record per-state PG counts; states present in PG_STATES but not
    reported by pg_summary are zeroed.

    NOTE(review): the comprehension tail and the try: headers were lost
    in the fragment; reconstructed — verify upstream.
    """
    # TODO add per pool status?
    pg_s = self.get('pg_summary')['all']
    # pg_summary keys are '+'-joined state combos; split to single states
    reported_pg_s = [(s, v) for key, v in pg_s.items() for s in
                     key.split('+')]
    for state, value in reported_pg_s:
        path = 'pg_{}'.format(state)
        try:
            self.metrics[path].set(value)
        except KeyError:
            self.log.warn("skipping pg in unknown state {}".format(state))
    reported_states = [s[0] for s in reported_pg_s]
    for state in PG_STATES:
        path = 'pg_{}'.format(state)
        if state not in reported_states:
            try:
                self.metrics[path].set(0)
            except KeyError:
                self.log.warn("skipping pg in unknown state {}".format(state))
def get_osd_stats(self):
    """Record per-OSD apply/commit latencies from osd_stats, labelled
    by ceph daemon name ('osd.N').

    NOTE(review): the id_ binding and the value argument were lost in
    the fragment; reconstructed — verify upstream.
    """
    osd_stats = self.get('osd_stats')
    for osd in osd_stats['osd_stats']:
        id_ = osd['osd']
        for stat in OSD_STATS:
            status = osd['perf_stat'][stat]
            self.metrics['osd_{}'.format(stat)].set(
                status,
                ('osd.{}'.format(id_),))
def get_metadata_and_osd_status(self):
    """Record OSD metadata, per-OSD up/in/weight status, disk
    occupation (hostname/device -> daemon) and per-pool metadata.

    NOTE(review): reconstructed from a heavily gapped fragment (id_
    bindings, metric label tuples and the dev-node break were lost) —
    verify against upstream.
    """
    osd_map = self.get('osd_map')
    osd_devices = self.get('osd_map_crush')['devices']
    for osd in osd_map['osds']:
        id_ = osd['osd']
        p_addr = osd['public_addr'].split(':')[0]
        c_addr = osd['cluster_addr'].split(':')[0]
        dev_class = next((osd for osd in osd_devices if osd['id'] == id_))
        self.metrics['osd_metadata'].set(0, (
            c_addr,
            dev_class['class'],
            id_,
            p_addr
        ))
        for state in OSD_STATUS:
            status = osd[state]
            self.metrics['osd_{}'.format(state)].set(
                status,
                ('osd.{}'.format(id_),))

        osd_metadata = self.get_metadata("osd", str(id_))
        dev_keys = ("backend_filestore_dev_node", "bluestore_bdev_dev_node")
        osd_dev_node = None
        for dev_key in dev_keys:
            val = osd_metadata.get(dev_key, None)
            if val and val != "unknown":
                osd_dev_node = val
                break
        osd_hostname = osd_metadata.get('hostname', None)
        if osd_dev_node and osd_hostname:
            self.log.debug("Got dev for osd {0}: {1}/{2}".format(
                id_, osd_hostname, osd_dev_node))
            self.metrics['disk_occupation'].set(0, (
                osd_hostname,
                osd_dev_node,
                "osd.{0}".format(id_)
            ))
        else:
            self.log.info("Missing dev node metadata for osd {0}, skipping "
                          "occupation record for this osd".format(id_))

    for pool in osd_map['pools']:
        id_ = pool['pool']
        name = pool['pool_name']
        self.metrics['pool_metadata'].set(0, (id_, name))
def collect(self):
    """Refresh every metric (static gauges plus dynamically discovered
    perf counters) and return the metrics dict.

    NOTE(review): the def line, the leading get_health/get_df/
    get_osd_stats calls and the Metric(...) constructor arguments were
    lost in the fragment; reconstructed — verify upstream.
    """
    self.get_health()
    self.get_df()
    self.get_osd_stats()
    self.get_quorum_status()
    self.get_metadata_and_osd_status()
    self.get_pg_status()

    for daemon, counters in self.get_all_perf_counters().iteritems():
        for path, counter_info in counters.items():
            stattype = self._stattype_to_str(counter_info['type'])
            # XXX simplify first effort: no histograms
            # averages are already collapsed to one value for us
            if not stattype or stattype == 'histogram':
                self.log.debug('ignoring %s, type %s' % (path, stattype))
                continue

            if path not in self.metrics:
                self.metrics[path] = Metric(
                    stattype,
                    path,
                    counter_info['description'],
                    ("ceph_daemon",),
                )

            self.metrics[path].set(
                counter_info['value'],
                (daemon,)
            )

    return self.metrics
def handle_command(self, cmd):
    """Handle the 'prometheus self-test' admin command.

    Returns an (rc, out, err) tuple as MgrModule expects; unknown
    prefixes yield -errno.EINVAL.
    """
    if cmd['prefix'] == 'prometheus self-test':
        # NOTE(review): a line between the check and the return was lost
        # in the fragment; upstream runs a collection here — verify.
        self.collect()
        return 0, '', 'Self-test OK'
    else:
        return (-errno.EINVAL, '',
                "Command not found '{0}'".format(cmd['prefix']))
def serve(self):
    """Start the CherryPy HTTP endpoint serving / and /metrics.

    Blocks until the engine is stopped.  NOTE(review): the def line and
    the Root class/def headers were lost in the fragment; reconstructed
    with Root nested in serve — verify against upstream.
    """

    class Root(object):
        # collapse everything to '/'
        def _cp_dispatch(self, vpath):
            cherrypy.request.path = ''
            return self

        def format_metrics(self, metrics):
            """Concatenate every metric's exposition-format rendering."""
            formatted = ''
            for m in metrics.values():
                formatted += m.str_expfmt()
            return formatted + '\n'

        @cherrypy.expose
        def index(self):
            return '''<!DOCTYPE html>
<html>
    <head><title>Ceph Exporter</title></head>
    <body>
        <h1>Ceph Exporter</h1>
        <p><a href='/metrics'>Metrics</a></p>
    </body>
</html>'''

        @cherrypy.expose
        def metrics(self):
            metrics = global_instance().collect()
            cherrypy.response.headers['Content-Type'] = 'text/plain'
            if metrics:
                return self.format_metrics(metrics)

    server_addr = self.get_localized_config('server_addr', DEFAULT_ADDR)
    server_port = self.get_localized_config('server_port', DEFAULT_PORT)
    self.log.info(
        "server_addr: %s server_port: %s" %
        (server_addr, server_port)
    )

    cherrypy.config.update({
        'server.socket_host': server_addr,
        'server.socket_port': int(server_port),
        'engine.autoreload.on': False
    })
    cherrypy.tree.mount(Root(), "/")
    cherrypy.engine.start()
    cherrypy.engine.block()