]>
git.proxmox.com Git - ceph.git/blob - ceph/src/pybind/mgr/prometheus/module.py
842517f208f37e7a101864d875b295263caec083
6 from collections
import OrderedDict
7 from mgr_module
import MgrModule
9 # Defaults for the Prometheus HTTP server. Can also be set via config-key
10 # see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
11 # for Prometheus exporter port registry
17 # cherrypy likes to sys.exit on error. don't let it take us down too!
18 def os_exit_noop(*args
, **kwargs
):
22 os
._exit
= os_exit_noop
25 # to access things in class Module from subclass Root. Because
26 # it's a dict, the writer doesn't need to declare 'global' for access
# Holder for the active plugin object. It is a dict so that code assigning
# the 'plugin' slot never needs a 'global' declaration.
_global_instance = {'plugin': None}


def global_instance():
    """Return the object stored under ``_global_instance['plugin']``.

    Raises AssertionError if nothing has been stored there yet (the slot
    is still None).
    """
    plugin = _global_instance['plugin']
    assert plugin is not None
    return plugin
36 def health_status_to_number(status
):
38 if status
== 'HEALTH_OK':
40 elif status
== 'HEALTH_WARN':
42 elif status
== 'HEALTH_ERR':
# Placement-group state names; a 'pg_<state>' metric is set up per entry.
PG_STATES = [
    'creating',
    'active',
    'clean',
    'down',
    'scrubbing',
    'degraded',
    'inconsistent',
    'peering',
    'repair',
    'recovering',
    'forced-recovery',
    'backfill',
    'forced-backfill',
    'wait-backfill',
    'backfill-toofull',
    'incomplete',
    'stale',
    'remapped',
    'undersized',
    'peered',
]

# Cluster-wide df statistics; exported as 'cluster_<name>'.
DF_CLUSTER = [
    'total_bytes',
    'total_used_bytes',
    'total_objects',
]

# Per-pool df statistics; exported as 'pool_<name>'.
DF_POOL = [
    'max_avail',
    'bytes_used',
    'raw_bytes_used',
    'objects',
    'dirty',
    'quota_bytes',
    'quota_objects',
    'rd',
    'rd_bytes',
    'wr',
    'wr_bytes',
]

# NOTE(review): presumably the label names of the 'osd_metadata' metric --
# confirm against the Metric(...) construction.
OSD_METADATA = (
    'cluster_addr',
    'device_class',
    'id',
    'public_addr',
)

# Per-OSD status fields; exported as 'osd_<name>'.
OSD_STATUS = [
    'weight',
    'up',
    'in',
]

# NOTE(review): presumably the label names of the 'pool_metadata' metric.
POOL_METADATA = (
    'pool_id',
    'name',
)

# NOTE(review): presumably the label names of the 'disk_occupation' metric.
DISK_OCCUPATION = (
    'instance',
    'device',
    'ceph_daemon',
)
65 def __init__(self
, mtype
, name
, desc
, labels
=None):
69 self
.labelnames
= labels
# tuple if present
70 self
.value
= dict() # indexed by label values
def set(self, value, labelvalues=None):
    """Record ``value`` under the given label values.

    ``labelvalues`` must be a tuple. When it is omitted or falsy
    (None, empty tuple), the sample is stored under the key ``('',)``.
    """
    key = labelvalues if labelvalues else ('',)
    self.value[key] = value
80 ''' replace illegal metric name characters '''
81 result
= path
.replace('.', '_').replace('+', '_plus').replace('::', '_')
83 # Hyphens usually turn into underscores, unless they are
85 if result
.endswith("-"):
86 result
= result
[0:-1] + "_minus"
88 result
= result
.replace("-", "_")
90 return "ceph_{0}".format(result
)
93 ''' represent as Go-compatible float '''
94 if value
== float('inf'):
96 if value
== float('-inf'):
100 return repr(float(value
))
102 name
= promethize(self
.name
)
105 # TYPE {name} {mtype}'''.format(
111 for labelvalues
, value
in self
.value
.items():
113 labels
= zip(self
.labelnames
, labelvalues
)
114 labels
= ','.join('%s="%s"' % (k
, v
) for k
, v
in labels
)
118 fmtstr
= '\n{name}{{{labels}}} {value}'
120 fmtstr
= '\n{name} {value}'
121 expfmt
+= fmtstr
.format(
124 value
=floatstr(value
),
129 class Module(MgrModule
):
132 "cmd": "prometheus self-test",
133 "desc": "Run a self test on the prometheus module",
138 def __init__(self
, *args
, **kwargs
):
139 super(Module
, self
).__init
__(*args
, **kwargs
)
140 self
.notified
= False
142 self
.metrics
= self
._setup
_static
_metrics
()
143 self
.schema
= OrderedDict()
144 _global_instance
['plugin'] = self
146 def _stattype_to_str(self
, stattype
):
148 typeonly
= stattype
& self
.PERFCOUNTER_TYPE_MASK
151 if typeonly
== self
.PERFCOUNTER_LONGRUNAVG
:
152 # this lie matches the DaemonState decoding: only val, no counts
154 if typeonly
== self
.PERFCOUNTER_COUNTER
:
156 if typeonly
== self
.PERFCOUNTER_HISTOGRAM
:
161 def _setup_static_metrics(self
):
163 metrics
['health_status'] = Metric(
166 'Cluster health status'
168 metrics
['mon_quorum_count'] = Metric(
173 metrics
['osd_metadata'] = Metric(
180 # The reason for having this separate to OSD_METADATA is
181 # so that we can stably use the same tag names that
182 # the Prometheus node_exporter does
183 metrics
['disk_occupation'] = Metric(
186 'Associate Ceph daemon with disk used',
190 metrics
['pool_metadata'] = Metric(
196 for state
in OSD_STATUS
:
197 path
= 'osd_{}'.format(state
)
198 self
.log
.debug("init: creating {}".format(path
))
199 metrics
[path
] = Metric(
202 'OSD status {}'.format(state
),
205 for state
in PG_STATES
:
206 path
= 'pg_{}'.format(state
)
207 self
.log
.debug("init: creating {}".format(path
))
208 metrics
[path
] = Metric(
211 'PG {}'.format(state
),
213 for state
in DF_CLUSTER
:
214 path
= 'cluster_{}'.format(state
)
215 self
.log
.debug("init: creating {}".format(path
))
216 metrics
[path
] = Metric(
219 'DF {}'.format(state
),
221 for state
in DF_POOL
:
222 path
= 'pool_{}'.format(state
)
223 self
.log
.debug("init: creating {}".format(path
))
224 metrics
[path
] = Metric(
227 'DF pool {}'.format(state
),
237 def get_health(self
):
238 health
= json
.loads(self
.get('health')['json'])
239 self
.metrics
['health_status'].set(
240 health_status_to_number(health
['status'])
244 # maybe get the to-be-exported metrics from a config?
246 for stat
in DF_CLUSTER
:
247 path
= 'cluster_{}'.format(stat
)
248 self
.metrics
[path
].set(df
['stats'][stat
])
250 for pool
in df
['pools']:
252 path
= 'pool_{}'.format(stat
)
253 self
.metrics
[path
].set(pool
['stats'][stat
], (pool
['id'],))
def get_quorum_status(self):
    """Set 'mon_quorum_count' to the number of entries in the mon quorum.

    Reads the JSON document under the 'json' key of self.get('mon_status')
    and counts its 'quorum' list.
    """
    raw_status = self.get('mon_status')['json']
    mon_status = json.loads(raw_status)
    quorum_size = len(mon_status['quorum'])
    self.metrics['mon_quorum_count'].set(quorum_size)
259 def get_pg_status(self
):
260 # TODO add per pool status?
261 pg_s
= self
.get('pg_summary')['all']
262 reported_pg_s
= [(s
,v
) for key
, v
in pg_s
.items() for s
in
264 for state
, value
in reported_pg_s
:
265 path
= 'pg_{}'.format(state
)
266 self
.metrics
[path
].set(value
)
267 reported_states
= [s
[0] for s
in reported_pg_s
]
268 for state
in PG_STATES
:
269 path
= 'pg_{}'.format(state
)
270 if state
not in reported_states
:
271 self
.metrics
[path
].set(0)
273 def get_metadata_and_osd_status(self
):
274 osd_map
= self
.get('osd_map')
275 osd_devices
= self
.get('osd_map_crush')['devices']
276 for osd
in osd_map
['osds']:
278 p_addr
= osd
['public_addr'].split(':')[0]
279 c_addr
= osd
['cluster_addr'].split(':')[0]
280 dev_class
= next((osd
for osd
in osd_devices
if osd
['id'] == id_
))
281 self
.metrics
['osd_metadata'].set(0, (
287 for state
in OSD_STATUS
:
289 self
.metrics
['osd_{}'.format(state
)].set(
291 ('osd.{}'.format(id_
),))
293 osd_metadata
= self
.get_metadata("osd", str(id_
))
294 dev_keys
= ("backend_filestore_dev_node", "bluestore_bdev_dev_node")
296 for dev_key
in dev_keys
:
297 val
= osd_metadata
.get(dev_key
, None)
298 if val
and val
!= "unknown":
301 osd_hostname
= osd_metadata
.get('hostname', None)
302 if osd_dev_node
and osd_hostname
:
303 self
.log
.debug("Got dev for osd {0}: {1}/{2}".format(
304 id_
, osd_hostname
, osd_dev_node
))
305 self
.metrics
['disk_occupation'].set(0, (
308 "osd.{0}".format(id_
)
311 self
.log
.info("Missing dev node metadata for osd {0}, skipping "
312 "occupation record for this osd".format(id_
))
314 for pool
in osd_map
['pools']:
316 name
= pool
['pool_name']
317 self
.metrics
['pool_metadata'].set(0, (id_
, name
))
322 self
.get_quorum_status()
323 self
.get_metadata_and_osd_status()
326 for daemon
, counters
in self
.get_all_perf_counters().iteritems():
327 for path
, counter_info
in counters
.items():
328 stattype
= self
._stattype
_to
_str
(counter_info
['type'])
329 # XXX simplify first effort: no histograms
330 # averages are already collapsed to one value for us
331 if not stattype
or stattype
== 'histogram':
332 self
.log
.debug('ignoring %s, type %s' % (path
, stattype
))
335 if path
not in self
.metrics
:
336 self
.metrics
[path
] = Metric(
339 counter_info
['description'],
343 self
.metrics
[path
].set(
344 counter_info
['value'],
350 def handle_command(self
, cmd
):
351 if cmd
['prefix'] == 'prometheus self-test':
353 return 0, '', 'Self-test OK'
355 return (-errno
.EINVAL
, '',
356 "Command not found '{0}'".format(cmd
['prefix']))
362 # collapse everything to '/'
363 def _cp_dispatch(self
, vpath
):
364 cherrypy
.request
.path
= ''
367 def format_metrics(self
, metrics
):
369 for m
in metrics
.values():
370 formatted
+= m
.str_expfmt()
371 return formatted
+ '\n'
375 return '''<!DOCTYPE html>
377 <head><title>Ceph Exporter</title></head>
379 <h1>Ceph Exporter</h1>
380 <p><a href='/metrics'>Metrics</a></p>
386 metrics
= global_instance().collect()
387 cherrypy
.response
.headers
['Content-Type'] = 'text/plain'
389 return self
.format_metrics(metrics
)
391 server_addr
= self
.get_localized_config('server_addr', DEFAULT_ADDR
)
392 server_port
= self
.get_localized_config('server_port', DEFAULT_PORT
)
394 "server_addr: %s server_port: %s" %
395 (server_addr
, server_port
)
398 cherrypy
.config
.update({
399 'server.socket_host': server_addr
,
400 'server.socket_port': int(server_port
),
401 'engine.autoreload.on': False
403 cherrypy
.tree
.mount(Root(), "/")
404 cherrypy
.engine
.start()
405 cherrypy
.engine
.block()