]> git.proxmox.com Git - ceph.git/blame - ceph/src/pybind/mgr/prometheus/module.py
update sources to 12.2.2
[ceph.git] / ceph / src / pybind / mgr / prometheus / module.py
CommitLineData
c07f9fc5 1import cherrypy
3efd9988
FG
2import json
3import errno
c07f9fc5
FG
4import math
5import os
c07f9fc5
FG
6from collections import OrderedDict
7from mgr_module import MgrModule
8
# Defaults for the Prometheus HTTP server. They can also be set in
# config-key; serve() looks up 'server_addr' and 'server_port' and falls
# back to these values.
# See https://github.com/prometheus/prometheus/wiki/Default-port-allocations
# for the Prometheus exporter port registry.

DEFAULT_ADDR = '::'
DEFAULT_PORT = 9283
15
16
# cherrypy likes to sys.exit on error. don't let it take us down too!
def os_exit_noop(*args, **kwargs):
    """Accept any arguments, do nothing, and return None.

    Intended as a harmless stand-in for a process-exit function.
    """
    return None
20
21
# Replace os._exit for the whole process with the no-op defined above, so
# a caller that tries to hard-exit through it does not kill this process.
os._exit = os_exit_noop
23
24
# Holder for the running Module instance, so that the nested Root class
# (the cherrypy request handler) can reach it. Because it is a dict, the
# writer doesn't need to declare 'global' to update it.

_global_instance = {'plugin': None}
29
30
def global_instance():
    """Return the registered plugin instance.

    Asserts that an instance has already been stored in
    ``_global_instance['plugin']``.
    """
    plugin = _global_instance['plugin']
    assert plugin is not None
    return plugin
34
35
def health_status_to_number(status):
    """Map a health status string to a number.

    'HEALTH_OK' -> 0, 'HEALTH_WARN' -> 1, 'HEALTH_ERR' -> 2.
    Any other value yields None.
    """
    known = ('HEALTH_OK', 'HEALTH_WARN', 'HEALTH_ERR')
    # The position in the tuple is the numeric code.
    for code, label in enumerate(known):
        if status == label:
            return code
    return None
c07f9fc5 44
3efd9988
FG
# Individual PG state names. One 'pg_<state>' gauge is created per entry,
# and get_pg_status() splits compound state strings on '+' into these.
PG_STATES = ['creating', 'active', 'clean', 'down', 'scrubbing', 'degraded',
             'inconsistent', 'peering', 'repair', 'recovering', 'forced-recovery',
             'backfill', 'forced-backfill', 'wait-backfill', 'backfill-toofull',
             'incomplete', 'stale', 'remapped', 'undersized', 'peered']

# Keys read from df['stats']; exported as 'cluster_<key>' gauges.
DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_objects']

# Keys read from each pool's 'stats' in df; exported as 'pool_<key>' gauges
# labelled with the pool id.
DF_POOL = ['max_avail', 'bytes_used', 'raw_bytes_used', 'objects', 'dirty',
           'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes']

# Label names of the 'osd_metadata' metric. The order must match the order
# of the label values passed in get_metadata_and_osd_status().
OSD_METADATA = ('cluster_addr', 'device_class', 'id', 'public_addr')

# Per-OSD keys read from each osd_map entry; exported as 'osd_<key>'.
OSD_STATUS = ['weight', 'up', 'in']

# Label names of the 'pool_metadata' metric.
POOL_METADATA = ('pool_id', 'name')

# Label names of the 'disk_occupation' metric.
DISK_OCCUPATION = ('instance', 'device', 'ceph_daemon')
c07f9fc5
FG
62
63
class Metric(object):
    """One metric family kept in memory and rendered as Prometheus text.

    A sample is stored per distinct tuple of label values; str_expfmt()
    renders the HELP/TYPE header followed by every stored sample.
    """

    def __init__(self, mtype, name, desc, labels=None):
        """
        :param mtype: metric type, written verbatim on the ``# TYPE`` line
        :param name: raw metric name; it is sanitised and prefixed with
                     ``ceph_`` when rendered
        :param desc: help text, written on the ``# HELP`` line
        :param labels: optional tuple of label names
        """
        self.mtype = mtype
        self.name = name
        self.desc = desc
        self.labelnames = labels    # tuple if present
        self.value = dict()         # indexed by label values

    def set(self, value, labelvalues=None):
        """Store ``value`` for the sample identified by ``labelvalues``.

        :param value: numeric sample value
        :param labelvalues: tuple of label values, in the same order as the
                            label names; defaults to ``('',)``
        """
        # labelvalues must be a tuple
        labelvalues = labelvalues or ('',)
        self.value[labelvalues] = value

    def str_expfmt(self):
        """Return this metric in the Prometheus text exposition format.

        The result starts with a newline and has no trailing newline, so
        several metrics can be concatenated directly.
        """

        def promethize(path):
            ''' replace illegal metric name characters '''
            result = path.replace('.', '_').replace('+', '_plus')
            result = result.replace('::', '_')

            # Hyphens usually turn into underscores, unless they are
            # trailing
            if result.endswith("-"):
                result = result[0:-1] + "_minus"
            else:
                result = result.replace("-", "_")

            return "ceph_{0}".format(result)

        def floatstr(value):
            ''' represent as Go-compatible float '''
            if value == float('inf'):
                return '+Inf'
            if value == float('-inf'):
                return '-Inf'
            if math.isnan(value):
                return 'NaN'
            return repr(float(value))

        def escape_help(text):
            ''' escape backslash and newline, as required in HELP text '''
            text = '%s' % (text,)
            return text.replace('\\', '\\\\').replace('\n', '\\n')

        def escape_label(value):
            ''' escape backslash, double quote and newline in label values '''
            # The backslash must be handled first so the escapes added for
            # the quote and the newline are not escaped a second time.
            return escape_help(value).replace('"', '\\"')

        name = promethize(self.name)
        chunks = ['''
# HELP {name} {desc}
# TYPE {name} {mtype}'''.format(
            name=name,
            desc=escape_help(self.desc),
            mtype=self.mtype,
        )]

        for labelvalues, value in self.value.items():
            if self.labelnames:
                labels = ','.join(
                    '%s="%s"' % (k, escape_label(v))
                    for k, v in zip(self.labelnames, labelvalues)
                )
            else:
                labels = ''
            if labels:
                fmtstr = '\n{name}{{{labels}}} {value}'
            else:
                fmtstr = '\n{name} {value}'
            chunks.append(fmtstr.format(
                name=name,
                labels=labels,
                value=floatstr(value),
            ))
        return ''.join(chunks)
127
128
class Module(MgrModule):
    """Manager module that serves cluster state as Prometheus metrics.

    collect() refreshes the ``self.metrics`` dict of Metric objects from the
    data returned by ``self.get(...)`` and ``get_all_perf_counters()``;
    serve() starts a cherrypy server whose ``/metrics`` handler renders them.
    """

    COMMANDS = [
        {
            "cmd": "prometheus self-test",
            "desc": "Run a self test on the prometheus module",
            "perm": "rw"
        },
    ]

    def __init__(self, *args, **kwargs):
        super(Module, self).__init__(*args, **kwargs)
        self.notified = False
        self.serving = False
        self.metrics = self._setup_static_metrics()
        self.schema = OrderedDict()
        # Publish this instance so the HTTP handler can find it.
        _global_instance['plugin'] = self

    def _stattype_to_str(self, stattype):
        """Translate a perf counter type bitfield into a Prometheus type.

        :param stattype: integer type field of a perf counter
        :returns: 'gauge', 'counter', 'histogram', or '' if unrecognised
        """
        # PERFCOUNTER_* are expected to come from MgrModule.
        typeonly = stattype & self.PERFCOUNTER_TYPE_MASK
        if typeonly == 0:
            return 'gauge'
        if typeonly == self.PERFCOUNTER_LONGRUNAVG:
            # this lie matches the DaemonState decoding: only val, no counts
            return 'counter'
        if typeonly == self.PERFCOUNTER_COUNTER:
            return 'counter'
        if typeonly == self.PERFCOUNTER_HISTOGRAM:
            return 'histogram'

        return ''

    def _setup_static_metrics(self):
        """Create the fixed set of metrics that do not come from perf counters.

        :returns: dict mapping metric path to Metric
        """
        metrics = {}
        metrics['health_status'] = Metric(
            'untyped',
            'health_status',
            'Cluster health status'
        )
        metrics['mon_quorum_count'] = Metric(
            'gauge',
            'mon_quorum_count',
            'Monitors in quorum'
        )
        metrics['osd_metadata'] = Metric(
            'untyped',
            'osd_metadata',
            'OSD Metadata',
            OSD_METADATA
        )

        # The reason for having this separate to OSD_METADATA is
        # so that we can stably use the same tag names that
        # the Prometheus node_exporter does
        metrics['disk_occupation'] = Metric(
            # 'undef' is not a valid type in the exposition format;
            # 'untyped' is the valid spelling used by the other metrics here.
            'untyped',
            'disk_occupation',
            'Associate Ceph daemon with disk used',
            DISK_OCCUPATION
        )

        metrics['pool_metadata'] = Metric(
            'untyped',
            'pool_metadata',
            'POOL Metadata',
            POOL_METADATA
        )
        for state in OSD_STATUS:
            path = 'osd_{}'.format(state)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD status {}'.format(state),
                ('ceph_daemon',)
            )
        for state in PG_STATES:
            path = 'pg_{}'.format(state)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'gauge',
                path,
                'PG {}'.format(state),
            )
        for state in DF_CLUSTER:
            path = 'cluster_{}'.format(state)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'gauge',
                path,
                'DF {}'.format(state),
            )
        for state in DF_POOL:
            path = 'pool_{}'.format(state)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'gauge',
                path,
                'DF pool {}'.format(state),
                ('pool_id',)
            )

        return metrics

    def shutdown(self):
        """Mark the module as no longer serving."""
        self.serving = False

    def get_health(self):
        """Update 'health_status' from the JSON in self.get('health')."""
        health = json.loads(self.get('health')['json'])
        self.metrics['health_status'].set(
            health_status_to_number(health['status'])
        )

    def get_df(self):
        """Update the cluster_* and pool_* metrics from self.get('df')."""
        # maybe get the to-be-exported metrics from a config?
        df = self.get('df')
        for stat in DF_CLUSTER:
            path = 'cluster_{}'.format(stat)
            self.metrics[path].set(df['stats'][stat])

        for pool in df['pools']:
            for stat in DF_POOL:
                path = 'pool_{}'.format(stat)
                self.metrics[path].set(pool['stats'][stat], (pool['id'],))

    def get_quorum_status(self):
        """Update 'mon_quorum_count' with the length of the quorum list."""
        mon_status = json.loads(self.get('mon_status')['json'])
        self.metrics['mon_quorum_count'].set(len(mon_status['quorum']))

    def get_pg_status(self):
        """Update every pg_<state> gauge from the PG summary.

        Keys of the summary are split on '+' into individual states. A
        state that occurs in several keys gets the sum of their values;
        states in PG_STATES that do not occur at all are set to 0.
        """
        # TODO add per pool status?
        pg_s = self.get('pg_summary')['all']
        totals = dict.fromkeys(PG_STATES, 0)
        for key, count in pg_s.items():
            for state in key.split('+'):
                if state not in totals:
                    # No metric was created for this state; skip it
                    # instead of failing the whole collection.
                    self.log.info(
                        "skipping pg in unknown state {}".format(state))
                    continue
                totals[state] += count
        for state in PG_STATES:
            self.metrics['pg_{}'.format(state)].set(totals[state])

    def get_metadata_and_osd_status(self):
        """Update OSD metadata/status, disk occupation and pool metadata."""
        osd_map = self.get('osd_map')
        osd_devices = self.get('osd_map_crush')['devices']

        # Index the device classes once instead of scanning per OSD.
        # setdefault keeps the first entry for an id, like a linear search.
        class_by_id = {}
        for dev in osd_devices:
            class_by_id.setdefault(dev['id'], dev.get('class', ''))

        for osd in osd_map['osds']:
            id_ = osd['osd']
            # Strip only the last ':'-separated field, so an address that
            # itself contains colons is not cut short.
            # NOTE(review): assumes the form '<host>:<port>...' - confirm.
            p_addr = osd['public_addr'].rsplit(':', 1)[0]
            c_addr = osd['cluster_addr'].rsplit(':', 1)[0]
            if id_ not in class_by_id:
                self.log.info("No device entry for osd {0}, exporting an "
                              "empty device class".format(id_))
            dev_class = class_by_id.get(id_, '')
            self.metrics['osd_metadata'].set(0, (
                c_addr,
                dev_class,
                id_,
                p_addr
            ))
            for state in OSD_STATUS:
                status = osd[state]
                self.metrics['osd_{}'.format(state)].set(
                    status,
                    ('osd.{}'.format(id_),))

            osd_metadata = self.get_metadata("osd", str(id_))
            dev_keys = ("backend_filestore_dev_node", "bluestore_bdev_dev_node")
            osd_dev_node = None
            # Use the first key that holds a usable device node.
            for dev_key in dev_keys:
                val = osd_metadata.get(dev_key, None)
                if val and val != "unknown":
                    osd_dev_node = val
                    break
            osd_hostname = osd_metadata.get('hostname', None)
            if osd_dev_node and osd_hostname:
                self.log.debug("Got dev for osd {0}: {1}/{2}".format(
                    id_, osd_hostname, osd_dev_node))
                self.metrics['disk_occupation'].set(0, (
                    osd_hostname,
                    osd_dev_node,
                    "osd.{0}".format(id_)
                ))
            else:
                self.log.info("Missing dev node metadata for osd {0}, skipping "
                              "occupation record for this osd".format(id_))

        for pool in osd_map['pools']:
            id_ = pool['pool']
            name = pool['pool_name']
            self.metrics['pool_metadata'].set(0, (id_, name))

    def collect(self):
        """Refresh all metrics and return them.

        :returns: the ``self.metrics`` dict (metric path -> Metric)
        """
        self.get_health()
        self.get_df()
        self.get_quorum_status()
        self.get_metadata_and_osd_status()
        self.get_pg_status()

        # items() instead of iteritems(), which exists only on Python 2.
        for daemon, counters in self.get_all_perf_counters().items():
            for path, counter_info in counters.items():
                stattype = self._stattype_to_str(counter_info['type'])
                # XXX simplify first effort: no histograms
                # averages are already collapsed to one value for us
                if not stattype or stattype == 'histogram':
                    self.log.debug('ignoring %s, type %s' % (path, stattype))
                    continue

                if path not in self.metrics:
                    self.metrics[path] = Metric(
                        stattype,
                        path,
                        counter_info['description'],
                        ("ceph_daemon",),
                    )

                self.metrics[path].set(
                    counter_info['value'],
                    (daemon,)
                )

        return self.metrics

    def handle_command(self, cmd):
        """Handle the commands declared in COMMANDS.

        :param cmd: dict with at least a 'prefix' key
        :returns: (return code, second result field, status message)
        """
        if cmd['prefix'] == 'prometheus self-test':
            self.collect()
            return 0, '', 'Self-test OK'
        else:
            return (-errno.EINVAL, '',
                    "Command not found '{0}'".format(cmd['prefix']))

    def serve(self):
        """Configure cherrypy, mount the handler and block in its engine."""

        class Root(object):

            # collapse everything to '/'
            def _cp_dispatch(self, vpath):
                cherrypy.request.path = ''
                return self

            def format_metrics(self, metrics):
                # Each str_expfmt() starts with its own newline; only the
                # final terminator has to be added here.
                formatted = ''.join(
                    m.str_expfmt() for m in metrics.values())
                return formatted + '\n'

            @cherrypy.expose
            def index(self):
                return '''<!DOCTYPE html>
<html>
 <head><title>Ceph Exporter</title></head>
 <body>
 <h1>Ceph Exporter</h1>
 <p><a href='/metrics'>Metrics</a></p>
 </body>
</html>'''

            @cherrypy.expose
            def metrics(self):
                metrics = global_instance().collect()
                cherrypy.response.headers['Content-Type'] = 'text/plain'
                if metrics:
                    return self.format_metrics(metrics)

        server_addr = self.get_localized_config('server_addr', DEFAULT_ADDR)
        server_port = self.get_localized_config('server_port', DEFAULT_PORT)
        self.log.info(
            "server_addr: %s server_port: %s" %
            (server_addr, server_port)
        )

        cherrypy.config.update({
            'server.socket_host': server_addr,
            # int() because the configured port may not be an integer.
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })
        cherrypy.tree.mount(Root(), "/")
        cherrypy.engine.start()
        cherrypy.engine.block()
350 def handle_command(self, cmd):
351 if cmd['prefix'] == 'prometheus self-test':
352 self.collect()
353 return 0, '', 'Self-test OK'
354 else:
355 return (-errno.EINVAL, '',
356 "Command not found '{0}'".format(cmd['prefix']))
c07f9fc5
FG
357
358 def serve(self):
359
360 class Root(object):
361
362 # collapse everything to '/'
363 def _cp_dispatch(self, vpath):
364 cherrypy.request.path = ''
365 return self
366
367 def format_metrics(self, metrics):
368 formatted = ''
369 for m in metrics.values():
370 formatted += m.str_expfmt()
371 return formatted + '\n'
372
373 @cherrypy.expose
374 def index(self):
3efd9988
FG
375 return '''<!DOCTYPE html>
376<html>
377 <head><title>Ceph Exporter</title></head>
378 <body>
379 <h1>Ceph Exporter</h1>
380 <p><a href='/metrics'>Metrics</a></p>
381 </body>
382</html>'''
383
384 @cherrypy.expose
385 def metrics(self):
c07f9fc5
FG
386 metrics = global_instance().collect()
387 cherrypy.response.headers['Content-Type'] = 'text/plain'
388 if metrics:
389 return self.format_metrics(metrics)
390
391 server_addr = self.get_localized_config('server_addr', DEFAULT_ADDR)
392 server_port = self.get_localized_config('server_port', DEFAULT_PORT)
393 self.log.info(
394 "server_addr: %s server_port: %s" %
395 (server_addr, server_port)
396 )
c07f9fc5
FG
397
398 cherrypy.config.update({
399 'server.socket_host': server_addr,
3efd9988 400 'server.socket_port': int(server_port),
c07f9fc5
FG
401 'engine.autoreload.on': False
402 })
403 cherrypy.tree.mount(Root(), "/")
404 cherrypy.engine.start()
405 cherrypy.engine.block()