# ceph/src/pybind/mgr/prometheus/module.py (ceph.git, sources for v12.2.3)
1 import cherrypy
2 import json
3 import errno
4 import math
5 import os
6 from collections import OrderedDict
7 from mgr_module import MgrModule
8
# Defaults for the Prometheus HTTP server. Can also set in config-key
# see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
# for Prometheus exporter port registry

DEFAULT_ADDR = '::'    # bind to all interfaces (IPv4 and IPv6)
DEFAULT_PORT = 9283    # port registered for the Ceph exporter
15
16
# cherrypy likes to sys.exit on error. don't let it take us down too!
def os_exit_noop(*args, **kwargs):
    """No-op stand-in for os._exit so cherrypy cannot kill the mgr."""
    return None


os._exit = os_exit_noop
23
24
25 # to access things in class Module from subclass Root. Because
26 # it's a dict, the writer doesn't need to declare 'global' for access
27
28 _global_instance = {'plugin': None}
29
30
31 def global_instance():
32 assert _global_instance['plugin'] is not None
33 return _global_instance['plugin']
34
35
def health_status_to_number(status):
    """Map a Ceph health status string to its numeric metric value.

    Unknown statuses map to None, matching the original fall-through.
    """
    return {
        'HEALTH_OK': 0,
        'HEALTH_WARN': 1,
        'HEALTH_ERR': 2,
    }.get(status)
44
# PG states we export; compound pg_summary keys (e.g. 'active+clean') are
# split on '+' and each component is matched against this list.
PG_STATES = ['creating', 'active', 'clean', 'down', 'scrubbing', 'deep', 'degraded',
             'inconsistent', 'peering', 'repair', 'recovering', 'forced-recovery',
             'backfill', 'forced-backfill', 'wait-backfill', 'backfill-toofull',
             'incomplete', 'stale', 'remapped', 'undersized', 'peered']

# Cluster-wide fields exported from the 'df' report's 'stats' section
DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_objects']

# Per-pool fields exported from the 'df' report's per-pool 'stats'
DF_POOL = ['max_avail', 'bytes_used', 'raw_bytes_used', 'objects', 'dirty',
           'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes']

# Label names for the osd_metadata metric
OSD_METADATA = ('cluster_addr', 'device_class', 'id', 'public_addr')

# Per-OSD status fields read from the OSD map entries
OSD_STATUS = ['weight', 'up', 'in']

# Per-OSD perf_stat fields exported from osd_stats
OSD_STATS = ['apply_latency_ms', 'commit_latency_ms']

# Label names for the pool_metadata metric
POOL_METADATA = ('pool_id', 'name')

# Label names for disk_occupation; kept separate from OSD_METADATA so we
# can use the same tag names the Prometheus node_exporter does
DISK_OCCUPATION = ('instance', 'device', 'ceph_daemon')
64
65
class Metric(object):
    """One Prometheus metric family and its labelled sample values."""

    def __init__(self, mtype, name, desc, labels=None):
        self.mtype = mtype
        self.name = name
        self.desc = desc
        self.labelnames = labels  # tuple of label names, or None
        self.value = {}           # maps label-value tuples -> sample value

    def set(self, value, labelvalues=None):
        """Record a sample. labelvalues must be a tuple when supplied."""
        key = labelvalues if labelvalues else ('',)
        self.value[key] = value

    def str_expfmt(self):
        """Render this metric in the Prometheus text exposition format."""

        def promethize(path):
            """Replace characters that are illegal in metric names."""
            sanitized = path.replace('.', '_').replace('+', '_plus').replace('::', '_')

            # Hyphens usually turn into underscores, unless they are
            # trailing
            if sanitized.endswith("-"):
                sanitized = sanitized[:-1] + "_minus"
            else:
                sanitized = sanitized.replace("-", "_")

            return "ceph_{0}".format(sanitized)

        def floatstr(value):
            """Represent a value as a Go-compatible float string."""
            as_float = float(value)
            if as_float == float('inf'):
                return '+Inf'
            if as_float == float('-inf'):
                return '-Inf'
            if math.isnan(as_float):
                return 'NaN'
            return repr(as_float)

        name = promethize(self.name)
        out = '''
# HELP {name} {desc}
# TYPE {name} {mtype}'''.format(name=name, desc=self.desc, mtype=self.mtype)

        for labelvalues, value in self.value.items():
            if self.labelnames:
                pairs = ('%s="%s"' % kv for kv in zip(self.labelnames, labelvalues))
                labels = ','.join(pairs)
            else:
                labels = ''
            if labels:
                out += '\n{name}{{{labels}}} {value}'.format(
                    name=name, labels=labels, value=floatstr(value))
            else:
                out += '\n{name} {value}'.format(
                    name=name, value=floatstr(value))
        return out
129
130
class Module(MgrModule):
    """ceph-mgr module exporting cluster statistics to Prometheus.

    Serves the Prometheus text exposition format over an embedded cherrypy
    HTTP server (default [::]:9283). Metrics are re-collected from the mgr
    state on every scrape of /metrics.
    """

    COMMANDS = [
        {
            "cmd": "prometheus self-test",
            "desc": "Run a self test on the prometheus module",
            "perm": "rw"
        },
    ]

    def __init__(self, *args, **kwargs):
        super(Module, self).__init__(*args, **kwargs)
        self.notified = False
        self.serving = False
        # Static metric families are created up front; per-daemon perf
        # counter metrics are added lazily in collect().
        self.metrics = self._setup_static_metrics()
        self.schema = OrderedDict()
        # Register ourselves so the cherrypy Root handler can reach us.
        _global_instance['plugin'] = self

    def _stattype_to_str(self, stattype):
        """Map a perf counter type bitmask to a Prometheus metric type.

        Returns '' for counter types that are not exported; callers skip
        those counters.
        """
        typeonly = stattype & self.PERFCOUNTER_TYPE_MASK
        if typeonly == 0:
            return 'gauge'
        if typeonly == self.PERFCOUNTER_LONGRUNAVG:
            # this lie matches the DaemonState decoding: only val, no counts
            return 'counter'
        if typeonly == self.PERFCOUNTER_COUNTER:
            return 'counter'
        if typeonly == self.PERFCOUNTER_HISTOGRAM:
            return 'histogram'

        return ''

    def _setup_static_metrics(self):
        """Create the fixed set of cluster-level metric families."""
        metrics = {}
        metrics['health_status'] = Metric(
            'untyped',
            'health_status',
            'Cluster health status'
        )
        metrics['mon_quorum_count'] = Metric(
            'gauge',
            'mon_quorum_count',
            'Monitors in quorum'
        )
        metrics['osd_metadata'] = Metric(
            'untyped',
            'osd_metadata',
            'OSD Metadata',
            OSD_METADATA
        )

        # The reason for having this separate to OSD_METADATA is
        # so that we can stably use the same tag names that
        # the Prometheus node_exporter does
        metrics['disk_occupation'] = Metric(
            'untyped',
            'disk_occupation',
            'Associate Ceph daemon with disk used',
            DISK_OCCUPATION
        )

        metrics['pool_metadata'] = Metric(
            'untyped',
            'pool_metadata',
            'POOL Metadata',
            POOL_METADATA
        )
        for state in OSD_STATUS:
            path = 'osd_{}'.format(state)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD status {}'.format(state),
                ('ceph_daemon',)
            )
        for stat in OSD_STATS:
            path = 'osd_{}'.format(stat)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'gauge',
                path,
                'OSD stat {}'.format(stat),
                ('ceph_daemon',)
            )
        for state in PG_STATES:
            path = 'pg_{}'.format(state)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'gauge',
                path,
                'PG {}'.format(state),
            )
        for state in DF_CLUSTER:
            path = 'cluster_{}'.format(state)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'gauge',
                path,
                'DF {}'.format(state),
            )
        for state in DF_POOL:
            path = 'pool_{}'.format(state)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'gauge',
                path,
                'DF pool {}'.format(state),
                ('pool_id',)
            )

        return metrics

    def shutdown(self):
        """Signal the serving loop that the module is stopping."""
        self.serving = False

    def get_health(self):
        """Export overall cluster health as a numeric status metric."""
        health = json.loads(self.get('health')['json'])
        self.metrics['health_status'].set(
            health_status_to_number(health['status'])
        )

    def get_df(self):
        """Export cluster-wide and per-pool 'df' statistics."""
        # maybe get the to-be-exported metrics from a config?
        df = self.get('df')
        for stat in DF_CLUSTER:
            path = 'cluster_{}'.format(stat)
            self.metrics[path].set(df['stats'][stat])

        for pool in df['pools']:
            for stat in DF_POOL:
                path = 'pool_{}'.format(stat)
                self.metrics[path].set(pool['stats'][stat], (pool['id'],))

    def get_quorum_status(self):
        """Export the number of monitors currently in quorum."""
        mon_status = json.loads(self.get('mon_status')['json'])
        self.metrics['mon_quorum_count'].set(len(mon_status['quorum']))

    def get_pg_status(self):
        """Export PG state counts; known states not reported are zeroed."""
        # TODO add per pool status?
        pg_s = self.get('pg_summary')['all']
        # Summary keys are compound states like 'active+clean'; attribute
        # the count to each component state.
        reported_pg_s = [(s, v) for key, v in pg_s.items()
                         for s in key.split('+')]
        for state, value in reported_pg_s:
            path = 'pg_{}'.format(state)
            try:
                self.metrics[path].set(value)
            except KeyError:
                self.log.warn("skipping pg in unknown state {}".format(state))
        # Zero out known states absent from this report so stale values do
        # not linger between scrapes.
        reported_states = [s[0] for s in reported_pg_s]
        for state in PG_STATES:
            path = 'pg_{}'.format(state)
            if state not in reported_states:
                try:
                    self.metrics[path].set(0)
                except KeyError:
                    self.log.warn("skipping pg in unknown state {}".format(state))

    def get_osd_stats(self):
        """Export per-OSD apply/commit latency statistics."""
        osd_stats = self.get('osd_stats')
        for osd in osd_stats['osd_stats']:
            id_ = osd['osd']
            for stat in OSD_STATS:
                status = osd['perf_stat'][stat]
                self.metrics['osd_{}'.format(stat)].set(
                    status,
                    ('osd.{}'.format(id_),))

    def get_metadata_and_osd_status(self):
        """Export OSD metadata/status, disk occupation and pool metadata."""
        osd_map = self.get('osd_map')
        osd_devices = self.get('osd_map_crush')['devices']
        for osd in osd_map['osds']:
            id_ = osd['osd']
            p_addr = osd['public_addr'].split(':')[0]
            c_addr = osd['cluster_addr'].split(':')[0]
            # Guard the CRUSH device lookup: an OSD without a device entry
            # previously raised StopIteration and aborted the whole scrape.
            device = next((d for d in osd_devices if d['id'] == id_), None)
            dev_class = device['class'] if device is not None else ''
            self.metrics['osd_metadata'].set(0, (
                c_addr,
                dev_class,
                id_,
                p_addr
            ))
            for state in OSD_STATUS:
                status = osd[state]
                self.metrics['osd_{}'.format(state)].set(
                    status,
                    ('osd.{}'.format(id_),))

            osd_metadata = self.get_metadata("osd", str(id_))
            dev_keys = ("backend_filestore_dev_node", "bluestore_bdev_dev_node")
            osd_dev_node = None
            for dev_key in dev_keys:
                val = osd_metadata.get(dev_key, None)
                if val and val != "unknown":
                    osd_dev_node = val
                    break
            osd_hostname = osd_metadata.get('hostname', None)
            if osd_dev_node and osd_hostname:
                self.log.debug("Got dev for osd {0}: {1}/{2}".format(
                    id_, osd_hostname, osd_dev_node))
                self.metrics['disk_occupation'].set(0, (
                    osd_hostname,
                    osd_dev_node,
                    "osd.{0}".format(id_)
                ))
            else:
                self.log.info("Missing dev node metadata for osd {0}, skipping "
                              "occupation record for this osd".format(id_))

        for pool in osd_map['pools']:
            id_ = pool['pool']
            name = pool['pool_name']
            self.metrics['pool_metadata'].set(0, (id_, name))

    def collect(self):
        """Refresh all metrics from mgr state and return the metrics dict."""
        self.get_health()
        self.get_df()
        self.get_osd_stats()
        self.get_quorum_status()
        self.get_metadata_and_osd_status()
        self.get_pg_status()

        # items() instead of the Python-2-only iteritems() keeps this
        # working on both Python 2 and Python 3.
        for daemon, counters in self.get_all_perf_counters().items():
            for path, counter_info in counters.items():
                stattype = self._stattype_to_str(counter_info['type'])
                # XXX simplify first effort: no histograms
                # averages are already collapsed to one value for us
                if not stattype or stattype == 'histogram':
                    self.log.debug('ignoring %s, type %s' % (path, stattype))
                    continue

                if path not in self.metrics:
                    self.metrics[path] = Metric(
                        stattype,
                        path,
                        counter_info['description'],
                        ("ceph_daemon",),
                    )

                self.metrics[path].set(
                    counter_info['value'],
                    (daemon,)
                )

        return self.metrics

    def handle_command(self, cmd):
        """Handle 'prometheus self-test'; returns (retval, stdout, stderr)."""
        if cmd['prefix'] == 'prometheus self-test':
            self.collect()
            return 0, '', 'Self-test OK'
        else:
            return (-errno.EINVAL, '',
                    "Command not found '{0}'".format(cmd['prefix']))

    def serve(self):
        """Run the cherrypy HTTP server; blocks until engine shutdown."""

        class Root(object):

            # collapse everything to '/'
            def _cp_dispatch(self, vpath):
                cherrypy.request.path = ''
                return self

            def format_metrics(self, metrics):
                # Concatenate every metric family's exposition text.
                formatted = ''
                for m in metrics.values():
                    formatted += m.str_expfmt()
                return formatted + '\n'

            @cherrypy.expose
            def index(self):
                return '''<!DOCTYPE html>
<html>
	<head><title>Ceph Exporter</title></head>
	<body>
		<h1>Ceph Exporter</h1>
		<p><a href='/metrics'>Metrics</a></p>
	</body>
</html>'''

            @cherrypy.expose
            def metrics(self):
                metrics = global_instance().collect()
                cherrypy.response.headers['Content-Type'] = 'text/plain'
                if metrics:
                    return self.format_metrics(metrics)

        server_addr = self.get_localized_config('server_addr', DEFAULT_ADDR)
        server_port = self.get_localized_config('server_port', DEFAULT_PORT)
        self.log.info(
            "server_addr: %s server_port: %s" %
            (server_addr, server_port)
        )

        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })
        cherrypy.tree.mount(Root(), "/")
        cherrypy.engine.start()
        cherrypy.engine.block()