]>
Commit | Line | Data |
---|---|---|
c07f9fc5 | 1 | import cherrypy |
3efd9988 FG |
2 | import json |
3 | import errno | |
c07f9fc5 FG |
4 | import math |
5 | import os | |
c07f9fc5 FG |
6 | from collections import OrderedDict |
7 | from mgr_module import MgrModule | |
8 | ||
9 | # Defaults for the Prometheus HTTP server. Can also be set in config-key | |
10 | # see https://github.com/prometheus/prometheus/wiki/Default-port-allocations | |
11 | # for Prometheus exporter port registry | |
12 | ||
13 | DEFAULT_ADDR = '::' | |
14 | DEFAULT_PORT = 9283 | |
15 | ||
16 | ||
17 | # cherrypy likes to sys.exit on error. don't let it take us down too! | |
def os_exit_noop(*args, **kwargs):
    """Accept any arguments, do nothing, and return None.

    Installed below in place of os._exit so that a call to os._exit made
    anywhere in this process becomes a no-op instead of terminating it.
    """
    return None


# Replace the process-wide hard-exit hook with the no-op defined above.
os._exit = os_exit_noop
23 | ||
24 | ||
25 | # to access things in class Module from subclass Root. Because | |
26 | # it's a dict, the writer doesn't need to declare 'global' for access | |
27 | ||
# Single-slot registry; the 'plugin' entry is filled in by Module.__init__.
_global_instance = {'plugin': None}


def global_instance():
    """Return the registered plugin object.

    Fails with AssertionError when nothing has been registered yet.
    """
    plugin = _global_instance['plugin']
    assert plugin is not None
    return plugin
34 | ||
35 | ||
def health_status_to_number(status):
    """Translate a health status string into its numeric value.

    'HEALTH_OK' maps to 0, 'HEALTH_WARN' to 1 and 'HEALTH_ERR' to 2.
    Any other status yields None.
    """
    known_statuses = (
        ('HEALTH_OK', 0),
        ('HEALTH_WARN', 1),
        ('HEALTH_ERR', 2),
    )
    for label, number in known_statuses:
        if status == label:
            return number
    return None
c07f9fc5 | 44 | |
# Placement-group states; one 'pg_<state>' gauge is created per entry.
PG_STATES = [
    'creating',
    'active',
    'clean',
    'down',
    'scrubbing',
    'degraded',
    'inconsistent',
    'peering',
    'repair',
    'recovering',
    'forced-recovery',
    'backfill',
    'forced-backfill',
    'wait-backfill',
    'backfill-toofull',
    'incomplete',
    'stale',
    'remapped',
    'undersized',
    'peered',
]

# Cluster-wide 'df' statistics, exported as 'cluster_<stat>'.
DF_CLUSTER = [
    'total_bytes',
    'total_used_bytes',
    'total_objects',
]

# Per-pool 'df' statistics, exported as 'pool_<stat>'.
DF_POOL = [
    'max_avail',
    'bytes_used',
    'raw_bytes_used',
    'objects',
    'dirty',
    'quota_bytes',
    'quota_objects',
    'rd',
    'rd_bytes',
    'wr',
    'wr_bytes',
]

# Label names of the 'osd_metadata' metric.
OSD_METADATA = (
    'cluster_addr',
    'device_class',
    'id',
    'public_addr',
)

# Per-OSD fields exported as 'osd_<field>'.
OSD_STATUS = [
    'weight',
    'up',
    'in',
]

# Label names of the 'pool_metadata' metric.
POOL_METADATA = (
    'pool_id',
    'name',
)

# Label names of the 'disk_occupation' metric.
DISK_OCCUPATION = (
    'instance',
    'device',
    'ceph_daemon',
)
c07f9fc5 FG |
62 | |
63 | ||
class Metric(object):
    """One metric family plus its current samples.

    Attributes:
        mtype: string written after the name on the ``# TYPE`` line.
        name: raw metric name; sanitised and prefixed with ``ceph_``
            when rendered.
        desc: help text written on the ``# HELP`` line.
        labelnames: tuple of label names, or None for an unlabelled metric.
        value: dict mapping a tuple of label values to the sample value.
    """

    def __init__(self, mtype, name, desc, labels=None):
        self.mtype = mtype
        self.name = name
        self.desc = desc
        self.labelnames = labels    # tuple if present
        self.value = dict()         # indexed by label values

    def set(self, value, labelvalues=None):
        """Record ``value`` for the sample identified by ``labelvalues``.

        ``labelvalues`` must be a tuple; when it is omitted (or empty) the
        sample is stored under the placeholder key ``('',)``.
        """
        labelvalues = labelvalues or ('',)
        self.value[labelvalues] = value

    def str_expfmt(self):
        """Render this metric in the Prometheus text exposition format.

        Returns a string that starts with a newline, followed by the
        ``# HELP`` and ``# TYPE`` lines and one line per stored sample.
        """

        def promethize(path):
            ''' replace illegal metric name characters '''
            result = path.replace('.', '_').replace('+', '_plus')
            result = result.replace('::', '_')

            # Hyphens usually turn into underscores, unless they are
            # trailing.  Handle the trailing one first, then convert any
            # hyphens that remain so none can leak into the metric name.
            if result.endswith("-"):
                result = result[0:-1] + "_minus"
            result = result.replace("-", "_")

            return "ceph_{0}".format(result)

        def floatstr(value):
            ''' represent as Go-compatible float '''
            if value == float('inf'):
                return '+Inf'
            if value == float('-inf'):
                return '-Inf'
            if math.isnan(value):
                return 'NaN'
            return repr(float(value))

        def escape_help(text):
            ''' escape backslash and newline, as required in HELP text '''
            text = '%s' % (text,)
            return text.replace('\\', '\\\\').replace('\n', '\\n')

        def escape_label_value(text):
            ''' escape backslash, double quote and newline in a label value '''
            return escape_help(text).replace('"', '\\"')

        name = promethize(self.name)
        lines = [
            '',  # keeps the leading newline callers rely on when concatenating
            '# HELP {name} {desc}'.format(name=name,
                                          desc=escape_help(self.desc)),
            '# TYPE {name} {mtype}'.format(name=name, mtype=self.mtype),
        ]

        for labelvalues, value in self.value.items():
            if self.labelnames:
                labels = ','.join(
                    '%s="%s"' % (k, escape_label_value(v))
                    for k, v in zip(self.labelnames, labelvalues)
                )
            else:
                labels = ''
            if labels:
                fmtstr = '{name}{{{labels}}} {value}'
            else:
                fmtstr = '{name} {value}'
            lines.append(fmtstr.format(
                name=name,
                labels=labels,
                value=floatstr(value),
            ))
        return '\n'.join(lines)
127 | ||
128 | ||
class Module(MgrModule):
    """Manager module that publishes cluster metrics over HTTP.

    A set of static metrics is created at construction time; collect()
    refreshes them from the manager's state and adds one metric per
    daemon performance counter.  serve() starts a CherryPy server whose
    '/metrics' page returns the collected metrics as plain text.
    """

    COMMANDS = [
        {
            "cmd": "prometheus self-test",
            "desc": "Run a self test on the prometheus module",
            "perm": "rw"
        },
    ]

    def __init__(self, *args, **kwargs):
        super(Module, self).__init__(*args, **kwargs)
        self.notified = False
        self.serving = False
        self.metrics = self._setup_static_metrics()
        self.schema = OrderedDict()
        # Let the nested HTTP handler reach this instance.
        _global_instance['plugin'] = self

    def _stattype_to_str(self, stattype):
        """Map a perf-counter type bitmask to a Prometheus type name.

        Returns 'gauge', 'counter' or 'histogram', or '' when the masked
        type matches none of the known constants.
        """
        typeonly = stattype & self.PERFCOUNTER_TYPE_MASK
        if typeonly == 0:
            return 'gauge'
        if typeonly == self.PERFCOUNTER_LONGRUNAVG:
            # this lie matches the DaemonState decoding: only val, no counts
            return 'counter'
        if typeonly == self.PERFCOUNTER_COUNTER:
            return 'counter'
        if typeonly == self.PERFCOUNTER_HISTOGRAM:
            return 'histogram'

        return ''

    def _setup_static_metrics(self):
        """Build and return the dict of metrics that always exist.

        Keys are metric paths; values are empty Metric objects that the
        get_* methods fill in on every collect().
        """
        metrics = {}
        metrics['health_status'] = Metric(
            'untyped',
            'health_status',
            'Cluster health status'
        )
        metrics['mon_quorum_count'] = Metric(
            'gauge',
            'mon_quorum_count',
            'Monitors in quorum'
        )
        metrics['osd_metadata'] = Metric(
            'untyped',
            'osd_metadata',
            'OSD Metadata',
            OSD_METADATA
        )

        # The reason for having this separate to OSD_METADATA is
        # so that we can stably use the same tag names that
        # the Prometheus node_exporter does
        metrics['disk_occupation'] = Metric(
            # 'untyped' is the valid Prometheus TYPE keyword; the previous
            # value 'undef' is not one of the types the format defines.
            'untyped',
            'disk_occupation',
            'Associate Ceph daemon with disk used',
            DISK_OCCUPATION
        )

        metrics['pool_metadata'] = Metric(
            'untyped',
            'pool_metadata',
            'POOL Metadata',
            POOL_METADATA
        )
        for state in OSD_STATUS:
            path = 'osd_{}'.format(state)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD status {}'.format(state),
                ('ceph_daemon',)
            )
        for state in PG_STATES:
            path = 'pg_{}'.format(state)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'gauge',
                path,
                'PG {}'.format(state),
            )
        for state in DF_CLUSTER:
            path = 'cluster_{}'.format(state)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'gauge',
                path,
                'DF {}'.format(state),
            )
        for state in DF_POOL:
            path = 'pool_{}'.format(state)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'gauge',
                path,
                'DF pool {}'.format(state),
                ('pool_id',)
            )

        return metrics

    def shutdown(self):
        """Clear the serving flag."""
        self.serving = False

    def get_health(self):
        """Update 'health_status' from the JSON-encoded 'health' report."""
        health = json.loads(self.get('health')['json'])
        self.metrics['health_status'].set(
            health_status_to_number(health['status'])
        )

    def get_df(self):
        """Update the cluster_* and pool_* metrics from the 'df' report."""
        # maybe get the to-be-exported metrics from a config?
        df = self.get('df')
        for stat in DF_CLUSTER:
            path = 'cluster_{}'.format(stat)
            self.metrics[path].set(df['stats'][stat])

        for pool in df['pools']:
            for stat in DF_POOL:
                path = 'pool_{}'.format(stat)
                self.metrics[path].set(pool['stats'][stat], (pool['id'],))

    def get_quorum_status(self):
        """Update 'mon_quorum_count' with the size of the mon quorum list."""
        mon_status = json.loads(self.get('mon_status')['json'])
        self.metrics['mon_quorum_count'].set(len(mon_status['quorum']))

    def get_pg_status(self):
        """Update every pg_<state> gauge from the 'pg_summary' report.

        Summary keys are '+'-joined state combinations.  A state's gauge
        is the sum of the counts of all combinations containing it;
        states that are not reported are set to 0.
        """
        # TODO add per pool status?
        pg_summary = self.get('pg_summary')['all']

        totals = dict.fromkeys(PG_STATES, 0)
        for combined, count in pg_summary.items():
            for state in combined.split('+'):
                if state not in totals:
                    # No gauge was created for this state; skip it rather
                    # than failing the whole collection with a KeyError.
                    self.log.debug(
                        "ignoring unknown pg state {}".format(state))
                    continue
                # Accumulate: the same state appears in several combined
                # keys, so overwriting would keep only one of the counts.
                totals[state] += count

        for state, total in totals.items():
            self.metrics['pg_{}'.format(state)].set(total)

    def get_metadata_and_osd_status(self):
        """Update OSD/pool metadata, OSD status and disk occupation metrics."""
        osd_map = self.get('osd_map')
        osd_devices = self.get('osd_map_crush')['devices']

        # Index the CRUSH devices once; setdefault keeps the first entry
        # for an id, matching a first-match linear search.
        devices_by_id = {}
        for device in osd_devices:
            devices_by_id.setdefault(device['id'], device)

        for osd in osd_map['osds']:
            id_ = osd['osd']
            # Strip only the trailing ':port...' part so that a bracketed
            # IPv6 host, which itself contains colons, is kept whole.
            p_addr = osd['public_addr'].rsplit(':', 1)[0]
            c_addr = osd['cluster_addr'].rsplit(':', 1)[0]

            device = devices_by_id.get(id_)
            if device is None:
                self.log.info("osd {0} has no CRUSH device entry, exporting "
                              "an empty device class".format(id_))
                dev_class = ''
            else:
                dev_class = device.get('class', '')

            self.metrics['osd_metadata'].set(0, (
                c_addr,
                dev_class,
                id_,
                p_addr
            ))
            for state in OSD_STATUS:
                status = osd[state]
                self.metrics['osd_{}'.format(state)].set(
                    status,
                    ('osd.{}'.format(id_),))

            osd_metadata = self.get_metadata("osd", str(id_))
            dev_keys = ("backend_filestore_dev_node",
                        "bluestore_bdev_dev_node")
            osd_dev_node = None
            for dev_key in dev_keys:
                val = osd_metadata.get(dev_key, None)
                if val and val != "unknown":
                    osd_dev_node = val
                    break
            osd_hostname = osd_metadata.get('hostname', None)
            if osd_dev_node and osd_hostname:
                self.log.debug("Got dev for osd {0}: {1}/{2}".format(
                    id_, osd_hostname, osd_dev_node))
                self.metrics['disk_occupation'].set(0, (
                    osd_hostname,
                    osd_dev_node,
                    "osd.{0}".format(id_)
                ))
            else:
                self.log.info("Missing dev node metadata for osd {0}, skipping "
                              "occupation record for this osd".format(id_))

        for pool in osd_map['pools']:
            id_ = pool['pool']
            name = pool['pool_name']
            self.metrics['pool_metadata'].set(0, (id_, name))

    def collect(self):
        """Refresh all metrics and return the metrics dict.

        Runs every get_* updater, then records each daemon's performance
        counters, creating a metric the first time a counter path is
        seen.  Histogram counters and counters of unknown type are
        skipped.
        """
        self.get_health()
        self.get_df()
        self.get_quorum_status()
        self.get_metadata_and_osd_status()
        self.get_pg_status()

        # .items() works on both Python 2 and 3 (unlike .iteritems()) and
        # matches the iteration style used elsewhere in this class.
        for daemon, counters in self.get_all_perf_counters().items():
            for path, counter_info in counters.items():
                stattype = self._stattype_to_str(counter_info['type'])
                # XXX simplify first effort: no histograms
                # averages are already collapsed to one value for us
                if not stattype or stattype == 'histogram':
                    self.log.debug('ignoring %s, type %s' % (path, stattype))
                    continue

                if path not in self.metrics:
                    self.metrics[path] = Metric(
                        stattype,
                        path,
                        counter_info['description'],
                        ("ceph_daemon",),
                    )

                self.metrics[path].set(
                    counter_info['value'],
                    (daemon,)
                )

        return self.metrics

    def handle_command(self, cmd):
        """Handle a module command.

        Returns a (return code, '', message) tuple: 0 after a successful
        'prometheus self-test', -EINVAL for any other command prefix.
        """
        if cmd['prefix'] == 'prometheus self-test':
            self.collect()
            return 0, '', 'Self-test OK'
        else:
            return (-errno.EINVAL, '',
                    "Command not found '{0}'".format(cmd['prefix']))

    def serve(self):
        """Configure and start the CherryPy server, then block on it."""

        class Root(object):

            # collapse everything to '/'
            def _cp_dispatch(self, vpath):
                cherrypy.request.path = ''
                return self

            def format_metrics(self, metrics):
                """Concatenate the text form of every metric."""
                formatted = ''.join(
                    m.str_expfmt() for m in metrics.values())
                return formatted + '\n'

            @cherrypy.expose
            def index(self):
                return '''<!DOCTYPE html>
<html>
	<head><title>Ceph Exporter</title></head>
	<body>
		<h1>Ceph Exporter</h1>
		<p><a href='/metrics'>Metrics</a></p>
	</body>
</html>'''

            @cherrypy.expose
            def metrics(self):
                metrics = global_instance().collect()
                cherrypy.response.headers['Content-Type'] = 'text/plain'
                if metrics:
                    return self.format_metrics(metrics)

        server_addr = self.get_localized_config('server_addr', DEFAULT_ADDR)
        server_port = self.get_localized_config('server_port', DEFAULT_PORT)
        self.log.info(
            "server_addr: %s server_port: %s" %
            (server_addr, server_port)
        )

        cherrypy.config.update({
            'server.socket_host': server_addr,
            # int() so that a port read back as a string is accepted too
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })
        cherrypy.tree.mount(Root(), "/")
        cherrypy.engine.start()
        cherrypy.engine.block()