# ceph/src/pybind/mgr/prometheus/module.py

import cherrypy
from distutils.version import StrictVersion
import json
import errno
import math
import os
import re
import socket
import threading
import time
from mgr_module import MgrModule, MgrStandbyModule, CommandResult, PG_STATES
from mgr_util import get_default_addr
from rbd import RBD

# Defaults for the Prometheus HTTP server. Can also be set in config-key;
# see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
# for the Prometheus exporter port registry.

DEFAULT_PORT = 9283
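
# Module options can be set through the standard config mechanism, e.g.
# (illustrative values):
#   ceph config set mgr mgr/prometheus/server_addr 0.0.0.0
#   ceph config set mgr mgr/prometheus/server_port 9283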

# When the CherryPy server in 3.2.2 (and later) starts, it attempts to verify
# that the ports it's listening on are in fact bound. When using the any
# address "::" it tries both ipv4 and ipv6, and in some environments (e.g.
# kubernetes) ipv6 isn't yet configured / supported and CherryPy throws an
# uncaught exception.
if cherrypy is not None:
    v = StrictVersion(cherrypy.__version__)
    # the issue was fixed in 3.2.3. it's present in 3.2.2 (current version on
    # centos:7) and back to at least 3.0.0.
    if StrictVersion("3.1.2") <= v < StrictVersion("3.2.3"):
        # https://github.com/cherrypy/cherrypy/issues/1100
        from cherrypy.process import servers
        servers.wait_for_occupied_port = lambda host, port: None


# cherrypy likes to sys.exit on error. don't let it take us down too!
def os_exit_noop(*args, **kwargs):
    pass


os._exit = os_exit_noop

# to access things in class Module from subclass Root. Because
# it's a dict, the writer doesn't need to declare 'global' for access
_global_instance = {'plugin': None}


def global_instance():
    assert _global_instance['plugin'] is not None
    return _global_instance['plugin']

def health_status_to_number(status):
    if status == 'HEALTH_OK':
        return 0
    elif status == 'HEALTH_WARN':
        return 1
    elif status == 'HEALTH_ERR':
        return 2


DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_used_raw_bytes']

DF_POOL = ['max_avail', 'stored', 'stored_raw', 'objects', 'dirty',
           'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes']

OSD_POOL_STATS = ('recovering_objects_per_sec', 'recovering_bytes_per_sec',
                  'recovering_keys_per_sec', 'num_objects_recovered',
                  'num_bytes_recovered', 'num_keys_recovered')

OSD_FLAGS = ('noup', 'nodown', 'noout', 'noin', 'nobackfill', 'norebalance',
             'norecover', 'noscrub', 'nodeep-scrub')

FS_METADATA = ('data_pools', 'fs_id', 'metadata_pool', 'name')

MDS_METADATA = ('ceph_daemon', 'fs_id', 'hostname', 'public_addr', 'rank',
                'ceph_version')

MON_METADATA = ('ceph_daemon', 'hostname',
                'public_addr', 'rank', 'ceph_version')

MGR_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

MGR_STATUS = ('ceph_daemon',)

MGR_MODULE_STATUS = ('name',)

MGR_MODULE_CAN_RUN = ('name',)

OSD_METADATA = ('back_iface', 'ceph_daemon', 'cluster_addr', 'device_class',
                'front_iface', 'hostname', 'objectstore', 'public_addr',
                'ceph_version')

OSD_STATUS = ['weight', 'up', 'in']

OSD_STATS = ['apply_latency_ms', 'commit_latency_ms']

POOL_METADATA = ('pool_id', 'name')

RGW_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

RBD_MIRROR_METADATA = ('ceph_daemon', 'id', 'instance_id', 'hostname',
                       'ceph_version')

DISK_OCCUPATION = ('ceph_daemon', 'device', 'db_device',
                   'wal_device', 'instance')

NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']

class Metric(object):
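    '''
    A single metric in Prometheus exposition terms: a type (gauge,
    counter, untyped, ...), a name, help text, optional label names,
    and a map from label-value tuples to sample values.
    '''
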
    def __init__(self, mtype, name, desc, labels=None):
        self.mtype = mtype
        self.name = name
        self.desc = desc
        self.labelnames = labels  # tuple if present
        self.value = {}  # indexed by label values

    def clear(self):
        self.value = {}

    def set(self, value, labelvalues=None):
        # labelvalues must be a tuple
        labelvalues = labelvalues or ('',)
        self.value[labelvalues] = value

    def str_expfmt(self):

        def promethize(path):
            ''' replace illegal metric name characters '''
            result = re.sub(r'[./\s]|::', '_', path).replace('+', '_plus')

            # Hyphens usually turn into underscores, unless they are
            # trailing
            if result.endswith("-"):
                result = result[0:-1] + "_minus"
            else:
                result = result.replace("-", "_")

            return "ceph_{0}".format(result)

        def floatstr(value):
            ''' represent as Go-compatible float '''
            if value == float('inf'):
                return '+Inf'
            if value == float('-inf'):
                return '-Inf'
            if math.isnan(value):
                return 'NaN'
            return repr(float(value))

        name = promethize(self.name)
        expfmt = '''
# HELP {name} {desc}
# TYPE {name} {mtype}'''.format(
            name=name,
            desc=self.desc,
            mtype=self.mtype,
        )

        for labelvalues, value in self.value.items():
            if self.labelnames:
                labels = zip(self.labelnames, labelvalues)
                labels = ','.join('%s="%s"' % (k, v) for k, v in labels)
            else:
                labels = ''
            if labels:
                fmtstr = '\n{name}{{{labels}}} {value}'
            else:
                fmtstr = '\n{name} {value}'
            expfmt += fmtstr.format(
                name=name,
                labels=labels,
                value=floatstr(value),
            )
        return expfmt

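# For illustration, a Metric('untyped', 'osd_up', 'OSD status up',
# ('ceph_daemon',)) holding the sample {('osd.0',): 1} renders via
# str_expfmt() as:
#
#   # HELP ceph_osd_up OSD status up
#   # TYPE ceph_osd_up untyped
#   ceph_osd_up{ceph_daemon="osd.0"} 1.0

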
class Module(MgrModule):
    COMMANDS = [
        {
            "cmd": "prometheus file_sd_config",
            "desc": "Return file_sd compatible prometheus config for mgr cluster",
            "perm": "r"
        },
    ]

    MODULE_OPTIONS = [
        {'name': 'server_addr'},
        {'name': 'server_port'},
        {'name': 'scrape_interval'},
        {'name': 'rbd_stats_pools'},
        {'name': 'rbd_stats_pools_refresh_interval'},
    ]

    def __init__(self, *args, **kwargs):
        super(Module, self).__init__(*args, **kwargs)
        self.metrics = self._setup_static_metrics()
        self.shutdown_event = threading.Event()
        self.collect_lock = threading.RLock()
        self.collect_time = 0
        self.collect_timeout = 5.0
        self.collect_cache = None
        self.rbd_stats = {
            'pools': {},
            'pools_refresh_time': 0,
            'counters_info': {
                'write_ops': {'type': self.PERFCOUNTER_COUNTER,
                              'desc': 'RBD image writes count'},
                'read_ops': {'type': self.PERFCOUNTER_COUNTER,
                             'desc': 'RBD image reads count'},
                'write_bytes': {'type': self.PERFCOUNTER_COUNTER,
                                'desc': 'RBD image bytes written'},
                'read_bytes': {'type': self.PERFCOUNTER_COUNTER,
                               'desc': 'RBD image bytes read'},
                'write_latency': {'type': self.PERFCOUNTER_LONGRUNAVG,
                                  'desc': 'RBD image writes latency (msec)'},
                'read_latency': {'type': self.PERFCOUNTER_LONGRUNAVG,
                                 'desc': 'RBD image reads latency (msec)'},
            },
        }
        _global_instance['plugin'] = self

    def _setup_static_metrics(self):
        metrics = {}
        metrics['health_status'] = Metric(
            'untyped',
            'health_status',
            'Cluster health status'
        )
        metrics['mon_quorum_status'] = Metric(
            'gauge',
            'mon_quorum_status',
            'Monitors in quorum',
            ('ceph_daemon',)
        )
        metrics['fs_metadata'] = Metric(
            'untyped',
            'fs_metadata',
            'FS Metadata',
            FS_METADATA
        )
        metrics['mds_metadata'] = Metric(
            'untyped',
            'mds_metadata',
            'MDS Metadata',
            MDS_METADATA
        )
        metrics['mon_metadata'] = Metric(
            'untyped',
            'mon_metadata',
            'MON Metadata',
            MON_METADATA
        )
        metrics['mgr_metadata'] = Metric(
            'gauge',
            'mgr_metadata',
            'MGR metadata',
            MGR_METADATA
        )
        metrics['mgr_status'] = Metric(
            'gauge',
            'mgr_status',
            'MGR status (0=standby, 1=active)',
            MGR_STATUS
        )
        metrics['mgr_module_status'] = Metric(
            'gauge',
            'mgr_module_status',
            'MGR module status (0=disabled, 1=enabled, 2=auto-enabled)',
            MGR_MODULE_STATUS
        )
        metrics['mgr_module_can_run'] = Metric(
            'gauge',
            'mgr_module_can_run',
            'MGR module runnable state i.e. can it run (0=no, 1=yes)',
            MGR_MODULE_CAN_RUN
        )
        metrics['osd_metadata'] = Metric(
            'untyped',
            'osd_metadata',
            'OSD Metadata',
            OSD_METADATA
        )

        # The reason for having this separate to OSD_METADATA is
        # so that we can stably use the same tag names that
        # the Prometheus node_exporter does
        metrics['disk_occupation'] = Metric(
            'untyped',
            'disk_occupation',
            'Associate Ceph daemon with disk used',
            DISK_OCCUPATION
        )

        metrics['pool_metadata'] = Metric(
            'untyped',
            'pool_metadata',
            'POOL Metadata',
            POOL_METADATA
        )

        metrics['rgw_metadata'] = Metric(
            'untyped',
            'rgw_metadata',
            'RGW Metadata',
            RGW_METADATA
        )

        metrics['rbd_mirror_metadata'] = Metric(
            'untyped',
            'rbd_mirror_metadata',
            'RBD Mirror Metadata',
            RBD_MIRROR_METADATA
        )

        metrics['pg_total'] = Metric(
            'gauge',
            'pg_total',
            'PG Total Count per Pool',
            ('pool_id',)
        )

        metrics['scrape_duration_seconds'] = Metric(
            'gauge',
            'scrape_duration_secs',
            'Time taken to gather metrics from Ceph (secs)'
        )

        for flag in OSD_FLAGS:
            path = 'osd_flag_{}'.format(flag)
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD Flag {}'.format(flag)
            )
        for state in OSD_STATUS:
            path = 'osd_{}'.format(state)
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD status {}'.format(state),
                ('ceph_daemon',)
            )
        for stat in OSD_STATS:
            path = 'osd_{}'.format(stat)
            metrics[path] = Metric(
                'gauge',
                path,
                'OSD stat {}'.format(stat),
                ('ceph_daemon',)
            )
        for stat in OSD_POOL_STATS:
            path = 'pool_{}'.format(stat)
            metrics[path] = Metric(
                'gauge',
                path,
                "OSD POOL STATS: {}".format(stat),
                ('pool_id',)
            )
        for state in PG_STATES:
            path = 'pg_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'PG {} per pool'.format(state),
                ('pool_id',)
            )
        for state in DF_CLUSTER:
            path = 'cluster_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'DF {}'.format(state),
            )
        for state in DF_POOL:
            path = 'pool_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'DF pool {}'.format(state),
                ('pool_id',)
            )
        for state in NUM_OBJECTS:
            path = 'num_objects_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'Number of {} objects'.format(state),
            )

        return metrics

    def get_health(self):
        health = json.loads(self.get('health')['json'])
        self.metrics['health_status'].set(
            health_status_to_number(health['status'])
        )

    def get_pool_stats(self):
        # retrieve pool stats to provide per pool recovery metrics
        # (osd_pool_stats moved to mgr in Mimic)
        pstats = self.get('osd_pool_stats')
        for pool in pstats['pool_stats']:
            for stat in OSD_POOL_STATS:
                self.metrics['pool_{}'.format(stat)].set(
                    pool['recovery_rate'].get(stat, 0),
                    (pool['pool_id'],)
                )

    def get_df(self):
        # maybe get the to-be-exported metrics from a config?
        df = self.get('df')
        for stat in DF_CLUSTER:
            self.metrics['cluster_{}'.format(stat)].set(df['stats'][stat])

        for pool in df['pools']:
            for stat in DF_POOL:
                self.metrics['pool_{}'.format(stat)].set(
                    pool['stats'][stat],
                    (pool['id'],)
                )

    def get_fs(self):
        fs_map = self.get('fs_map')
        servers = self.get_service_list()
        active_daemons = []
        for fs in fs_map['filesystems']:
            # collect fs metadata
            data_pools = ",".join([str(pool)
                                   for pool in fs['mdsmap']['data_pools']])
            self.metrics['fs_metadata'].set(1, (
                data_pools,
                fs['id'],
                fs['mdsmap']['metadata_pool'],
                fs['mdsmap']['fs_name']
            ))
            self.log.debug('mdsmap: {}'.format(fs['mdsmap']))
            for gid, daemon in fs['mdsmap']['info'].items():
                id_ = daemon['name']
                host_version = servers.get((id_, 'mds'), ('', ''))
                self.metrics['mds_metadata'].set(1, (
                    'mds.{}'.format(id_), fs['id'],
                    host_version[0], daemon['addr'],
                    daemon['rank'], host_version[1]
                ))

    def get_quorum_status(self):
        mon_status = json.loads(self.get('mon_status')['json'])
        servers = self.get_service_list()
        for mon in mon_status['monmap']['mons']:
            rank = mon['rank']
            id_ = mon['name']
            host_version = servers.get((id_, 'mon'), ('', ''))
            self.metrics['mon_metadata'].set(1, (
                'mon.{}'.format(id_), host_version[0],
                mon['public_addr'].split(':')[0], rank,
                host_version[1]
            ))
            in_quorum = int(rank in mon_status['quorum'])
            self.metrics['mon_quorum_status'].set(in_quorum, (
                'mon.{}'.format(id_),
            ))

    def get_mgr_status(self):
        mgr_map = self.get('mgr_map')
        servers = self.get_service_list()

        active = mgr_map['active_name']
        standbys = [s.get('name') for s in mgr_map['standbys']]

        all_mgrs = list(standbys)
        all_mgrs.append(active)

        all_modules = {module.get('name'): module.get('can_run')
                       for module in mgr_map['available_modules']}

        ceph_release = None
        for mgr in all_mgrs:
            host_version = servers.get((mgr, 'mgr'), ('', ''))
            if mgr == active:
                _state = 1
                ceph_release = host_version[1].split()[-2]  # e.g. nautilus
            else:
                _state = 0

            self.metrics['mgr_metadata'].set(1, (
                'mgr.{}'.format(mgr), host_version[0],
                host_version[1]
            ))
            self.metrics['mgr_status'].set(_state, (
                'mgr.{}'.format(mgr),
            ))
        always_on_modules = mgr_map['always_on_modules'].get(ceph_release, [])
        active_modules = list(always_on_modules)
        active_modules.extend(mgr_map['modules'])

        for mod_name in all_modules.keys():

            if mod_name in always_on_modules:
                _state = 2
            elif mod_name in active_modules:
                _state = 1
            else:
                _state = 0

            _can_run = 1 if all_modules[mod_name] else 0
            self.metrics['mgr_module_status'].set(_state, (mod_name,))
            self.metrics['mgr_module_can_run'].set(_can_run, (mod_name,))

    def get_pg_status(self):

        pg_summary = self.get('pg_summary')

        for pool in pg_summary['by_pool']:
            total = 0
            for state_name, count in pg_summary['by_pool'][pool].items():
                # state_name is a compound state like "active+clean"; count
                # each component state separately (e.g. 32 "active+clean"
                # PGs add 32 to both pg_active and pg_clean)
                reported_states = {}

                for state in state_name.split('+'):
                    reported_states[state] = reported_states.get(
                        state, 0) + count

                for state in reported_states:
                    path = 'pg_{}'.format(state)
                    try:
                        self.metrics[path].set(reported_states[state], (pool,))
                    except KeyError:
                        self.log.warn(
                            "skipping pg in unknown state {}".format(state))

                for state in PG_STATES:
                    if state not in reported_states:
                        try:
                            self.metrics['pg_{}'.format(state)].set(0, (pool,))
                        except KeyError:
                            self.log.warn(
                                "skipping pg in unknown state {}".format(state))
                total = total + count
            self.metrics['pg_total'].set(total, (pool,))

    def get_osd_stats(self):
        osd_stats = self.get('osd_stats')
        for osd in osd_stats['osd_stats']:
            id_ = osd['osd']
            for stat in OSD_STATS:
                val = osd['perf_stat'][stat]
                self.metrics['osd_{}'.format(stat)].set(val, (
                    'osd.{}'.format(id_),
                ))

    def get_service_list(self):
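        # Build a map of (service_id, service_type) -> (hostname,
        # ceph_version) from the mgr's server inventory; callers use it
        # to attach host/version labels to daemon metadata metrics.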
        ret = {}
        for server in self.list_servers():
            version = server.get('ceph_version', '')
            host = server.get('hostname', '')
            for service in server.get('services', []):
                ret.update({(service['id'], service['type']): (host, version)})
        return ret

    def get_metadata_and_osd_status(self):
        osd_map = self.get('osd_map')
        osd_flags = osd_map['flags'].split(',')
        for flag in OSD_FLAGS:
            self.metrics['osd_flag_{}'.format(flag)].set(
                int(flag in osd_flags)
            )

        osd_devices = self.get('osd_map_crush')['devices']
        servers = self.get_service_list()
        for osd in osd_map['osds']:
            # id can be used to link osd metrics and metadata
            id_ = osd['osd']
            # collect osd metadata
            p_addr = osd['public_addr'].split(':')[0]
            c_addr = osd['cluster_addr'].split(':')[0]
            if p_addr == "-" or c_addr == "-":
                self.log.info(
                    "Missing address metadata for osd {0}, skipping occupation"
                    " and metadata records for this osd".format(id_)
                )
                continue

            dev_class = None
            for osd_device in osd_devices:
                if osd_device['id'] == id_:
                    dev_class = osd_device.get('class', '')
                    break

            if dev_class is None:
                self.log.info(
                    "OSD {0} is missing from CRUSH map, skipping output".format(
                        id_))
                continue

            host_version = servers.get((str(id_), 'osd'), ('', ''))

            # collect disk occupation metadata
            osd_metadata = self.get_metadata("osd", str(id_))
            if osd_metadata is None:
                continue

            obj_store = osd_metadata.get('osd_objectstore', '')
            f_iface = osd_metadata.get('front_iface', '')
            b_iface = osd_metadata.get('back_iface', '')

            self.metrics['osd_metadata'].set(1, (
                b_iface,
                'osd.{}'.format(id_),
                c_addr,
                dev_class,
                f_iface,
                host_version[0],
                obj_store,
                p_addr,
                host_version[1]
            ))

            # collect osd status
            for state in OSD_STATUS:
                status = osd[state]
                self.metrics['osd_{}'.format(state)].set(status, (
                    'osd.{}'.format(id_),
                ))

            osd_dev_node = None
            if obj_store == "filestore":
                # collect filestore backend device
                osd_dev_node = osd_metadata.get(
                    'backend_filestore_dev_node', None)
                # collect filestore journal device
                osd_wal_dev_node = osd_metadata.get('osd_journal', '')
                osd_db_dev_node = ''
            elif obj_store == "bluestore":
                # collect bluestore backend device
                osd_dev_node = osd_metadata.get(
                    'bluestore_bdev_dev_node', None)
                # collect bluestore wal backend
                osd_wal_dev_node = osd_metadata.get('bluefs_wal_dev_node', '')
                # collect bluestore db backend
                osd_db_dev_node = osd_metadata.get('bluefs_db_dev_node', '')
            if osd_dev_node and osd_dev_node == "unknown":
                osd_dev_node = None

            osd_hostname = osd_metadata.get('hostname', None)
            if osd_dev_node and osd_hostname:
                self.log.debug("Got dev for osd {0}: {1}/{2}".format(
                    id_, osd_hostname, osd_dev_node))
                self.metrics['disk_occupation'].set(1, (
                    "osd.{0}".format(id_),
                    osd_dev_node,
                    osd_db_dev_node,
                    osd_wal_dev_node,
                    osd_hostname
                ))
            else:
                self.log.info("Missing dev node metadata for osd {0}, skipping "
                              "occupation record for this osd".format(id_))

        pool_meta = []
        for pool in osd_map['pools']:
            self.metrics['pool_metadata'].set(
                1, (pool['pool'], pool['pool_name']))

        # Populate other servers metadata
        for key, value in servers.items():
            service_id, service_type = key
            if service_type == 'rgw':
                hostname, version = value
                self.metrics['rgw_metadata'].set(
                    1,
                    ('{}.{}'.format(service_type, service_id),
                     hostname, version)
                )
            elif service_type == 'rbd-mirror':
                mirror_metadata = self.get_metadata('rbd-mirror', service_id)
                if mirror_metadata is None:
                    continue
                mirror_metadata['ceph_daemon'] = '{}.{}'.format(service_type,
                                                                service_id)
                # label values must be a tuple (they are used as a dict key),
                # so materialize the generator here
                self.metrics['rbd_mirror_metadata'].set(
                    1, tuple(mirror_metadata.get(k, '')
                             for k in RBD_MIRROR_METADATA)
                )

    def get_num_objects(self):
        pg_sum = self.get('pg_summary')['pg_stats_sum']['stat_sum']
        for obj in NUM_OBJECTS:
            stat = 'num_objects_{}'.format(obj)
            self.metrics[stat].set(pg_sum[stat])

    def get_rbd_stats(self):
        # Per-RBD-image stats are collected by registering a dynamic osd perf
        # stats query that tells OSDs to group stats for requests associated
        # with RBD objects by pool, namespace, and image id, which are
        # extracted from the request object names or other attributes.
        # The RBD object names have the following prefixes:
        # - rbd_data.{image_id}. (data stored in the same pool as metadata)
        # - rbd_data.{pool_id}.{image_id}. (data stored in a dedicated data pool)
        # - journal_data.{pool_id}.{image_id}. (journal if journaling is enabled)
        # The pool_id in the object name is the id of the pool with the image
        # metadata, and should be used in the image spec. If there is no pool_id
        # in the object name, the image pool is the pool where the object is
        # located.

        # Parse rbd_stats_pools option, which is a comma or space separated
        # list of pool[/namespace] entries. If no namespace is specified the
        # stats are collected for every namespace in the pool.
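        # For example (hypothetical pool names), setting rbd_stats_pools
        # to "rbd rbd2/ns1" collects stats for every namespace in pool
        # "rbd" but only for namespace "ns1" in pool "rbd2".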
        pools_string = self.get_localized_module_option('rbd_stats_pools', '')
        pools = {}
        for p in [x for x in re.split(r'[\s,]+', pools_string) if x]:
            s = p.split('/', 2)
            pool_name = s[0]
            if len(s) == 1:
                # empty set means collect for all namespaces
                pools[pool_name] = set()
                continue
            if pool_name not in pools:
                pools[pool_name] = set()
            elif not pools[pool_name]:
                continue
            pools[pool_name].add(s[1])

        rbd_stats_pools = {}
        for pool_id in list(self.rbd_stats['pools']):
            name = self.rbd_stats['pools'][pool_id]['name']
            if name not in pools:
                del self.rbd_stats['pools'][pool_id]
            else:
                rbd_stats_pools[name] = \
                    self.rbd_stats['pools'][pool_id]['ns_names']

        pools_refreshed = False
        if pools:
            next_refresh = self.rbd_stats['pools_refresh_time'] + \
                self.get_localized_module_option(
                    'rbd_stats_pools_refresh_interval', 300)
            if rbd_stats_pools != pools or time.time() >= next_refresh:
                self.refresh_rbd_stats_pools(pools)
                pools_refreshed = True

        pool_ids = list(self.rbd_stats['pools'])
        pool_ids.sort()
        pool_id_regex = '^(' + '|'.join([str(x) for x in pool_ids]) + ')$'

        nspace_names = []
        for pool_id, pool in self.rbd_stats['pools'].items():
            if pool['ns_names']:
                nspace_names.extend(pool['ns_names'])
            else:
                nspace_names = []
                break
        if nspace_names:
            namespace_regex = '^(' + \
                "|".join([re.escape(x)
                          for x in set(nspace_names)]) + ')$'
        else:
            namespace_regex = '^(.*)$'

        if 'query' in self.rbd_stats and \
           (pool_id_regex != self.rbd_stats['query']['key_descriptor'][0]['regex'] or
                namespace_regex != self.rbd_stats['query']['key_descriptor'][1]['regex']):
            self.remove_osd_perf_query(self.rbd_stats['query_id'])
            del self.rbd_stats['query_id']
            del self.rbd_stats['query']

        if not self.rbd_stats['pools']:
            return

        counters_info = self.rbd_stats['counters_info']

        if 'query_id' not in self.rbd_stats:
            query = {
                'key_descriptor': [
                    {'type': 'pool_id', 'regex': pool_id_regex},
                    {'type': 'namespace', 'regex': namespace_regex},
                    {'type': 'object_name',
                     'regex': r'^(?:rbd|journal)_data\.(?:([0-9]+)\.)?([^.]+)\.'},
                ],
                'performance_counter_descriptors': list(counters_info),
            }
            query_id = self.add_osd_perf_query(query)
            if query_id is None:
                self.log.error('failed to add query %s' % query)
                return
            self.rbd_stats['query'] = query
            self.rbd_stats['query_id'] = query_id

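        # Each returned counter entry carries the matched key groups in
        # c['k'] (the pool_id, namespace and object_name regex captures, in
        # key_descriptor order) and one [value, count] pair per requested
        # performance counter in c['c'].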
        res = self.get_osd_perf_counters(self.rbd_stats['query_id'])
        for c in res['counters']:
            # if the pool id is not found in the object name use id of the
            # pool where the object is located
            if c['k'][2][0]:
                pool_id = int(c['k'][2][0])
            else:
                pool_id = int(c['k'][0][0])
            if pool_id not in self.rbd_stats['pools'] and not pools_refreshed:
                self.refresh_rbd_stats_pools(pools)
                pools_refreshed = True
            if pool_id not in self.rbd_stats['pools']:
                continue
            pool = self.rbd_stats['pools'][pool_id]
            nspace_name = c['k'][1][0]
            if nspace_name not in pool['images']:
                continue
            image_id = c['k'][2][1]
            if image_id not in pool['images'][nspace_name] and \
               not pools_refreshed:
                self.refresh_rbd_stats_pools(pools)
                pool = self.rbd_stats['pools'][pool_id]
                pools_refreshed = True
            if image_id not in pool['images'][nspace_name]:
                continue
            counters = pool['images'][nspace_name][image_id]['c']
            for i in range(len(c['c'])):
                counters[i][0] += c['c'][i][0]
                counters[i][1] += c['c'][i][1]

        label_names = ("pool", "namespace", "image")
        for pool_id, pool in self.rbd_stats['pools'].items():
            pool_name = pool['name']
            for nspace_name, images in pool['images'].items():
                for image_id in images:
                    image_name = images[image_id]['n']
                    counters = images[image_id]['c']
                    i = 0
                    for key in counters_info:
                        counter_info = counters_info[key]
                        stattype = self._stattype_to_str(counter_info['type'])
                        labels = (pool_name, nspace_name, image_name)
                        if counter_info['type'] == self.PERFCOUNTER_COUNTER:
                            path = 'rbd_' + key
                            if path not in self.metrics:
                                self.metrics[path] = Metric(
                                    stattype,
                                    path,
                                    counter_info['desc'],
                                    label_names,
                                )
                            self.metrics[path].set(counters[i][0], labels)
                        elif counter_info['type'] == self.PERFCOUNTER_LONGRUNAVG:
                            path = 'rbd_' + key + '_sum'
                            if path not in self.metrics:
                                self.metrics[path] = Metric(
                                    stattype,
                                    path,
                                    counter_info['desc'] + ' Total',
                                    label_names,
                                )
                            self.metrics[path].set(counters[i][0], labels)
                            path = 'rbd_' + key + '_count'
                            if path not in self.metrics:
                                self.metrics[path] = Metric(
                                    'counter',
                                    path,
                                    counter_info['desc'] + ' Count',
                                    label_names,
                                )
                            self.metrics[path].set(counters[i][1], labels)
                        i += 1

    def refresh_rbd_stats_pools(self, pools):
        self.log.debug('refreshing rbd pools %s' % (pools))

        rbd = RBD()
        counters_info = self.rbd_stats['counters_info']
        for pool_name, cfg_ns_names in pools.items():
            try:
                pool_id = self.rados.pool_lookup(pool_name)
                with self.rados.open_ioctx(pool_name) as ioctx:
                    if pool_id not in self.rbd_stats['pools']:
                        self.rbd_stats['pools'][pool_id] = {'images': {}}
                    pool = self.rbd_stats['pools'][pool_id]
                    pool['name'] = pool_name
                    pool['ns_names'] = cfg_ns_names
                    if cfg_ns_names:
                        nspace_names = list(cfg_ns_names)
                    else:
                        nspace_names = [''] + rbd.namespace_list(ioctx)
                    # iterate over a copy: entries may be deleted while
                    # walking the dict
                    for nspace_name in list(pool['images']):
                        if nspace_name not in nspace_names:
                            del pool['images'][nspace_name]
                    for nspace_name in nspace_names:
                        if (nspace_name and
                                not rbd.namespace_exists(ioctx, nspace_name)):
                            self.log.debug('unknown namespace %s for pool %s' %
                                           (nspace_name, pool_name))
                            continue
                        ioctx.set_namespace(nspace_name)
                        if nspace_name not in pool['images']:
                            pool['images'][nspace_name] = {}
                        namespace = pool['images'][nspace_name]
                        images = {}
                        for image_meta in rbd.list2(ioctx):
                            image = {'n': image_meta['name']}
                            image_id = image_meta['id']
                            if image_id in namespace:
                                image['c'] = namespace[image_id]['c']
                            else:
                                image['c'] = [[0, 0] for x in counters_info]
                            images[image_id] = image
                        pool['images'][nspace_name] = images
            except Exception as e:
                self.log.error('failed listing pool %s: %s' % (pool_name, e))
        self.rbd_stats['pools_refresh_time'] = time.time()

    def shutdown_rbd_stats(self):
        if 'query_id' in self.rbd_stats:
            self.remove_osd_perf_query(self.rbd_stats['query_id'])
            del self.rbd_stats['query_id']
            del self.rbd_stats['query']
        self.rbd_stats['pools'].clear()

    def collect(self):
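        # Gather everything in one pass: clear stale samples, run each
        # collector, fold in per-daemon perf counters, and return the
        # metrics rendered in Prometheus exposition format.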
        # Clear the metrics before scraping
        for k in self.metrics.keys():
            self.metrics[k].clear()

        _start_time = time.time()

        self.get_health()
        self.get_df()
        self.get_pool_stats()
        self.get_fs()
        self.get_osd_stats()
        self.get_quorum_status()
        self.get_mgr_status()
        self.get_metadata_and_osd_status()
        self.get_pg_status()
        self.get_num_objects()

        for daemon, counters in self.get_all_perf_counters().items():
            for path, counter_info in counters.items():
                # Skip histograms, they are represented by long running avgs
                stattype = self._stattype_to_str(counter_info['type'])
                if not stattype or stattype == 'histogram':
                    self.log.debug('ignoring %s, type %s' % (path, stattype))
                    continue

                path, label_names, labels = self._perfpath_to_path_labels(
                    daemon, path)

                # Get the value of the counter
                value = self._perfvalue_to_value(
                    counter_info['type'], counter_info['value'])

                # Represent the long running avgs as sum/count pairs
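                # (e.g. a hypothetical op_latency average becomes
                # op_latency_sum and op_latency_count, from which PromQL
                # can derive a mean via rate(sum) / rate(count))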
                if counter_info['type'] & self.PERFCOUNTER_LONGRUNAVG:
                    _path = path + '_sum'
                    if _path not in self.metrics:
                        self.metrics[_path] = Metric(
                            stattype,
                            _path,
                            counter_info['description'] + ' Total',
                            label_names,
                        )
                    self.metrics[_path].set(value, labels)

                    _path = path + '_count'
                    if _path not in self.metrics:
                        self.metrics[_path] = Metric(
                            'counter',
                            _path,
                            counter_info['description'] + ' Count',
                            label_names,
                        )
                    self.metrics[_path].set(counter_info['count'], labels)
                else:
                    if path not in self.metrics:
                        self.metrics[path] = Metric(
                            stattype,
                            path,
                            counter_info['description'],
                            label_names,
                        )
                    self.metrics[path].set(value, labels)

        self.get_rbd_stats()

        _end_time = time.time()
        self.metrics['scrape_duration_seconds'].set(_end_time - _start_time)

        # Return formatted metrics and clear no longer used data
        _metrics = [m.str_expfmt() for m in self.metrics.values()]
        for k in self.metrics.keys():
            self.metrics[k].clear()

        return ''.join(_metrics) + '\n'

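    # A sketch of the JSON emitted by get_file_sd_config() below
    # (hostnames are illustrative):
    #   [{"targets": ["mgr-a:9283", "mgr-b:9283"], "labels": {}}]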
    def get_file_sd_config(self):
        servers = self.list_servers()
        targets = []
        for server in servers:
            hostname = server.get('hostname', '')
            for service in server.get('services', []):
                if service['type'] != 'mgr':
                    continue
                id_ = service['id']
                # get port for prometheus module at mgr with id_
                # TODO use get_config_prefix or get_config here once
                # https://github.com/ceph/ceph/pull/20458 is merged
                result = CommandResult("")
                global_instance().send_command(
                    result, "mon", '',
                    json.dumps({
                        "prefix": "config-key get",
                        'key': "config/mgr/mgr/prometheus/{}/server_port".format(id_),
                    }),
                    "")
                r, outb, outs = result.wait()
                if r != 0:
                    global_instance().log.error(
                        "Failed to retrieve port for mgr {}: {}".format(
                            id_, outs))
                    targets.append('{}:{}'.format(hostname, DEFAULT_PORT))
                else:
                    port = json.loads(outb)
                    targets.append('{}:{}'.format(hostname, port))

        ret = [
            {
                "targets": targets,
                "labels": {}
            }
        ]
        return 0, json.dumps(ret), ""

    def self_test(self):
        self.collect()
        self.get_file_sd_config()

    def handle_command(self, inbuf, cmd):
        if cmd['prefix'] == 'prometheus file_sd_config':
            return self.get_file_sd_config()
        else:
            return (-errno.EINVAL, '',
                    "Command not found '{0}'".format(cmd['prefix']))

    def serve(self):

        class Root(object):

            # collapse everything to '/'
            def _cp_dispatch(self, vpath):
                cherrypy.request.path = ''
                return self

            @cherrypy.expose
            def index(self):
                return '''<!DOCTYPE html>
<html>
    <head><title>Ceph Exporter</title></head>
    <body>
        <h1>Ceph Exporter</h1>
        <p><a href='/metrics'>Metrics</a></p>
    </body>
</html>'''

            @cherrypy.expose
            def metrics(self):
                instance = global_instance()
                # Lock the function execution
                with instance.collect_lock:
                    return self._metrics(instance)

            @staticmethod
            def _metrics(instance):
                # Return cached data if available and collected before
                # the cache times out
                if instance.collect_cache and \
                   time.time() - instance.collect_time < instance.collect_timeout:
                    cherrypy.response.headers['Content-Type'] = 'text/plain'
                    return instance.collect_cache

                if instance.have_mon_connection():
                    instance.collect_cache = None
                    instance.collect_time = time.time()
                    instance.collect_cache = instance.collect()
                    cherrypy.response.headers['Content-Type'] = 'text/plain'
                    return instance.collect_cache
                else:
                    raise cherrypy.HTTPError(503, 'No MON connection')

        # Make the cache timeout for collecting configurable
        self.collect_timeout = float(self.get_localized_module_option(
            'scrape_interval', 5.0))

        server_addr = self.get_localized_module_option(
            'server_addr', get_default_addr())
        server_port = self.get_localized_module_option(
            'server_port', DEFAULT_PORT)
        self.log.info(
            "server_addr: %s server_port: %s" %
            (server_addr, server_port)
        )

        # Publish the URI that others may use to access the service we're
        # about to start serving
        self.set_uri('http://{0}:{1}/'.format(
            socket.getfqdn() if server_addr in ['::', '0.0.0.0'] else server_addr,
            server_port
        ))

        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })
        cherrypy.tree.mount(Root(), "/")
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info('Engine started.')
        # wait for the shutdown event
        self.shutdown_event.wait()
        self.shutdown_event.clear()
        cherrypy.engine.stop()
        self.log.info('Engine stopped.')
        self.shutdown_rbd_stats()

    def shutdown(self):
        self.log.info('Stopping engine...')
        self.shutdown_event.set()


class StandbyModule(MgrStandbyModule):
    def __init__(self, *args, **kwargs):
        super(StandbyModule, self).__init__(*args, **kwargs)
        self.shutdown_event = threading.Event()

    def serve(self):
        server_addr = self.get_localized_module_option(
            'server_addr', get_default_addr())
        server_port = self.get_localized_module_option(
            'server_port', DEFAULT_PORT)
        self.log.info("server_addr: %s server_port: %s" %
                      (server_addr, server_port))
        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })

        module = self

        class Root(object):
            @cherrypy.expose
            def index(self):
                active_uri = module.get_active_uri()
                return '''<!DOCTYPE html>
<html>
    <head><title>Ceph Exporter</title></head>
    <body>
        <h1>Ceph Exporter</h1>
        <p><a href='{}metrics'>Metrics</a></p>
    </body>
</html>'''.format(active_uri)

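            # A standby mgr intentionally serves an empty /metrics body;
            # the index page above points clients at the active mgr.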
            @cherrypy.expose
            def metrics(self):
                cherrypy.response.headers['Content-Type'] = 'text/plain'
                return ''

        cherrypy.tree.mount(Root(), '/', {})
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info('Engine started.')
        # Wait for shutdown event
        self.shutdown_event.wait()
        self.shutdown_event.clear()
        cherrypy.engine.stop()
        self.log.info('Engine stopped.')

    def shutdown(self):
        self.log.info("Stopping engine...")
        self.shutdown_event.set()
        self.log.info("Stopped engine")