import cherrypy
from distutils.version import StrictVersion
import json
import errno
import math
import os
import re
import socket
import threading
import time
from mgr_module import MgrModule, MgrStandbyModule, CommandResult, PG_STATES
from mgr_util import get_default_addr
from rbd import RBD

# Defaults for the Prometheus HTTP server.  These can also be set in the
# config-key store; see
# https://github.com/prometheus/prometheus/wiki/Default-port-allocations
# for the Prometheus exporter port registry.

DEFAULT_PORT = 9283

# When the CherryPy server in 3.2.2 (and later) starts, it attempts to verify
# that the ports it's listening on are in fact bound. When using the any
# address "::" it tries both ipv4 and ipv6, and in some environments (e.g.
# kubernetes) ipv6 isn't yet configured / supported and CherryPy throws an
# uncaught exception.
if cherrypy is not None:
    v = StrictVersion(cherrypy.__version__)
    # the issue was fixed in 3.2.3. it's present in 3.2.2 (the current
    # version on centos:7) and back to at least 3.0.0.
    if StrictVersion("3.1.2") <= v < StrictVersion("3.2.3"):
        # https://github.com/cherrypy/cherrypy/issues/1100
        from cherrypy.process import servers
        servers.wait_for_occupied_port = lambda host, port: None


# cherrypy likes to sys.exit on error.  don't let it take us down too!
def os_exit_noop(*args, **kwargs):
    pass


os._exit = os_exit_noop

# Used to access members of class Module from the nested class Root. Because
# it's a dict, writers don't need to declare 'global' to assign into it.

_global_instance = {'plugin': None}


def global_instance():
    assert _global_instance['plugin'] is not None
    return _global_instance['plugin']


def health_status_to_number(status):
    if status == 'HEALTH_OK':
        return 0
    elif status == 'HEALTH_WARN':
        return 1
    elif status == 'HEALTH_ERR':
        return 2


DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_used_raw_bytes']

DF_POOL = ['max_avail', 'stored', 'stored_raw', 'objects', 'dirty',
           'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes']

OSD_POOL_STATS = ('recovering_objects_per_sec', 'recovering_bytes_per_sec',
                  'recovering_keys_per_sec', 'num_objects_recovered',
                  'num_bytes_recovered', 'num_keys_recovered')

OSD_FLAGS = ('noup', 'nodown', 'noout', 'noin', 'nobackfill', 'norebalance',
             'norecover', 'noscrub', 'nodeep-scrub')

FS_METADATA = ('data_pools', 'fs_id', 'metadata_pool', 'name')

MDS_METADATA = ('ceph_daemon', 'fs_id', 'hostname', 'public_addr', 'rank',
                'ceph_version')

MON_METADATA = ('ceph_daemon', 'hostname',
                'public_addr', 'rank', 'ceph_version')

MGR_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

MGR_STATUS = ('ceph_daemon',)

MGR_MODULE_STATUS = ('name',)

MGR_MODULE_CAN_RUN = ('name',)

OSD_METADATA = ('back_iface', 'ceph_daemon', 'cluster_addr', 'device_class',
                'front_iface', 'hostname', 'objectstore', 'public_addr',
                'ceph_version')

OSD_STATUS = ['weight', 'up', 'in']

OSD_STATS = ['apply_latency_ms', 'commit_latency_ms']

POOL_METADATA = ('pool_id', 'name')

RGW_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

RBD_MIRROR_METADATA = ('ceph_daemon', 'id', 'instance_id', 'hostname',
                       'ceph_version')

DISK_OCCUPATION = ('ceph_daemon', 'device', 'db_device',
                   'wal_device', 'instance')

NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']


class Metric(object):
    def __init__(self, mtype, name, desc, labels=None):
        self.mtype = mtype
        self.name = name
        self.desc = desc
        self.labelnames = labels  # tuple if present
        self.value = {}  # indexed by label values

    def clear(self):
        self.value = {}

    def set(self, value, labelvalues=None):
        # labelvalues must be a tuple
        labelvalues = labelvalues or ('',)
        self.value[labelvalues] = value

    def str_expfmt(self):

        def promethize(path):
            ''' replace illegal metric name characters '''
            result = re.sub(r'[./\s]|::', '_', path).replace('+', '_plus')

            # Hyphens usually turn into underscores, unless they are
            # trailing
            if result.endswith("-"):
                result = result[0:-1] + "_minus"
            else:
                result = result.replace("-", "_")

            return "ceph_{0}".format(result)

        def floatstr(value):
            ''' represent as Go-compatible float '''
            if value == float('inf'):
                return '+Inf'
            if value == float('-inf'):
                return '-Inf'
            if math.isnan(value):
                return 'NaN'
            return repr(float(value))

        name = promethize(self.name)
        expfmt = '''
# HELP {name} {desc}
# TYPE {name} {mtype}'''.format(
            name=name,
            desc=self.desc,
            mtype=self.mtype,
        )
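        # The HELP/TYPE header above is followed by one sample line per
        # label-value tuple, producing Prometheus exposition text such as
        # (illustrative):
        #   # HELP ceph_health_status Cluster health status
        #   # TYPE ceph_health_status untyped
        #   ceph_health_status 0.0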
161
162 for labelvalues, value in self.value.items():
163 if self.labelnames:
164 labels = zip(self.labelnames, labelvalues)
165 labels = ','.join('%s="%s"' % (k, v) for k, v in labels)
166 else:
167 labels = ''
168 if labels:
169 fmtstr = '\n{name}{{{labels}}} {value}'
170 else:
171 fmtstr = '\n{name} {value}'
172 expfmt += fmtstr.format(
173 name=name,
174 labels=labels,
175 value=floatstr(value),
176 )
177 return expfmt
178
179
180class Module(MgrModule):
181 COMMANDS = [
182 {
11fdf7f2
TL
183 "cmd": "prometheus file_sd_config",
184 "desc": "Return file_sd compatible prometheus config for mgr cluster",
185 "perm": "r"
91327a77
AA
186 },
187 ]
188
11fdf7f2
TL
189 MODULE_OPTIONS = [
190 {'name': 'server_addr'},
191 {'name': 'server_port'},
192 {'name': 'scrape_interval'},
193 {'name': 'rbd_stats_pools'},
194 {'name': 'rbd_stats_pools_refresh_interval'},
91327a77
AA
195 ]
196
197 def __init__(self, *args, **kwargs):
198 super(Module, self).__init__(*args, **kwargs)
199 self.metrics = self._setup_static_metrics()
200 self.shutdown_event = threading.Event()
201 self.collect_lock = threading.RLock()
202 self.collect_time = 0
203 self.collect_timeout = 5.0
204 self.collect_cache = None
11fdf7f2
TL
205 self.rbd_stats = {
206 'pools': {},
207 'pools_refresh_time': 0,
208 'counters_info': {
209 'write_ops': {'type': self.PERFCOUNTER_COUNTER,
210 'desc': 'RBD image writes count'},
211 'read_ops': {'type': self.PERFCOUNTER_COUNTER,
212 'desc': 'RBD image reads count'},
213 'write_bytes': {'type': self.PERFCOUNTER_COUNTER,
214 'desc': 'RBD image bytes written'},
215 'read_bytes': {'type': self.PERFCOUNTER_COUNTER,
216 'desc': 'RBD image bytes read'},
217 'write_latency': {'type': self.PERFCOUNTER_LONGRUNAVG,
218 'desc': 'RBD image writes latency (msec)'},
219 'read_latency': {'type': self.PERFCOUNTER_LONGRUNAVG,
220 'desc': 'RBD image reads latency (msec)'},
221 },
222 }
91327a77 223 _global_instance['plugin'] = self
3efd9988
FG
224
    def _setup_static_metrics(self):
        metrics = {}
        metrics['health_status'] = Metric(
            'untyped',
            'health_status',
            'Cluster health status'
        )
        metrics['mon_quorum_status'] = Metric(
            'gauge',
            'mon_quorum_status',
            'Monitors in quorum',
            ('ceph_daemon',)
        )
        metrics['fs_metadata'] = Metric(
            'untyped',
            'fs_metadata',
            'FS Metadata',
            FS_METADATA
        )
        metrics['mds_metadata'] = Metric(
            'untyped',
            'mds_metadata',
            'MDS Metadata',
            MDS_METADATA
        )
        metrics['mon_metadata'] = Metric(
            'untyped',
            'mon_metadata',
            'MON Metadata',
            MON_METADATA
        )
        metrics['mgr_metadata'] = Metric(
            'gauge',
            'mgr_metadata',
            'MGR metadata',
            MGR_METADATA
        )
        metrics['mgr_status'] = Metric(
            'gauge',
            'mgr_status',
            'MGR status (0=standby, 1=active)',
            MGR_STATUS
        )
        metrics['mgr_module_status'] = Metric(
            'gauge',
            'mgr_module_status',
            'MGR module status (0=disabled, 1=enabled, 2=auto-enabled)',
            MGR_MODULE_STATUS
        )
        metrics['mgr_module_can_run'] = Metric(
            'gauge',
            'mgr_module_can_run',
            'MGR module runnable state i.e. can it run (0=no, 1=yes)',
            MGR_MODULE_CAN_RUN
        )
        metrics['osd_metadata'] = Metric(
            'untyped',
            'osd_metadata',
            'OSD Metadata',
            OSD_METADATA
        )

        # The reason for having this separate from OSD_METADATA is
        # so that we can stably use the same tag names that
        # the Prometheus node_exporter does
        metrics['disk_occupation'] = Metric(
            'untyped',
            'disk_occupation',
            'Associate Ceph daemon with disk used',
            DISK_OCCUPATION
        )

        metrics['pool_metadata'] = Metric(
            'untyped',
            'pool_metadata',
            'POOL Metadata',
            POOL_METADATA
        )

        metrics['rgw_metadata'] = Metric(
            'untyped',
            'rgw_metadata',
            'RGW Metadata',
            RGW_METADATA
        )

        metrics['rbd_mirror_metadata'] = Metric(
            'untyped',
            'rbd_mirror_metadata',
            'RBD Mirror Metadata',
            RBD_MIRROR_METADATA
        )

        metrics['pg_total'] = Metric(
            'gauge',
            'pg_total',
            'PG Total Count per Pool',
            ('pool_id',)
        )

        for flag in OSD_FLAGS:
            path = 'osd_flag_{}'.format(flag)
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD Flag {}'.format(flag)
            )
        for state in OSD_STATUS:
            path = 'osd_{}'.format(state)
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD status {}'.format(state),
                ('ceph_daemon',)
            )
        for stat in OSD_STATS:
            path = 'osd_{}'.format(stat)
            metrics[path] = Metric(
                'gauge',
                path,
                'OSD stat {}'.format(stat),
                ('ceph_daemon',)
            )
        for stat in OSD_POOL_STATS:
            path = 'pool_{}'.format(stat)
            metrics[path] = Metric(
                'gauge',
                path,
                "OSD pool stats: {}".format(stat),
                ('pool_id',)
            )
        for state in PG_STATES:
            path = 'pg_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'PG {} per pool'.format(state),
                ('pool_id',)
            )
        for state in DF_CLUSTER:
            path = 'cluster_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'DF {}'.format(state),
            )
        for state in DF_POOL:
            path = 'pool_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'DF pool {}'.format(state),
                ('pool_id',)
            )
        for state in NUM_OBJECTS:
            path = 'num_objects_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'Number of {} objects'.format(state),
            )

        return metrics

    def get_health(self):
        health = json.loads(self.get('health')['json'])
        self.metrics['health_status'].set(
            health_status_to_number(health['status'])
        )

    def get_pool_stats(self):
        # retrieve pool stats to provide per pool recovery metrics
        # (osd_pool_stats moved to mgr in Mimic)
        pstats = self.get('osd_pool_stats')
        for pool in pstats['pool_stats']:
            for stat in OSD_POOL_STATS:
                self.metrics['pool_{}'.format(stat)].set(
                    pool['recovery_rate'].get(stat, 0),
                    (pool['pool_id'],)
                )

    def get_df(self):
        # maybe get the to-be-exported metrics from a config?
        df = self.get('df')
        for stat in DF_CLUSTER:
            self.metrics['cluster_{}'.format(stat)].set(df['stats'][stat])

        for pool in df['pools']:
            for stat in DF_POOL:
                self.metrics['pool_{}'.format(stat)].set(
                    pool['stats'][stat],
                    (pool['id'],)
                )

    def get_fs(self):
        fs_map = self.get('fs_map')
        servers = self.get_service_list()
        self.log.debug('standbys: {}'.format(fs_map['standbys']))
        # export standby mds metadata, default standby fs_id is '-1'
        for standby in fs_map['standbys']:
            id_ = standby['name']
            host_version = servers.get((id_, 'mds'), ('', ''))
            self.metrics['mds_metadata'].set(1, (
                'mds.{}'.format(id_), '-1',
                host_version[0], standby['addr'],
                standby['rank'], host_version[1]
            ))
        for fs in fs_map['filesystems']:
            # collect fs metadata
            data_pools = ",".join([str(pool)
                                   for pool in fs['mdsmap']['data_pools']])
            self.metrics['fs_metadata'].set(1, (
                data_pools,
                fs['id'],
                fs['mdsmap']['metadata_pool'],
                fs['mdsmap']['fs_name']
            ))
            self.log.debug('mdsmap: {}'.format(fs['mdsmap']))
            for gid, daemon in fs['mdsmap']['info'].items():
                id_ = daemon['name']
                host_version = servers.get((id_, 'mds'), ('', ''))
                self.metrics['mds_metadata'].set(1, (
                    'mds.{}'.format(id_), fs['id'],
                    host_version[0], daemon['addr'],
                    daemon['rank'], host_version[1]
                ))

    def get_quorum_status(self):
        mon_status = json.loads(self.get('mon_status')['json'])
        servers = self.get_service_list()
        for mon in mon_status['monmap']['mons']:
            rank = mon['rank']
            id_ = mon['name']
            host_version = servers.get((id_, 'mon'), ('', ''))
            self.metrics['mon_metadata'].set(1, (
                'mon.{}'.format(id_), host_version[0],
                mon['public_addr'].split(':')[0], rank,
                host_version[1]
            ))
            in_quorum = int(rank in mon_status['quorum'])
            self.metrics['mon_quorum_status'].set(in_quorum, (
                'mon.{}'.format(id_),
            ))

    def get_mgr_status(self):
        mgr_map = self.get('mgr_map')
        servers = self.get_service_list()

        active = mgr_map['active_name']
        standbys = [s.get('name') for s in mgr_map['standbys']]

        all_mgrs = list(standbys)
        all_mgrs.append(active)

        all_modules = {module.get('name'): module.get('can_run')
                       for module in mgr_map['available_modules']}

        ceph_release = None
        for mgr in all_mgrs:
            host_version = servers.get((mgr, 'mgr'), ('', ''))
            if mgr == active:
                _state = 1
                ceph_release = host_version[1].split()[-2]  # e.g. nautilus
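                # host_version[1] is the full version banner, typically of
                # the form "ceph version X.Y.Z (<sha>) <release> (stable)"
                # (illustrative), so the second-to-last token is the
                # release name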
            else:
                _state = 0

            self.metrics['mgr_metadata'].set(1, (
                'mgr.{}'.format(mgr), host_version[0],
                host_version[1]
            ))
            self.metrics['mgr_status'].set(_state, (
                'mgr.{}'.format(mgr),
            ))
        always_on_modules = mgr_map['always_on_modules'].get(ceph_release, [])
        active_modules = list(always_on_modules)
        active_modules.extend(mgr_map['modules'])

        for mod_name in all_modules.keys():

            if mod_name in always_on_modules:
                _state = 2
            elif mod_name in active_modules:
                _state = 1
            else:
                _state = 0

            _can_run = 1 if all_modules[mod_name] else 0
            self.metrics['mgr_module_status'].set(_state, (mod_name,))
            self.metrics['mgr_module_can_run'].set(_can_run, (mod_name,))

    def get_pg_status(self):

        pg_summary = self.get('pg_summary')

        for pool in pg_summary['by_pool']:
            total = 0
            for state_name, count in pg_summary['by_pool'][pool].items():
                reported_states = {}

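                # pg states arrive as '+'-joined compounds such as
                # "active+clean"; credit the full count to each component
                # state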
                for state in state_name.split('+'):
                    reported_states[state] = reported_states.get(
                        state, 0) + count

                for state in reported_states:
                    path = 'pg_{}'.format(state)
                    try:
                        self.metrics[path].set(reported_states[state], (pool,))
                    except KeyError:
                        self.log.warning(
                            "skipping pg in unknown state {}".format(state))

                for state in PG_STATES:
                    if state not in reported_states:
                        try:
                            self.metrics['pg_{}'.format(state)].set(0, (pool,))
                        except KeyError:
                            self.log.warning(
                                "skipping pg in unknown state {}".format(state))
                total = total + count
            self.metrics['pg_total'].set(total, (pool,))

    def get_osd_stats(self):
        osd_stats = self.get('osd_stats')
        for osd in osd_stats['osd_stats']:
            id_ = osd['osd']
            for stat in OSD_STATS:
                val = osd['perf_stat'][stat]
                self.metrics['osd_{}'.format(stat)].set(val, (
                    'osd.{}'.format(id_),
                ))

    def get_service_list(self):
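        # map (service_id, service_type) -> (hostname, ceph_version) for
        # every service registered with the cluster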
        ret = {}
        for server in self.list_servers():
            version = server.get('ceph_version', '')
            host = server.get('hostname', '')
            for service in server.get('services', []):
                ret.update({(service['id'], service['type']): (host, version)})
        return ret

    def get_metadata_and_osd_status(self):
        osd_map = self.get('osd_map')
        osd_flags = osd_map['flags'].split(',')
        for flag in OSD_FLAGS:
            self.metrics['osd_flag_{}'.format(flag)].set(
                int(flag in osd_flags)
            )

        osd_devices = self.get('osd_map_crush')['devices']
        servers = self.get_service_list()
        for osd in osd_map['osds']:
            # id can be used to link osd metrics and metadata
            id_ = osd['osd']
            # collect osd metadata
            p_addr = osd['public_addr'].split(':')[0]
            c_addr = osd['cluster_addr'].split(':')[0]
            if p_addr == "-" or c_addr == "-":
                self.log.info(
                    "Missing address metadata for osd {0}, skipping occupation"
                    " and metadata records for this osd".format(id_)
                )
                continue

            dev_class = None
            for osd_device in osd_devices:
                if osd_device['id'] == id_:
                    dev_class = osd_device.get('class', '')
                    break

            if dev_class is None:
                self.log.info("OSD {0} is missing from CRUSH map, "
                              "skipping output".format(id_))
                continue

            host_version = servers.get((str(id_), 'osd'), ('', ''))

            # collect disk occupation metadata
            osd_metadata = self.get_metadata("osd", str(id_))
            if osd_metadata is None:
                continue

            obj_store = osd_metadata.get('osd_objectstore', '')
            f_iface = osd_metadata.get('front_iface', '')
            b_iface = osd_metadata.get('back_iface', '')

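            # label values below are ordered to match the OSD_METADATA
            # label-name tuple defined at the top of the module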
            self.metrics['osd_metadata'].set(1, (
                b_iface,
                'osd.{}'.format(id_),
                c_addr,
                dev_class,
                f_iface,
                host_version[0],
                obj_store,
                p_addr,
                host_version[1]
            ))

            # collect osd status
            for state in OSD_STATUS:
                status = osd[state]
                self.metrics['osd_{}'.format(state)].set(status, (
                    'osd.{}'.format(id_),
                ))

            osd_dev_node = None
            if obj_store == "filestore":
                # collect filestore backend device
                osd_dev_node = osd_metadata.get(
                    'backend_filestore_dev_node', None)
                # collect filestore journal device
                osd_wal_dev_node = osd_metadata.get('osd_journal', '')
                osd_db_dev_node = ''
            elif obj_store == "bluestore":
                # collect bluestore backend device
                osd_dev_node = osd_metadata.get(
                    'bluestore_bdev_dev_node', None)
                # collect bluestore wal backend
                osd_wal_dev_node = osd_metadata.get('bluefs_wal_dev_node', '')
                # collect bluestore db backend
                osd_db_dev_node = osd_metadata.get('bluefs_db_dev_node', '')
            if osd_dev_node and osd_dev_node == "unknown":
                osd_dev_node = None

            osd_hostname = osd_metadata.get('hostname', None)
            if osd_dev_node and osd_hostname:
                self.log.debug("Got dev for osd {0}: {1}/{2}".format(
                    id_, osd_hostname, osd_dev_node))
                self.metrics['disk_occupation'].set(1, (
                    "osd.{0}".format(id_),
                    osd_dev_node,
                    osd_db_dev_node,
                    osd_wal_dev_node,
                    osd_hostname
                ))
            else:
                self.log.info("Missing dev node metadata for osd {0}, skipping "
                              "occupation record for this osd".format(id_))

        for pool in osd_map['pools']:
            self.metrics['pool_metadata'].set(
                1, (pool['pool'], pool['pool_name']))

        # Populate other servers metadata
        for key, value in servers.items():
            service_id, service_type = key
            if service_type == 'rgw':
                hostname, version = value
                self.metrics['rgw_metadata'].set(
                    1,
                    ('{}.{}'.format(service_type, service_id),
                     hostname, version)
                )
            elif service_type == 'rbd-mirror':
                mirror_metadata = self.get_metadata('rbd-mirror', service_id)
                if mirror_metadata is None:
                    continue
                mirror_metadata['ceph_daemon'] = '{}.{}'.format(service_type,
                                                                service_id)
                # materialize as a tuple: Metric.set() uses labelvalues as
                # a dict key, so a bare generator would defeat de-duplication
                self.metrics['rbd_mirror_metadata'].set(
                    1, tuple(mirror_metadata.get(k, '')
                             for k in RBD_MIRROR_METADATA)
                )

    def get_num_objects(self):
        pg_sum = self.get('pg_summary')['pg_stats_sum']['stat_sum']
        for obj in NUM_OBJECTS:
            stat = 'num_objects_{}'.format(obj)
            self.metrics[stat].set(pg_sum[stat])

    def get_rbd_stats(self):
        # Per RBD image stats are collected by registering a dynamic osd perf
        # stats query that tells OSDs to group stats for requests associated
        # with RBD objects by pool, namespace, and image id, which are
        # extracted from the request object names or other attributes.
        # The RBD object names have the following prefixes:
        #   - rbd_data.{image_id}. (data stored in the same pool as metadata)
        #   - rbd_data.{pool_id}.{image_id}. (data stored in a dedicated data pool)
        #   - journal_data.{pool_id}.{image_id}. (journal if journaling is enabled)
        # The pool_id in the object name is the id of the pool with the image
        # metadata, and should be used in the image spec. If there is no pool_id
        # in the object name, the image pool is the pool where the object is
        # located.
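        # For example (illustrative object name), "rbd_data.5.1234abcd."
        # prefixes data objects of image id 1234abcd whose metadata lives
        # in the pool with id 5.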

        # Parse rbd_stats_pools option, which is a comma or space separated
        # list of pool[/namespace] entries. If no namespace is specified the
        # stats are collected for every namespace in the pool.
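        # e.g. (hypothetical setting) "rbd,images/ns1 images/ns2" collects
        # every namespace in pool "rbd" but only ns1 and ns2 in pool
        # "images".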
        pools_string = self.get_localized_module_option('rbd_stats_pools', '')
        pools = {}
        for p in [x for x in re.split(r'[\s,]+', pools_string) if x]:
            s = p.split('/', 2)
            pool_name = s[0]
            if len(s) == 1:
                # empty set means collect for all namespaces
                pools[pool_name] = set()
                continue
            if pool_name not in pools:
                pools[pool_name] = set()
            elif not pools[pool_name]:
                continue
            pools[pool_name].add(s[1])

        rbd_stats_pools = {}
        for pool_id in list(self.rbd_stats['pools']):
            name = self.rbd_stats['pools'][pool_id]['name']
            if name not in pools:
                del self.rbd_stats['pools'][pool_id]
            else:
                rbd_stats_pools[name] = \
                    self.rbd_stats['pools'][pool_id]['ns_names']

        pools_refreshed = False
        if pools:
            next_refresh = self.rbd_stats['pools_refresh_time'] + \
                self.get_localized_module_option(
                    'rbd_stats_pools_refresh_interval', 300)
            if rbd_stats_pools != pools or time.time() >= next_refresh:
                self.refresh_rbd_stats_pools(pools)
                pools_refreshed = True

        pool_ids = list(self.rbd_stats['pools'])
        pool_ids.sort()
        pool_id_regex = '^(' + '|'.join([str(x) for x in pool_ids]) + ')$'

        nspace_names = []
        for pool_id, pool in self.rbd_stats['pools'].items():
            if pool['ns_names']:
                nspace_names.extend(pool['ns_names'])
            else:
                nspace_names = []
                break
        if nspace_names:
            namespace_regex = '^(' + \
                "|".join([re.escape(x)
                          for x in set(nspace_names)]) + ')$'
        else:
            namespace_regex = '^(.*)$'

        if 'query' in self.rbd_stats and \
           (pool_id_regex != self.rbd_stats['query']['key_descriptor'][0]['regex'] or
                namespace_regex != self.rbd_stats['query']['key_descriptor'][1]['regex']):
            self.remove_osd_perf_query(self.rbd_stats['query_id'])
            del self.rbd_stats['query_id']
            del self.rbd_stats['query']

        if not self.rbd_stats['pools']:
            return

        counters_info = self.rbd_stats['counters_info']

        if 'query_id' not in self.rbd_stats:
            query = {
                'key_descriptor': [
                    {'type': 'pool_id', 'regex': pool_id_regex},
                    {'type': 'namespace', 'regex': namespace_regex},
                    {'type': 'object_name',
                     'regex': r'^(?:rbd|journal)_data\.(?:([0-9]+)\.)?([^.]+)\.'},
                ],
                'performance_counter_descriptors': list(counters_info),
            }
            query_id = self.add_osd_perf_query(query)
            if query_id is None:
                self.log.error('failed to add query %s' % query)
                return
            self.rbd_stats['query'] = query
            self.rbd_stats['query_id'] = query_id

        res = self.get_osd_perf_counters(self.rbd_stats['query_id'])
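        # each result key c['k'] holds the matched sub-keys in query order
        # (pool_id, namespace, object_name regex groups), and c['c'] holds
        # one [sum, count] pair per performance counter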
        for c in res['counters']:
            # if the pool id is not found in the object name use id of the
            # pool where the object is located
            if c['k'][2][0]:
                pool_id = int(c['k'][2][0])
            else:
                pool_id = int(c['k'][0][0])
            if pool_id not in self.rbd_stats['pools'] and not pools_refreshed:
                self.refresh_rbd_stats_pools(pools)
                pools_refreshed = True
            if pool_id not in self.rbd_stats['pools']:
                continue
            pool = self.rbd_stats['pools'][pool_id]
            nspace_name = c['k'][1][0]
            if nspace_name not in pool['images']:
                continue
            image_id = c['k'][2][1]
            if image_id not in pool['images'][nspace_name] and \
               not pools_refreshed:
                self.refresh_rbd_stats_pools(pools)
                pool = self.rbd_stats['pools'][pool_id]
                pools_refreshed = True
            if image_id not in pool['images'][nspace_name]:
                continue
            counters = pool['images'][nspace_name][image_id]['c']
            for i in range(len(c['c'])):
                counters[i][0] += c['c'][i][0]
                counters[i][1] += c['c'][i][1]

        label_names = ("pool", "namespace", "image")
        for pool_id, pool in self.rbd_stats['pools'].items():
            pool_name = pool['name']
            for nspace_name, images in pool['images'].items():
                for image_id in images:
                    image_name = images[image_id]['n']
                    counters = images[image_id]['c']
                    i = 0
                    for key in counters_info:
                        counter_info = counters_info[key]
                        stattype = self._stattype_to_str(counter_info['type'])
                        labels = (pool_name, nspace_name, image_name)
                        if counter_info['type'] == self.PERFCOUNTER_COUNTER:
                            path = 'rbd_' + key
                            if path not in self.metrics:
                                self.metrics[path] = Metric(
                                    stattype,
                                    path,
                                    counter_info['desc'],
                                    label_names,
                                )
                            self.metrics[path].set(counters[i][0], labels)
                        elif counter_info['type'] == self.PERFCOUNTER_LONGRUNAVG:
                            path = 'rbd_' + key + '_sum'
                            if path not in self.metrics:
                                self.metrics[path] = Metric(
                                    stattype,
                                    path,
                                    counter_info['desc'] + ' Total',
                                    label_names,
                                )
                            self.metrics[path].set(counters[i][0], labels)
                            path = 'rbd_' + key + '_count'
                            if path not in self.metrics:
                                self.metrics[path] = Metric(
                                    'counter',
                                    path,
                                    counter_info['desc'] + ' Count',
                                    label_names,
                                )
                            self.metrics[path].set(counters[i][1], labels)
                        i += 1

    def refresh_rbd_stats_pools(self, pools):
        self.log.debug('refreshing rbd pools %s' % (pools))

        rbd = RBD()
        counters_info = self.rbd_stats['counters_info']
        for pool_name, cfg_ns_names in pools.items():
            try:
                pool_id = self.rados.pool_lookup(pool_name)
                with self.rados.open_ioctx(pool_name) as ioctx:
                    if pool_id not in self.rbd_stats['pools']:
                        self.rbd_stats['pools'][pool_id] = {'images': {}}
                    pool = self.rbd_stats['pools'][pool_id]
                    pool['name'] = pool_name
                    pool['ns_names'] = cfg_ns_names
                    if cfg_ns_names:
                        nspace_names = list(cfg_ns_names)
                    else:
                        nspace_names = [''] + rbd.namespace_list(ioctx)
                    # iterate over a copy since entries may be deleted
                    for nspace_name in list(pool['images']):
                        if nspace_name not in nspace_names:
                            del pool['images'][nspace_name]
                    for nspace_name in nspace_names:
                        if (nspace_name and
                                not rbd.namespace_exists(ioctx, nspace_name)):
                            self.log.debug('unknown namespace %s for pool %s' %
                                           (nspace_name, pool_name))
                            continue
                        ioctx.set_namespace(nspace_name)
                        if nspace_name not in pool['images']:
                            pool['images'][nspace_name] = {}
                        namespace = pool['images'][nspace_name]
                        images = {}
                        for image_meta in RBD().list2(ioctx):
                            image = {'n': image_meta['name']}
                            image_id = image_meta['id']
                            if image_id in namespace:
                                image['c'] = namespace[image_id]['c']
                            else:
                                image['c'] = [[0, 0] for x in counters_info]
                            images[image_id] = image
                        pool['images'][nspace_name] = images
            except Exception as e:
                self.log.error('failed listing pool %s: %s' % (pool_name, e))
        self.rbd_stats['pools_refresh_time'] = time.time()

    def shutdown_rbd_stats(self):
        if 'query_id' in self.rbd_stats:
            self.remove_osd_perf_query(self.rbd_stats['query_id'])
            del self.rbd_stats['query_id']
            del self.rbd_stats['query']
        self.rbd_stats['pools'].clear()

    def collect(self):
        # Clear the metrics before scraping
        for k in self.metrics.keys():
            self.metrics[k].clear()

        self.get_health()
        self.get_df()
        self.get_pool_stats()
        self.get_fs()
        self.get_osd_stats()
        self.get_quorum_status()
        self.get_mgr_status()
        self.get_metadata_and_osd_status()
        self.get_pg_status()
        self.get_num_objects()

        for daemon, counters in self.get_all_perf_counters().items():
            for path, counter_info in counters.items():
                # Skip histograms, they are represented by long running avgs
                stattype = self._stattype_to_str(counter_info['type'])
                if not stattype or stattype == 'histogram':
                    self.log.debug('ignoring %s, type %s' % (path, stattype))
                    continue

                path, label_names, labels = self._perfpath_to_path_labels(
                    daemon, path)

                # Get the value of the counter
                value = self._perfvalue_to_value(
                    counter_info['type'], counter_info['value'])

                # Represent the long running avgs as sum/count pairs
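                # e.g. a long-running average "op_latency" (hypothetical
                # counter name) is exported as the pair op_latency_sum and
                # op_latency_count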
                if counter_info['type'] & self.PERFCOUNTER_LONGRUNAVG:
                    _path = path + '_sum'
                    if _path not in self.metrics:
                        self.metrics[_path] = Metric(
                            stattype,
                            _path,
                            counter_info['description'] + ' Total',
                            label_names,
                        )
                    self.metrics[_path].set(value, labels)

                    _path = path + '_count'
                    if _path not in self.metrics:
                        self.metrics[_path] = Metric(
                            'counter',
                            _path,
                            counter_info['description'] + ' Count',
                            label_names,
                        )
                    self.metrics[_path].set(counter_info['count'], labels)
                else:
                    if path not in self.metrics:
                        self.metrics[path] = Metric(
                            stattype,
                            path,
                            counter_info['description'],
                            label_names,
                        )
                    self.metrics[path].set(value, labels)

        self.get_rbd_stats()

        # Return formatted metrics and clear no longer used data
        _metrics = [m.str_expfmt() for m in self.metrics.values()]
        for k in self.metrics.keys():
            self.metrics[k].clear()

        return ''.join(_metrics) + '\n'

    def get_file_sd_config(self):
        servers = self.list_servers()
        targets = []
        for server in servers:
            hostname = server.get('hostname', '')
            for service in server.get('services', []):
                if service['type'] != 'mgr':
                    continue
                id_ = service['id']
                # get port for prometheus module at mgr with id_
                # TODO use get_config_prefix or get_config here once
                # https://github.com/ceph/ceph/pull/20458 is merged
                result = CommandResult("")
                global_instance().send_command(
                    result, "mon", '',
                    json.dumps({
                        "prefix": "config-key get",
                        'key': "config/mgr/mgr/prometheus/{}/server_port".format(id_),
                    }),
                    "")
                r, outb, outs = result.wait()
                if r != 0:
                    global_instance().log.error(
                        "Failed to retrieve port for mgr {}: {}".format(
                            id_, outs))
                    targets.append('{}:{}'.format(hostname, DEFAULT_PORT))
                else:
                    port = json.loads(outb)
                    targets.append('{}:{}'.format(hostname, port))

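        # the result is a Prometheus file_sd target list, e.g.
        # (illustrative): [{"targets": ["mgr-host:9283"], "labels": {}}]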
        ret = [
            {
                "targets": targets,
                "labels": {}
            }
        ]
        return 0, json.dumps(ret), ""

    def self_test(self):
        self.collect()
        self.get_file_sd_config()

    def handle_command(self, inbuf, cmd):
        if cmd['prefix'] == 'prometheus file_sd_config':
            return self.get_file_sd_config()
        else:
            return (-errno.EINVAL, '',
                    "Command not found '{0}'".format(cmd['prefix']))

    def serve(self):

        class Root(object):

            # collapse everything to '/'
            def _cp_dispatch(self, vpath):
                cherrypy.request.path = ''
                return self

            @cherrypy.expose
            def index(self):
                return '''<!DOCTYPE html>
<html>
    <head><title>Ceph Exporter</title></head>
    <body>
        <h1>Ceph Exporter</h1>
        <p><a href='/metrics'>Metrics</a></p>
    </body>
</html>'''

            @cherrypy.expose
            def metrics(self):
                instance = global_instance()
                # Lock the function execution; 'with' guarantees the lock
                # is released even if _metrics() raises
                with instance.collect_lock:
                    return self._metrics(instance)

            @staticmethod
            def _metrics(instance):
                # Return cached data if available and collected before the
                # cache times out
                if instance.collect_cache and time.time() - instance.collect_time < instance.collect_timeout:
                    cherrypy.response.headers['Content-Type'] = 'text/plain'
                    return instance.collect_cache

                if instance.have_mon_connection():
                    instance.collect_cache = None
                    instance.collect_time = time.time()
                    instance.collect_cache = instance.collect()
                    cherrypy.response.headers['Content-Type'] = 'text/plain'
                    return instance.collect_cache
                else:
                    raise cherrypy.HTTPError(503, 'No MON connection')

        # Make the cache timeout for collecting configurable
        self.collect_timeout = float(self.get_localized_module_option(
            'scrape_interval', 5.0))
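        # since results are cached for 'scrape_interval' seconds, setting
        # it to match the Prometheus scrape interval means each scrape
        # triggers at most one fresh collection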

        server_addr = self.get_localized_module_option(
            'server_addr', get_default_addr())
        server_port = self.get_localized_module_option(
            'server_port', DEFAULT_PORT)
        self.log.info(
            "server_addr: %s server_port: %s" %
            (server_addr, server_port)
        )

        # Publish the URI that others may use to access the service we're
        # about to start serving
        self.set_uri('http://{0}:{1}/'.format(
            socket.getfqdn() if server_addr in ['::', '0.0.0.0'] else server_addr,
            server_port
        ))

        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })
        cherrypy.tree.mount(Root(), "/")
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info('Engine started.')
        # wait for the shutdown event
        self.shutdown_event.wait()
        self.shutdown_event.clear()
        cherrypy.engine.stop()
        self.log.info('Engine stopped.')
        self.shutdown_rbd_stats()

    def shutdown(self):
        self.log.info('Stopping engine...')
        self.shutdown_event.set()


class StandbyModule(MgrStandbyModule):
    def __init__(self, *args, **kwargs):
        super(StandbyModule, self).__init__(*args, **kwargs)
        self.shutdown_event = threading.Event()

    def serve(self):
        server_addr = self.get_localized_module_option(
            'server_addr', get_default_addr())
        server_port = self.get_localized_module_option(
            'server_port', DEFAULT_PORT)
        self.log.info("server_addr: %s server_port: %s" %
                      (server_addr, server_port))
        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })

        module = self
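        # capture the module so the Root controller below can link to the
        # active mgr's URI from the standby landing page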

        class Root(object):
            @cherrypy.expose
            def index(self):
                active_uri = module.get_active_uri()
                return '''<!DOCTYPE html>
<html>
    <head><title>Ceph Exporter</title></head>
    <body>
        <h1>Ceph Exporter</h1>
        <p><a href='{}metrics'>Metrics</a></p>
    </body>
</html>'''.format(active_uri)

            @cherrypy.expose
            def metrics(self):
                cherrypy.response.headers['Content-Type'] = 'text/plain'
                return ''

        cherrypy.tree.mount(Root(), '/', {})
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info('Engine started.')
        # Wait for shutdown event
        self.shutdown_event.wait()
        self.shutdown_event.clear()
        cherrypy.engine.stop()
        self.log.info('Engine stopped.')

    def shutdown(self):
        self.log.info("Stopping engine...")
        self.shutdown_event.set()
        self.log.info("Stopped engine")