]> git.proxmox.com Git - ceph.git/blame - ceph/src/pybind/mgr/prometheus/module.py
bump version to 15.2.6-pve1
[ceph.git] / ceph / src / pybind / mgr / prometheus / module.py
CommitLineData
c07f9fc5 1import cherrypy
a8e16298 2from distutils.version import StrictVersion
3efd9988
FG
3import json
4import errno
c07f9fc5
FG
5import math
6import os
11fdf7f2 7import re
94b18763 8import socket
91327a77
AA
9import threading
10import time
11fdf7f2 11from mgr_module import MgrModule, MgrStandbyModule, CommandResult, PG_STATES
f6b5b4d7 12from mgr_util import get_default_addr, profile_method
11fdf7f2 13from rbd import RBD
f6b5b4d7
TL
try:
    from typing import Optional, Dict, Any, Set
except ImportError:
    # FIX: was a bare `except:`, which also swallows SystemExit and
    # KeyboardInterrupt. Only an ImportError (typing missing on very old
    # Pythons) is expected here; the names are only used in type comments.
    pass
c07f9fc5
FG
18
# Defaults for the Prometheus HTTP server. Can also set in config-key
# see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
# for Prometheus exporter port registry

DEFAULT_PORT = 9283

# When the CherryPy server in 3.2.2 (and later) starts it attempts to verify
# that the ports its listening on are in fact bound. When using the any address
# "::" it tries both ipv4 and ipv6, and in some environments (e.g. kubernetes)
# ipv6 isn't yet configured / supported and CherryPy throws an uncaught
# exception.
if cherrypy is not None:
    v = StrictVersion(cherrypy.__version__)
    # the issue was fixed in 3.2.3. it's present in 3.2.2 (current version on
    # centos:7) and back to at least 3.0.0.
    if StrictVersion("3.1.2") <= v < StrictVersion("3.2.3"):
        # https://github.com/cherrypy/cherrypy/issues/1100
        # Neutralize the port-liveness check entirely; real bind failures
        # still surface when the socket is actually used.
        from cherrypy.process import servers
        servers.wait_for_occupied_port = lambda host, port: None
c07f9fc5 38
9f95a23c 39
# cherrypy likes to sys.exit on error. don't let it take us down too!
def os_exit_noop(*args, **kwargs):
    """No-op replacement for os._exit so library error paths cannot
    terminate the whole ceph-mgr process."""
    pass


# Monkeypatch applied at import time, process-wide.
os._exit = os_exit_noop
46
c07f9fc5
FG
# Reference to the single active Module instance so that class Root (the
# CherryPy handler) and MetricCollectionThread can reach it.
# NOTE(review): the old comment said this was a dict; it is now a plain
# reference, so writers must declare `global _global_instance`.
_global_instance = None  # type: Optional[Module]
52
def health_status_to_number(status):
    """Map a Ceph health status string to its numeric gauge value.

    'HEALTH_OK' -> 0, 'HEALTH_WARN' -> 1, 'HEALTH_ERR' -> 2; any other
    input yields None (same as the original if/elif chain falling through).
    """
    return {'HEALTH_OK': 0, 'HEALTH_WARN': 1, 'HEALTH_ERR': 2}.get(status)
c07f9fc5 60
11fdf7f2
TL
61
# Cluster-wide capacity fields taken from 'df' -> ceph_cluster_* gauges.
DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_used_raw_bytes']

# Per-pool fields taken from 'df' -> ceph_pool_* gauges.
DF_POOL = ['max_avail', 'stored', 'stored_raw', 'objects', 'dirty',
           'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes']

# Per-pool recovery fields from 'osd_pool_stats' -> ceph_pool_* gauges.
# BUG FIX: the last entry was a duplicate 'num_bytes_recovered'; following
# the objects/bytes/keys pattern of the other entries it should be
# 'num_keys_recovered' (get_pool_stats defaults missing stats to 0, so this
# is safe even on clusters that do not report the key counter).
OSD_POOL_STATS = ('recovering_objects_per_sec', 'recovering_bytes_per_sec',
                  'recovering_keys_per_sec', 'num_objects_recovered',
                  'num_bytes_recovered', 'num_keys_recovered')

# Cluster OSD flags, each exported as a 0/1 ceph_osd_flag_* metric.
OSD_FLAGS = ('noup', 'nodown', 'noout', 'noin', 'nobackfill', 'norebalance',
             'norecover', 'noscrub', 'nodeep-scrub')

# Label sets for the *_metadata information metrics below.
FS_METADATA = ('data_pools', 'fs_id', 'metadata_pool', 'name')

MDS_METADATA = ('ceph_daemon', 'fs_id', 'hostname', 'public_addr', 'rank',
                'ceph_version')

MON_METADATA = ('ceph_daemon', 'hostname',
                'public_addr', 'rank', 'ceph_version')

MGR_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

MGR_STATUS = ('ceph_daemon',)

MGR_MODULE_STATUS = ('name',)

MGR_MODULE_CAN_RUN = ('name',)

OSD_METADATA = ('back_iface', 'ceph_daemon', 'cluster_addr', 'device_class',
                'front_iface', 'hostname', 'objectstore', 'public_addr',
                'ceph_version')

OSD_STATUS = ['weight', 'up', 'in']

OSD_STATS = ['apply_latency_ms', 'commit_latency_ms']

POOL_METADATA = ('pool_id', 'name')

RGW_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

RBD_MIRROR_METADATA = ('ceph_daemon', 'id', 'instance_id', 'hostname',
                       'ceph_version')

# Deliberately matches the tag names used by the Prometheus node_exporter
# so the two can be joined in queries.
DISK_OCCUPATION = ('ceph_daemon', 'device', 'db_device',
                   'wal_device', 'instance')

NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']
c07f9fc5 109
c07f9fc5 110
91327a77
AA
class Metric(object):
    """One Prometheus metric family: a type, a name, a help string, an
    optional label-name tuple, and a map of label-value tuples to samples."""

    def __init__(self, mtype, name, desc, labels=None):
        self.mtype = mtype
        self.name = name
        self.desc = desc
        self.labelnames = labels  # tuple if present
        self.value = {}  # samples, keyed by label-value tuple

    def clear(self):
        """Drop every recorded sample."""
        self.value = {}

    def set(self, value, labelvalues=None):
        """Record a sample. `labelvalues` must be a tuple; an unlabeled
        sample is stored under the ('',) key."""
        self.value[labelvalues or ('',)] = value

    def str_expfmt(self):
        """Render this metric in the Prometheus exposition format."""

        def promethize(path):
            '''replace illegal metric name characters'''
            cleaned = re.sub(r'[./\s]|::', '_', path).replace('+', '_plus')
            # Hyphens usually turn into underscores, unless they are trailing
            if cleaned.endswith("-"):
                cleaned = cleaned[:-1] + "_minus"
            else:
                cleaned = cleaned.replace("-", "_")
            return "ceph_{0}".format(cleaned)

        def floatstr(value):
            '''represent as Go-compatible float'''
            if math.isnan(value):
                return 'NaN'
            if math.isinf(value):
                return '+Inf' if value > 0 else '-Inf'
            return repr(float(value))

        name = promethize(self.name)
        lines = [
            '',
            '# HELP {name} {desc}'.format(name=name, desc=self.desc),
            '# TYPE {name} {mtype}'.format(name=name, mtype=self.mtype),
        ]

        for labelvalues, value in self.value.items():
            if self.labelnames:
                pairs = ('%s="%s"' % pair
                         for pair in zip(self.labelnames, labelvalues))
                labels = ','.join(pairs)
            else:
                labels = ''
            if labels:
                sample = '{name}{{{labels}}} {value}'.format(
                    name=name, labels=labels, value=floatstr(value))
            else:
                sample = '{name} {value}'.format(
                    name=name, value=floatstr(value))
            lines.append(sample)
        return '\n'.join(lines)
177
178
f6b5b4d7
TL
class MetricCollectionThread(threading.Thread):
    """Background thread that refreshes the module's metrics cache.

    Runs Module.collect() roughly every `scrape_interval` seconds and
    publishes the result into Module.collect_cache under
    Module.collect_lock, so HTTP scrapes can be served from the cache
    instead of collecting inline.

    NOTE(review): the thread is not a daemon and loops forever; presumably
    mgr process teardown is what stops it -- confirm.
    """

    def __init__(self, module):
        # type: (Module) -> None
        # Keep a reference to the owning Module; all state lives there.
        self.mod = module
        super(MetricCollectionThread, self).__init__(target=self.collect)

    def collect(self):
        """Thread body: collect, publish, sleep, repeat."""
        self.mod.log.info('starting metric collection thread')
        while True:
            self.mod.log.debug('collecting cache in thread')
            if self.mod.have_mon_connection():
                start_time = time.time()
                data = self.mod.collect()
                duration = time.time() - start_time

                self.mod.log.debug('collecting cache in thread done')

                # Sleep only for the remainder of the interval; warn (and
                # don't sleep) if collection itself overran it.
                sleep_time = self.mod.scrape_interval - duration
                if sleep_time < 0:
                    self.mod.log.warning(
                        'Collecting data took more time than configured scrape interval. '
                        'This possibly results in stale data. Please check the '
                        '`stale_cache_strategy` configuration option. '
                        'Collecting data took {:.2f} seconds but scrape interval is configured '
                        'to be {:.0f} seconds.'.format(
                            duration,
                            self.mod.scrape_interval,
                        )
                    )
                    sleep_time = 0

                # Publish the fresh snapshot atomically for the HTTP handler.
                with self.mod.collect_lock:
                    self.mod.collect_cache = data
                    self.mod.collect_time = duration

                time.sleep(sleep_time)
            else:
                self.mod.log.error('No MON connection')
                time.sleep(self.mod.scrape_interval)
219
91327a77
AA
class Module(MgrModule):
    """The 'prometheus' mgr module: collects cluster metrics and serves
    them in the Prometheus exposition format."""

    COMMANDS = [
        {
            "cmd": "prometheus file_sd_config",
            "desc": "Return file_sd compatible prometheus config for mgr cluster",
            "perm": "r"
        },
    ]

    MODULE_OPTIONS = [
        {'name': 'server_addr'},
        {'name': 'server_port'},
        {'name': 'scrape_interval'},
        {'name': 'stale_cache_strategy'},
        {'name': 'rbd_stats_pools'},
        {'name': 'rbd_stats_pools_refresh_interval', 'type': 'int', 'default': 300},
    ]

    # Values for the stale_cache_strategy option; presumably they control
    # whether a scrape that finds an out-of-date cache fails or returns the
    # stale data (serving code not visible here -- confirm).
    STALE_CACHE_FAIL = 'fail'
    STALE_CACHE_RETURN = 'return'
240
91327a77
AA
    def __init__(self, *args, **kwargs):
        """Set up metric storage, RBD stats state, and start the background
        metric collection thread."""
        super(Module, self).__init__(*args, **kwargs)
        self.metrics = self._setup_static_metrics()
        self.shutdown_event = threading.Event()
        # Guards collect_cache / collect_time, which are written by
        # MetricCollectionThread and read by the HTTP handler.
        self.collect_lock = threading.Lock()
        self.collect_time = 0.0
        # Defaults; presumably overridden from module options elsewhere --
        # confirm against the serving code.
        self.scrape_interval = 15.0
        self.stale_cache_strategy = self.STALE_CACHE_FAIL
        self.collect_cache = None
        # State for per-RBD-image stats (see get_rbd_stats):
        #   pools: pool_id -> {'name', 'ns_names', 'images'}
        #   counters_info: perf counters requested from the OSDs
        self.rbd_stats = {
            'pools': {},
            'pools_refresh_time': 0,
            'counters_info': {
                'write_ops': {'type': self.PERFCOUNTER_COUNTER,
                              'desc': 'RBD image writes count'},
                'read_ops': {'type': self.PERFCOUNTER_COUNTER,
                             'desc': 'RBD image reads count'},
                'write_bytes': {'type': self.PERFCOUNTER_COUNTER,
                                'desc': 'RBD image bytes written'},
                'read_bytes': {'type': self.PERFCOUNTER_COUNTER,
                               'desc': 'RBD image bytes read'},
                'write_latency': {'type': self.PERFCOUNTER_LONGRUNAVG,
                                  'desc': 'RBD image writes latency (msec)'},
                'read_latency': {'type': self.PERFCOUNTER_LONGRUNAVG,
                                 'desc': 'RBD image reads latency (msec)'},
            },
        }  # type: Dict[str, Any]
        # Publish the instance BEFORE starting the collection thread, which
        # reaches back into this module through it.
        global _global_instance
        _global_instance = self
        MetricCollectionThread(_global_instance).start()
3efd9988
FG
271
272 def _setup_static_metrics(self):
273 metrics = {}
274 metrics['health_status'] = Metric(
275 'untyped',
276 'health_status',
277 'Cluster health status'
278 )
94b18763 279 metrics['mon_quorum_status'] = Metric(
3efd9988 280 'gauge',
94b18763
FG
281 'mon_quorum_status',
282 'Monitors in quorum',
283 ('ceph_daemon',)
284 )
285 metrics['fs_metadata'] = Metric(
286 'untyped',
287 'fs_metadata',
288 'FS Metadata',
289 FS_METADATA
290 )
291 metrics['mds_metadata'] = Metric(
292 'untyped',
293 'mds_metadata',
294 'MDS Metadata',
295 MDS_METADATA
296 )
297 metrics['mon_metadata'] = Metric(
298 'untyped',
299 'mon_metadata',
300 'MON Metadata',
301 MON_METADATA
3efd9988 302 )
494da23a
TL
303 metrics['mgr_metadata'] = Metric(
304 'gauge',
305 'mgr_metadata',
306 'MGR metadata',
307 MGR_METADATA
308 )
309 metrics['mgr_status'] = Metric(
310 'gauge',
311 'mgr_status',
312 'MGR status (0=standby, 1=active)',
313 MGR_STATUS
314 )
315 metrics['mgr_module_status'] = Metric(
316 'gauge',
317 'mgr_module_status',
318 'MGR module status (0=disabled, 1=enabled, 2=auto-enabled)',
319 MGR_MODULE_STATUS
320 )
321 metrics['mgr_module_can_run'] = Metric(
322 'gauge',
323 'mgr_module_can_run',
324 'MGR module runnable state i.e. can it run (0=no, 1=yes)',
325 MGR_MODULE_CAN_RUN
326 )
3efd9988
FG
327 metrics['osd_metadata'] = Metric(
328 'untyped',
329 'osd_metadata',
330 'OSD Metadata',
331 OSD_METADATA
332 )
c07f9fc5 333
3efd9988
FG
334 # The reason for having this separate to OSD_METADATA is
335 # so that we can stably use the same tag names that
336 # the Prometheus node_exporter does
337 metrics['disk_occupation'] = Metric(
b32b8144 338 'untyped',
3efd9988
FG
339 'disk_occupation',
340 'Associate Ceph daemon with disk used',
341 DISK_OCCUPATION
342 )
c07f9fc5 343
3efd9988
FG
344 metrics['pool_metadata'] = Metric(
345 'untyped',
346 'pool_metadata',
347 'POOL Metadata',
348 POOL_METADATA
349 )
94b18763
FG
350
351 metrics['rgw_metadata'] = Metric(
352 'untyped',
353 'rgw_metadata',
354 'RGW Metadata',
355 RGW_METADATA
356 )
357
11fdf7f2
TL
358 metrics['rbd_mirror_metadata'] = Metric(
359 'untyped',
360 'rbd_mirror_metadata',
361 'RBD Mirror Metadata',
362 RBD_MIRROR_METADATA
363 )
364
94b18763
FG
365 metrics['pg_total'] = Metric(
366 'gauge',
367 'pg_total',
92f5a8d4
TL
368 'PG Total Count per Pool',
369 ('pool_id',)
94b18763
FG
370 )
371
372 for flag in OSD_FLAGS:
373 path = 'osd_flag_{}'.format(flag)
374 metrics[path] = Metric(
375 'untyped',
376 path,
377 'OSD Flag {}'.format(flag)
378 )
3efd9988
FG
379 for state in OSD_STATUS:
380 path = 'osd_{}'.format(state)
3efd9988
FG
381 metrics[path] = Metric(
382 'untyped',
c07f9fc5 383 path,
3efd9988
FG
384 'OSD status {}'.format(state),
385 ('ceph_daemon',)
c07f9fc5 386 )
b32b8144
FG
387 for stat in OSD_STATS:
388 path = 'osd_{}'.format(stat)
b32b8144
FG
389 metrics[path] = Metric(
390 'gauge',
391 path,
392 'OSD stat {}'.format(stat),
393 ('ceph_daemon',)
394 )
11fdf7f2
TL
395 for stat in OSD_POOL_STATS:
396 path = 'pool_{}'.format(stat)
397 metrics[path] = Metric(
398 'gauge',
399 path,
9f95a23c 400 "OSD pool stats: {}".format(stat),
11fdf7f2
TL
401 ('pool_id',)
402 )
3efd9988
FG
403 for state in PG_STATES:
404 path = 'pg_{}'.format(state)
3efd9988
FG
405 metrics[path] = Metric(
406 'gauge',
407 path,
92f5a8d4
TL
408 'PG {} per pool'.format(state),
409 ('pool_id',)
3efd9988
FG
410 )
411 for state in DF_CLUSTER:
412 path = 'cluster_{}'.format(state)
3efd9988
FG
413 metrics[path] = Metric(
414 'gauge',
415 path,
416 'DF {}'.format(state),
417 )
418 for state in DF_POOL:
419 path = 'pool_{}'.format(state)
3efd9988
FG
420 metrics[path] = Metric(
421 'gauge',
422 path,
423 'DF pool {}'.format(state),
424 ('pool_id',)
425 )
28e407b8
AA
426 for state in NUM_OBJECTS:
427 path = 'num_objects_{}'.format(state)
428 metrics[path] = Metric(
429 'gauge',
430 path,
431 'Number of {} objects'.format(state),
432 )
3efd9988
FG
433
434 return metrics
c07f9fc5 435
f6b5b4d7 436 @profile_method()
3efd9988
FG
437 def get_health(self):
438 health = json.loads(self.get('health')['json'])
91327a77
AA
439 self.metrics['health_status'].set(
440 health_status_to_number(health['status'])
c07f9fc5
FG
441 )
442
f6b5b4d7 443 @profile_method()
11fdf7f2
TL
444 def get_pool_stats(self):
445 # retrieve pool stats to provide per pool recovery metrics
446 # (osd_pool_stats moved to mgr in Mimic)
447 pstats = self.get('osd_pool_stats')
448 for pool in pstats['pool_stats']:
449 for stat in OSD_POOL_STATS:
450 self.metrics['pool_{}'.format(stat)].set(
451 pool['recovery_rate'].get(stat, 0),
452 (pool['pool_id'],)
453 )
454
f6b5b4d7 455 @profile_method()
3efd9988
FG
456 def get_df(self):
457 # maybe get the to-be-exported metrics from a config?
458 df = self.get('df')
459 for stat in DF_CLUSTER:
91327a77 460 self.metrics['cluster_{}'.format(stat)].set(df['stats'][stat])
3efd9988
FG
461
462 for pool in df['pools']:
463 for stat in DF_POOL:
91327a77
AA
464 self.metrics['pool_{}'.format(stat)].set(
465 pool['stats'][stat],
466 (pool['id'],)
467 )
94b18763 468
    @profile_method()
    def get_fs(self):
        """Export CephFS filesystem metadata and MDS daemon metadata
        (active ranks and standbys)."""
        fs_map = self.get('fs_map')
        servers = self.get_service_list()
        self.log.debug('standbys: {}'.format(fs_map['standbys']))
        # export standby mds metadata, default standby fs_id is '-1'
        for standby in fs_map['standbys']:
            id_ = standby['name']
            host_version = servers.get((id_, 'mds'), ('', ''))
            self.metrics['mds_metadata'].set(1, (
                'mds.{}'.format(id_), '-1',
                host_version[0], standby['addr'],
                standby['rank'], host_version[1]
            ))
        for fs in fs_map['filesystems']:
            # collect fs metadata; data pool ids joined into one label value
            data_pools = ",".join([str(pool)
                                   for pool in fs['mdsmap']['data_pools']])
            self.metrics['fs_metadata'].set(1, (
                data_pools,
                fs['id'],
                fs['mdsmap']['metadata_pool'],
                fs['mdsmap']['fs_name']
            ))
            self.log.debug('mdsmap: {}'.format(fs['mdsmap']))
            # one mds_metadata sample per active MDS rank in this filesystem
            for gid, daemon in fs['mdsmap']['info'].items():
                id_ = daemon['name']
                host_version = servers.get((id_, 'mds'), ('', ''))
                self.metrics['mds_metadata'].set(1, (
                    'mds.{}'.format(id_), fs['id'],
                    host_version[0], daemon['addr'],
                    daemon['rank'], host_version[1]
                ))
3efd9988 502
f6b5b4d7 503 @profile_method()
3efd9988
FG
504 def get_quorum_status(self):
505 mon_status = json.loads(self.get('mon_status')['json'])
94b18763
FG
506 servers = self.get_service_list()
507 for mon in mon_status['monmap']['mons']:
508 rank = mon['rank']
509 id_ = mon['name']
11fdf7f2 510 host_version = servers.get((id_, 'mon'), ('', ''))
91327a77
AA
511 self.metrics['mon_metadata'].set(1, (
512 'mon.{}'.format(id_), host_version[0],
513 mon['public_addr'].split(':')[0], rank,
514 host_version[1]
515 ))
94b18763 516 in_quorum = int(rank in mon_status['quorum'])
91327a77
AA
517 self.metrics['mon_quorum_status'].set(in_quorum, (
518 'mon.{}'.format(id_),
519 ))
3efd9988 520
    @profile_method()
    def get_mgr_status(self):
        """Export mgr daemon metadata/active-state and per-module
        status/can-run gauges."""
        mgr_map = self.get('mgr_map')
        servers = self.get_service_list()

        active = mgr_map['active_name']
        standbys = [s.get('name') for s in mgr_map['standbys']]

        all_mgrs = list(standbys)
        all_mgrs.append(active)

        # module name -> can_run flag, for every module the mgr knows about
        all_modules = {module.get('name'): module.get('can_run')
                       for module in mgr_map['available_modules']}

        ceph_release = None
        for mgr in all_mgrs:
            host_version = servers.get((mgr, 'mgr'), ('', ''))
            if mgr == active:
                _state = 1
                # NOTE(review): if the active mgr is missing from the server
                # list, host_version[1] is '' and ''.split()[-2] raises
                # IndexError -- confirm this cannot happen in practice.
                ceph_release = host_version[1].split()[-2]  # e.g. nautilus
            else:
                _state = 0

            self.metrics['mgr_metadata'].set(1, (
                'mgr.{}'.format(mgr), host_version[0],
                host_version[1]
            ))
            self.metrics['mgr_status'].set(_state, (
                'mgr.{}'.format(mgr),
            ))
        # always-on modules depend on the release of the active mgr
        always_on_modules = mgr_map['always_on_modules'].get(ceph_release, [])
        active_modules = list(always_on_modules)
        active_modules.extend(mgr_map['modules'])

        for mod_name in all_modules.keys():
            # 2 = auto-enabled (always-on), 1 = enabled, 0 = disabled
            if mod_name in always_on_modules:
                _state = 2
            elif mod_name in active_modules:
                _state = 1
            else:
                _state = 0

            _can_run = 1 if all_modules[mod_name] else 0
            self.metrics['mgr_module_status'].set(_state, (mod_name,))
            self.metrics['mgr_module_can_run'].set(_can_run, (mod_name,))
566
    @profile_method()
    def get_pg_status(self):
        """Export per-pool PG counts broken down by PG state, plus a
        per-pool total."""
        pg_summary = self.get('pg_summary')

        for pool in pg_summary['by_pool']:
            # seed every known state with 0 so gauges reset between scrapes
            num_by_state = dict((state, 0) for state in PG_STATES)
            num_by_state['total'] = 0

            for state_name, count in pg_summary['by_pool'][pool].items():
                # compound states like 'active+clean' count toward each part
                for state in state_name.split('+'):
                    num_by_state[state] += count
                # ...but each PG is counted only once in the total
                num_by_state['total'] += count

            for state, num in num_by_state.items():
                try:
                    self.metrics["pg_{}".format(state)].set(num, (pool,))
                except KeyError:
                    self.log.warning("skipping pg in unknown state {}".format(state))
b32b8144 586
f6b5b4d7 587 @profile_method()
b32b8144
FG
588 def get_osd_stats(self):
589 osd_stats = self.get('osd_stats')
590 for osd in osd_stats['osd_stats']:
591 id_ = osd['osd']
592 for stat in OSD_STATS:
94b18763 593 val = osd['perf_stat'][stat]
91327a77
AA
594 self.metrics['osd_{}'.format(stat)].set(val, (
595 'osd.{}'.format(id_),
596 ))
94b18763
FG
597
598 def get_service_list(self):
599 ret = {}
600 for server in self.list_servers():
601 version = server.get('ceph_version', '')
602 host = server.get('hostname', '')
603 for service in server.get('services', []):
604 ret.update({(service['id'], service['type']): (host, version)})
605 return ret
3efd9988 606
f6b5b4d7 607 @profile_method()
3efd9988
FG
608 def get_metadata_and_osd_status(self):
609 osd_map = self.get('osd_map')
94b18763
FG
610 osd_flags = osd_map['flags'].split(',')
611 for flag in OSD_FLAGS:
91327a77
AA
612 self.metrics['osd_flag_{}'.format(flag)].set(
613 int(flag in osd_flags)
614 )
94b18763 615
3efd9988 616 osd_devices = self.get('osd_map_crush')['devices']
94b18763 617 servers = self.get_service_list()
3efd9988 618 for osd in osd_map['osds']:
94b18763 619 # id can be used to link osd metrics and metadata
3efd9988 620 id_ = osd['osd']
94b18763 621 # collect osd metadata
3efd9988
FG
622 p_addr = osd['public_addr'].split(':')[0]
623 c_addr = osd['cluster_addr'].split(':')[0]
94b18763
FG
624 if p_addr == "-" or c_addr == "-":
625 self.log.info(
626 "Missing address metadata for osd {0}, skipping occupation"
627 " and metadata records for this osd".format(id_)
628 )
629 continue
630
631 dev_class = None
632 for osd_device in osd_devices:
633 if osd_device['id'] == id_:
634 dev_class = osd_device.get('class', '')
635 break
636
637 if dev_class is None:
9f95a23c
TL
638 self.log.info("OSD {0} is missing from CRUSH map, "
639 "skipping output".format(id_))
94b18763
FG
640 continue
641
11fdf7f2 642 host_version = servers.get((str(id_), 'osd'), ('', ''))
94b18763 643
a8e16298
TL
644 # collect disk occupation metadata
645 osd_metadata = self.get_metadata("osd", str(id_))
646 if osd_metadata is None:
647 continue
648
649 obj_store = osd_metadata.get('osd_objectstore', '')
650 f_iface = osd_metadata.get('front_iface', '')
651 b_iface = osd_metadata.get('back_iface', '')
652
91327a77 653 self.metrics['osd_metadata'].set(1, (
a8e16298 654 b_iface,
28e407b8 655 'osd.{}'.format(id_),
3efd9988 656 c_addr,
94b18763 657 dev_class,
a8e16298 658 f_iface,
28e407b8 659 host_version[0],
a8e16298
TL
660 obj_store,
661 p_addr,
662 host_version[1]
3efd9988 663 ))
94b18763
FG
664
665 # collect osd status
3efd9988
FG
666 for state in OSD_STATUS:
667 status = osd[state]
91327a77
AA
668 self.metrics['osd_{}'.format(state)].set(status, (
669 'osd.{}'.format(id_),
670 ))
3efd9988 671
92f5a8d4 672 osd_dev_node = None
a8e16298 673 if obj_store == "filestore":
11fdf7f2
TL
674 # collect filestore backend device
675 osd_dev_node = osd_metadata.get(
676 'backend_filestore_dev_node', None)
677 # collect filestore journal device
f64942e4
AA
678 osd_wal_dev_node = osd_metadata.get('osd_journal', '')
679 osd_db_dev_node = ''
a8e16298 680 elif obj_store == "bluestore":
11fdf7f2
TL
681 # collect bluestore backend device
682 osd_dev_node = osd_metadata.get(
683 'bluestore_bdev_dev_node', None)
684 # collect bluestore wal backend
f64942e4 685 osd_wal_dev_node = osd_metadata.get('bluefs_wal_dev_node', '')
11fdf7f2 686 # collect bluestore db backend
f64942e4
AA
687 osd_db_dev_node = osd_metadata.get('bluefs_db_dev_node', '')
688 if osd_dev_node and osd_dev_node == "unknown":
689 osd_dev_node = None
690
3efd9988
FG
691 osd_hostname = osd_metadata.get('hostname', None)
692 if osd_dev_node and osd_hostname:
693 self.log.debug("Got dev for osd {0}: {1}/{2}".format(
694 id_, osd_hostname, osd_dev_node))
91327a77 695 self.metrics['disk_occupation'].set(1, (
28e407b8 696 "osd.{0}".format(id_),
3efd9988 697 osd_dev_node,
f64942e4
AA
698 osd_db_dev_node,
699 osd_wal_dev_node,
28e407b8 700 osd_hostname
3efd9988
FG
701 ))
702 else:
703 self.log.info("Missing dev node metadata for osd {0}, skipping "
11fdf7f2 704 "occupation record for this osd".format(id_))
3efd9988
FG
705
706 for pool in osd_map['pools']:
11fdf7f2
TL
707 self.metrics['pool_metadata'].set(
708 1, (pool['pool'], pool['pool_name']))
94b18763 709
11fdf7f2 710 # Populate other servers metadata
94b18763
FG
711 for key, value in servers.items():
712 service_id, service_type = key
11fdf7f2
TL
713 if service_type == 'rgw':
714 hostname, version = value
715 self.metrics['rgw_metadata'].set(
716 1,
9f95a23c
TL
717 ('{}.{}'.format(service_type, service_id),
718 hostname, version)
11fdf7f2
TL
719 )
720 elif service_type == 'rbd-mirror':
721 mirror_metadata = self.get_metadata('rbd-mirror', service_id)
722 if mirror_metadata is None:
723 continue
724 mirror_metadata['ceph_daemon'] = '{}.{}'.format(service_type,
725 service_id)
726 self.metrics['rbd_mirror_metadata'].set(
727 1, (mirror_metadata.get(k, '')
728 for k in RBD_MIRROR_METADATA)
729 )
3efd9988 730
f6b5b4d7 731 @profile_method()
28e407b8
AA
732 def get_num_objects(self):
733 pg_sum = self.get('pg_summary')['pg_stats_sum']['stat_sum']
734 for obj in NUM_OBJECTS:
735 stat = 'num_objects_{}'.format(obj)
91327a77 736 self.metrics[stat].set(pg_sum[stat])
28e407b8 737
f6b5b4d7 738 @profile_method()
11fdf7f2
TL
739 def get_rbd_stats(self):
740 # Per RBD image stats is collected by registering a dynamic osd perf
741 # stats query that tells OSDs to group stats for requests associated
742 # with RBD objects by pool, namespace, and image id, which are
743 # extracted from the request object names or other attributes.
744 # The RBD object names have the following prefixes:
745 # - rbd_data.{image_id}. (data stored in the same pool as metadata)
746 # - rbd_data.{pool_id}.{image_id}. (data stored in a dedicated data pool)
747 # - journal_data.{pool_id}.{image_id}. (journal if journaling is enabled)
748 # The pool_id in the object name is the id of the pool with the image
749 # metdata, and should be used in the image spec. If there is no pool_id
750 # in the object name, the image pool is the pool where the object is
751 # located.
752
753 # Parse rbd_stats_pools option, which is a comma or space separated
754 # list of pool[/namespace] entries. If no namespace is specifed the
f6b5b4d7
TL
755 # stats are collected for every namespace in the pool. The wildcard
756 # '*' can be used to indicate all pools or namespaces
11fdf7f2 757 pools_string = self.get_localized_module_option('rbd_stats_pools', '')
f6b5b4d7
TL
758 pool_keys = []
759 for x in re.split('[\s,]+', pools_string):
760 if not x:
761 continue
762
763 s = x.split('/', 2)
11fdf7f2 764 pool_name = s[0]
f6b5b4d7
TL
765 namespace_name = None
766 if len(s) == 2:
767 namespace_name = s[1]
768
769 if pool_name == "*":
770 # collect for all pools
771 osd_map = self.get('osd_map')
772 for pool in osd_map['pools']:
773 if 'rbd' not in pool.get('application_metadata', {}):
774 continue
775 pool_keys.append((pool['pool_name'], namespace_name))
776 else:
777 pool_keys.append((pool_name, namespace_name))
778
779 pools = {} # type: Dict[str, Set[str]]
780 for pool_key in pool_keys:
781 pool_name = pool_key[0]
782 namespace_name = pool_key[1]
783 if not namespace_name or namespace_name == "*":
11fdf7f2
TL
784 # empty set means collect for all namespaces
785 pools[pool_name] = set()
786 continue
f6b5b4d7 787
11fdf7f2
TL
788 if pool_name not in pools:
789 pools[pool_name] = set()
790 elif not pools[pool_name]:
791 continue
f6b5b4d7 792 pools[pool_name].add(namespace_name)
11fdf7f2
TL
793
794 rbd_stats_pools = {}
f6b5b4d7 795 for pool_id in self.rbd_stats['pools'].keys():
11fdf7f2
TL
796 name = self.rbd_stats['pools'][pool_id]['name']
797 if name not in pools:
798 del self.rbd_stats['pools'][pool_id]
799 else:
800 rbd_stats_pools[name] = \
801 self.rbd_stats['pools'][pool_id]['ns_names']
802
803 pools_refreshed = False
804 if pools:
805 next_refresh = self.rbd_stats['pools_refresh_time'] + \
806 self.get_localized_module_option(
807 'rbd_stats_pools_refresh_interval', 300)
808 if rbd_stats_pools != pools or time.time() >= next_refresh:
809 self.refresh_rbd_stats_pools(pools)
810 pools_refreshed = True
811
812 pool_ids = list(self.rbd_stats['pools'])
813 pool_ids.sort()
814 pool_id_regex = '^(' + '|'.join([str(x) for x in pool_ids]) + ')$'
815
816 nspace_names = []
817 for pool_id, pool in self.rbd_stats['pools'].items():
818 if pool['ns_names']:
819 nspace_names.extend(pool['ns_names'])
820 else:
821 nspace_names = []
822 break
823 if nspace_names:
824 namespace_regex = '^(' + \
825 "|".join([re.escape(x)
826 for x in set(nspace_names)]) + ')$'
827 else:
828 namespace_regex = '^(.*)$'
829
830 if 'query' in self.rbd_stats and \
831 (pool_id_regex != self.rbd_stats['query']['key_descriptor'][0]['regex'] or
832 namespace_regex != self.rbd_stats['query']['key_descriptor'][1]['regex']):
833 self.remove_osd_perf_query(self.rbd_stats['query_id'])
834 del self.rbd_stats['query_id']
835 del self.rbd_stats['query']
836
837 if not self.rbd_stats['pools']:
838 return
839
840 counters_info = self.rbd_stats['counters_info']
841
842 if 'query_id' not in self.rbd_stats:
843 query = {
844 'key_descriptor': [
845 {'type': 'pool_id', 'regex': pool_id_regex},
846 {'type': 'namespace', 'regex': namespace_regex},
847 {'type': 'object_name',
848 'regex': '^(?:rbd|journal)_data\.(?:([0-9]+)\.)?([^.]+)\.'},
849 ],
850 'performance_counter_descriptors': list(counters_info),
851 }
852 query_id = self.add_osd_perf_query(query)
853 if query_id is None:
854 self.log.error('failed to add query %s' % query)
855 return
856 self.rbd_stats['query'] = query
857 self.rbd_stats['query_id'] = query_id
858
859 res = self.get_osd_perf_counters(self.rbd_stats['query_id'])
860 for c in res['counters']:
861 # if the pool id is not found in the object name use id of the
862 # pool where the object is located
863 if c['k'][2][0]:
864 pool_id = int(c['k'][2][0])
865 else:
866 pool_id = int(c['k'][0][0])
867 if pool_id not in self.rbd_stats['pools'] and not pools_refreshed:
868 self.refresh_rbd_stats_pools(pools)
869 pools_refreshed = True
870 if pool_id not in self.rbd_stats['pools']:
871 continue
872 pool = self.rbd_stats['pools'][pool_id]
873 nspace_name = c['k'][1][0]
874 if nspace_name not in pool['images']:
875 continue
876 image_id = c['k'][2][1]
877 if image_id not in pool['images'][nspace_name] and \
878 not pools_refreshed:
879 self.refresh_rbd_stats_pools(pools)
880 pool = self.rbd_stats['pools'][pool_id]
881 pools_refreshed = True
882 if image_id not in pool['images'][nspace_name]:
883 continue
884 counters = pool['images'][nspace_name][image_id]['c']
885 for i in range(len(c['c'])):
886 counters[i][0] += c['c'][i][0]
887 counters[i][1] += c['c'][i][1]
888
889 label_names = ("pool", "namespace", "image")
890 for pool_id, pool in self.rbd_stats['pools'].items():
891 pool_name = pool['name']
892 for nspace_name, images in pool['images'].items():
893 for image_id in images:
894 image_name = images[image_id]['n']
895 counters = images[image_id]['c']
896 i = 0
897 for key in counters_info:
898 counter_info = counters_info[key]
899 stattype = self._stattype_to_str(counter_info['type'])
900 labels = (pool_name, nspace_name, image_name)
901 if counter_info['type'] == self.PERFCOUNTER_COUNTER:
902 path = 'rbd_' + key
903 if path not in self.metrics:
904 self.metrics[path] = Metric(
905 stattype,
906 path,
907 counter_info['desc'],
908 label_names,
909 )
910 self.metrics[path].set(counters[i][0], labels)
911 elif counter_info['type'] == self.PERFCOUNTER_LONGRUNAVG:
912 path = 'rbd_' + key + '_sum'
913 if path not in self.metrics:
914 self.metrics[path] = Metric(
915 stattype,
916 path,
917 counter_info['desc'] + ' Total',
918 label_names,
919 )
920 self.metrics[path].set(counters[i][0], labels)
921 path = 'rbd_' + key + '_count'
922 if path not in self.metrics:
923 self.metrics[path] = Metric(
924 'counter',
925 path,
926 counter_info['desc'] + ' Count',
927 label_names,
928 )
929 self.metrics[path].set(counters[i][1], labels)
930 i += 1
931
932 def refresh_rbd_stats_pools(self, pools):
933 self.log.debug('refreshing rbd pools %s' % (pools))
934
935 rbd = RBD()
936 counters_info = self.rbd_stats['counters_info']
937 for pool_name, cfg_ns_names in pools.items():
938 try:
939 pool_id = self.rados.pool_lookup(pool_name)
940 with self.rados.open_ioctx(pool_name) as ioctx:
941 if pool_id not in self.rbd_stats['pools']:
942 self.rbd_stats['pools'][pool_id] = {'images': {}}
943 pool = self.rbd_stats['pools'][pool_id]
944 pool['name'] = pool_name
945 pool['ns_names'] = cfg_ns_names
946 if cfg_ns_names:
947 nspace_names = list(cfg_ns_names)
948 else:
949 nspace_names = [''] + rbd.namespace_list(ioctx)
950 for nspace_name in pool['images']:
951 if nspace_name not in nspace_names:
952 del pool['images'][nspace_name]
953 for nspace_name in nspace_names:
954 if (nspace_name and
955 not rbd.namespace_exists(ioctx, nspace_name)):
956 self.log.debug('unknown namespace %s for pool %s' %
957 (nspace_name, pool_name))
958 continue
959 ioctx.set_namespace(nspace_name)
960 if nspace_name not in pool['images']:
961 pool['images'][nspace_name] = {}
962 namespace = pool['images'][nspace_name]
963 images = {}
964 for image_meta in RBD().list2(ioctx):
965 image = {'n': image_meta['name']}
966 image_id = image_meta['id']
967 if image_id in namespace:
968 image['c'] = namespace[image_id]['c']
969 else:
970 image['c'] = [[0, 0] for x in counters_info]
971 images[image_id] = image
972 pool['images'][nspace_name] = images
973 except Exception as e:
974 self.log.error('failed listing pool %s: %s' % (pool_name, e))
975 self.rbd_stats['pools_refresh_time'] = time.time()
976
977 def shutdown_rbd_stats(self):
978 if 'query_id' in self.rbd_stats:
979 self.remove_osd_perf_query(self.rbd_stats['query_id'])
980 del self.rbd_stats['query_id']
981 del self.rbd_stats['query']
982 self.rbd_stats['pools'].clear()
983
e306af50
TL
984 def add_fixed_name_metrics(self):
985 """
986 Add fixed name metrics from existing ones that have details in their names
987 that should be in labels (not in name).
988 For backward compatibility, a new fixed name metric is created (instead of replacing)
989 and details are put in new labels.
990 Intended for RGW sync perf. counters but extendable as required.
991 See: https://tracker.ceph.com/issues/45311
992 """
993 new_metrics = {}
994 for metric_path in self.metrics.keys():
995 # Address RGW sync perf. counters.
996 match = re.search('^data-sync-from-(.*)\.', metric_path)
997 if match:
998 new_path = re.sub('from-([^.]*)', 'from-zone', metric_path)
999 if new_path not in new_metrics:
1000 new_metrics[new_path] = Metric(
1001 self.metrics[metric_path].mtype,
1002 new_path,
1003 self.metrics[metric_path].desc,
1004 self.metrics[metric_path].labelnames + ('source_zone',)
1005 )
1006 for label_values, value in self.metrics[metric_path].value.items():
1007 new_metrics[new_path].set(value, label_values + (match.group(1),))
1008
1009 self.metrics.update(new_metrics)
1010
    @profile_method(True)
    def collect(self):
        """Gather all metric families and render the full scrape payload.

        Runs the cluster-level collectors, converts every daemon perf
        counter into a Metric, then returns the whole set formatted in
        Prometheus exposition format as one string.
        """
        # Clear the metrics before scraping
        for k in self.metrics.keys():
            self.metrics[k].clear()

        # Cluster-level collectors (health, df, pools, fs, osd, quorum, mgr,
        # metadata, pg state, object counts).
        self.get_health()
        self.get_df()
        self.get_pool_stats()
        self.get_fs()
        self.get_osd_stats()
        self.get_quorum_status()
        self.get_mgr_status()
        self.get_metadata_and_osd_status()
        self.get_pg_status()
        self.get_num_objects()

        # Per-daemon perf counters.
        for daemon, counters in self.get_all_perf_counters().items():
            for path, counter_info in counters.items():
                # Skip histograms, they are represented by long running avgs
                stattype = self._stattype_to_str(counter_info['type'])
                if not stattype or stattype == 'histogram':
                    self.log.debug('ignoring %s, type %s' % (path, stattype))
                    continue

                path, label_names, labels = self._perfpath_to_path_labels(
                    daemon, path)

                # Get the value of the counter
                value = self._perfvalue_to_value(
                    counter_info['type'], counter_info['value'])

                # Represent the long running avgs as sum/count pairs
                if counter_info['type'] & self.PERFCOUNTER_LONGRUNAVG:
                    _path = path + '_sum'
                    if _path not in self.metrics:
                        self.metrics[_path] = Metric(
                            stattype,
                            _path,
                            counter_info['description'] + ' Total',
                            label_names,
                        )
                    self.metrics[_path].set(value, labels)

                    _path = path + '_count'
                    if _path not in self.metrics:
                        self.metrics[_path] = Metric(
                            'counter',
                            _path,
                            counter_info['description'] + ' Count',
                            label_names,
                        )
                    self.metrics[_path].set(counter_info['count'], labels,)
                else:
                    if path not in self.metrics:
                        self.metrics[path] = Metric(
                            stattype,
                            path,
                            counter_info['description'],
                            label_names,
                        )
                    self.metrics[path].set(value, labels)

        # Duplicate per-zone RGW sync counters under fixed names (labels
        # carry the zone), then collect RBD per-image stats.
        self.add_fixed_name_metrics()
        self.get_rbd_stats()

        # Return formatted metrics and clear no longer used data
        _metrics = [m.str_expfmt() for m in self.metrics.values()]
        for k in self.metrics.keys():
            self.metrics[k].clear()

        return ''.join(_metrics) + '\n'
c07f9fc5 1083
11fdf7f2
TL
1084 def get_file_sd_config(self):
1085 servers = self.list_servers()
1086 targets = []
1087 for server in servers:
1088 hostname = server.get('hostname', '')
1089 for service in server.get('services', []):
1090 if service['type'] != 'mgr':
1091 continue
1092 id_ = service['id']
1093 # get port for prometheus module at mgr with id_
1094 # TODO use get_config_prefix or get_config here once
1095 # https://github.com/ceph/ceph/pull/20458 is merged
1096 result = CommandResult("")
f6b5b4d7
TL
1097 assert isinstance(_global_instance, Module)
1098 _global_instance.send_command(
11fdf7f2
TL
1099 result, "mon", '',
1100 json.dumps({
1101 "prefix": "config-key get",
1102 'key': "config/mgr/mgr/prometheus/{}/server_port".format(id_),
1103 }),
1104 "")
1105 r, outb, outs = result.wait()
1106 if r != 0:
f6b5b4d7 1107 _global_instance.log.error("Failed to retrieve port for mgr {}: {}".format(id_, outs))
11fdf7f2
TL
1108 targets.append('{}:{}'.format(hostname, DEFAULT_PORT))
1109 else:
1110 port = json.loads(outb)
1111 targets.append('{}:{}'.format(hostname, port))
1112
1113 ret = [
1114 {
1115 "targets": targets,
1116 "labels": {}
1117 }
1118 ]
1119 return 0, json.dumps(ret), ""
1120
    def self_test(self):
        """Smoke-test the module: run a full collect pass and generate the
        file_sd config; any failure propagates as an exception."""
        self.collect()
        self.get_file_sd_config()
1124
1125 def handle_command(self, inbuf, cmd):
1126 if cmd['prefix'] == 'prometheus file_sd_config':
1127 return self.get_file_sd_config()
3efd9988
FG
1128 else:
1129 return (-errno.EINVAL, '',
1130 "Command not found '{0}'".format(cmd['prefix']))
c07f9fc5
FG
1131
    def serve(self):
        """Main loop of the active module: start the CherryPy exporter and
        block until shutdown() signals the event.

        Serves '/' (landing page) and '/metrics' (scrape endpoint backed by
        the collect cache, guarded by collect_lock).
        """

        class Root(object):

            # collapse everything to '/'
            def _cp_dispatch(self, vpath):
                cherrypy.request.path = ''
                return self

            @cherrypy.expose
            def index(self):
                # Minimal landing page pointing scrapers at /metrics.
                return '''<!DOCTYPE html>
<html>
    <head><title>Ceph Exporter</title></head>
    <body>
        <h1>Ceph Exporter</h1>
        <p><a href='/metrics'>Metrics</a></p>
    </body>
</html>'''

            @cherrypy.expose
            def metrics(self):
                # Lock the function execution
                assert isinstance(_global_instance, Module)
                with _global_instance.collect_lock:
                    return self._metrics(_global_instance)

            @staticmethod
            def _metrics(instance):
                # type: (Module) -> Any
                # Return cached data if available
                if not instance.collect_cache:
                    raise cherrypy.HTTPError(503, 'No cached data available yet')

                def respond():
                    assert isinstance(instance, Module)
                    cherrypy.response.headers['Content-Type'] = 'text/plain'
                    return instance.collect_cache

                if instance.collect_time < instance.scrape_interval:
                    # Respond if cache isn't stale
                    return respond()

                if instance.stale_cache_strategy == instance.STALE_CACHE_RETURN:
                    # Respond even if cache is stale
                    instance.log.info(
                        'Gathering data took {:.2f} seconds, metrics are stale for {:.2f} seconds, '
                        'returning metrics from stale cache.'.format(
                            instance.collect_time,
                            instance.collect_time - instance.scrape_interval
                        )
                    )
                    return respond()

                if instance.stale_cache_strategy == instance.STALE_CACHE_FAIL:
                    # Fail if cache is stale
                    msg = (
                        'Gathering data took {:.2f} seconds, metrics are stale for {:.2f} seconds, '
                        'returning "service unavailable".'.format(
                            instance.collect_time,
                            instance.collect_time - instance.scrape_interval,
                        )
                    )
                    instance.log.error(msg)
                    raise cherrypy.HTTPError(503, msg)

        # Make the cache timeout for collecting configurable
        self.scrape_interval = float(self.get_localized_module_option('scrape_interval', 15.0))

        # Unrecognized strategies fall back to failing on stale cache.
        self.stale_cache_strategy = self.get_localized_module_option('stale_cache_strategy', 'log')
        if self.stale_cache_strategy not in [self.STALE_CACHE_FAIL,
                                             self.STALE_CACHE_RETURN]:
            self.stale_cache_strategy = self.STALE_CACHE_FAIL

        server_addr = self.get_localized_module_option(
            'server_addr', get_default_addr())
        server_port = self.get_localized_module_option(
            'server_port', DEFAULT_PORT)
        self.log.info(
            "server_addr: %s server_port: %s" %
            (server_addr, server_port)
        )

        # Publish the URI that others may use to access the service we're
        # about to start serving
        self.set_uri('http://{0}:{1}/'.format(
            socket.getfqdn() if server_addr in ['::', '0.0.0.0'] else server_addr,
            server_port
        ))

        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })
        cherrypy.tree.mount(Root(), "/")
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info('Engine started.')
        # wait for the shutdown event
        self.shutdown_event.wait()
        self.shutdown_event.clear()
        cherrypy.engine.stop()
        self.log.info('Engine stopped.')
        self.shutdown_rbd_stats()
94b18763
FG
1237
    def shutdown(self):
        """Signal serve() to stop the CherryPy engine and return."""
        self.log.info('Stopping engine...')
        self.shutdown_event.set()
94b18763
FG
1241
1242
1243class StandbyModule(MgrStandbyModule):
91327a77
AA
1244 def __init__(self, *args, **kwargs):
1245 super(StandbyModule, self).__init__(*args, **kwargs)
1246 self.shutdown_event = threading.Event()
1247
94b18763 1248 def serve(self):
494da23a
TL
1249 server_addr = self.get_localized_module_option(
1250 'server_addr', get_default_addr())
11fdf7f2
TL
1251 server_port = self.get_localized_module_option(
1252 'server_port', DEFAULT_PORT)
1253 self.log.info("server_addr: %s server_port: %s" %
1254 (server_addr, server_port))
94b18763
FG
1255 cherrypy.config.update({
1256 'server.socket_host': server_addr,
1257 'server.socket_port': int(server_port),
1258 'engine.autoreload.on': False
1259 })
1260
1261 module = self
1262
1263 class Root(object):
94b18763
FG
1264 @cherrypy.expose
1265 def index(self):
1266 active_uri = module.get_active_uri()
1267 return '''<!DOCTYPE html>
1268<html>
9f95a23c
TL
1269 <head><title>Ceph Exporter</title></head>
1270 <body>
1271 <h1>Ceph Exporter</h1>
94b18763 1272 <p><a href='{}metrics'>Metrics</a></p>
9f95a23c 1273 </body>
94b18763
FG
1274</html>'''.format(active_uri)
1275
1276 @cherrypy.expose
1277 def metrics(self):
1278 cherrypy.response.headers['Content-Type'] = 'text/plain'
1279 return ''
1280
1281 cherrypy.tree.mount(Root(), '/', {})
1282 self.log.info('Starting engine...')
1283 cherrypy.engine.start()
94b18763 1284 self.log.info('Engine started.')
91327a77
AA
1285 # Wait for shutdown event
1286 self.shutdown_event.wait()
1287 self.shutdown_event.clear()
1288 cherrypy.engine.stop()
1289 self.log.info('Engine stopped.')
94b18763
FG
1290
1291 def shutdown(self):
1292 self.log.info("Stopping engine...")
91327a77 1293 self.shutdown_event.set()
94b18763 1294 self.log.info("Stopped engine")