]> git.proxmox.com Git - ceph.git/blame - ceph/src/pybind/mgr/prometheus/module.py
Import ceph 15.2.8
[ceph.git] / ceph / src / pybind / mgr / prometheus / module.py
CommitLineData
c07f9fc5 1import cherrypy
a8e16298 2from distutils.version import StrictVersion
3efd9988
FG
3import json
4import errno
c07f9fc5
FG
5import math
6import os
11fdf7f2 7import re
94b18763 8import socket
91327a77
AA
9import threading
10import time
11fdf7f2 11from mgr_module import MgrModule, MgrStandbyModule, CommandResult, PG_STATES
f6b5b4d7 12from mgr_util import get_default_addr, profile_method
11fdf7f2 13from rbd import RBD
f6b5b4d7
TL
try:
    from typing import Optional, Dict, Any, Set
except ImportError:
    # typing may be missing on very old interpreters; annotations in this
    # file are comment-based, so running without it is fine. A bare
    # `except:` here would also have swallowed KeyboardInterrupt/SystemExit.
    pass
c07f9fc5
FG
18
# Defaults for the Prometheus HTTP server. Can also be set in config-key.
# See https://github.com/prometheus/prometheus/wiki/Default-port-allocations
# for the Prometheus exporter port registry; 9283 is the port allocated to
# the Ceph mgr exporter.

DEFAULT_PORT = 9283
24
a8e16298
TL
# When the CherryPy server in 3.2.2 (and later) starts it attempts to verify
# that the ports its listening on are in fact bound. When using the any address
# "::" it tries both ipv4 and ipv6, and in some environments (e.g. kubernetes)
# ipv6 isn't yet configured / supported and CherryPy throws an uncaught
# exception.
if cherrypy is not None:
    v = StrictVersion(cherrypy.__version__)
    # the issue was fixed in 3.2.3. it's present in 3.2.2 (current version on
    # centos:7) and back to at least 3.0.0.
    if StrictVersion("3.1.2") <= v < StrictVersion("3.2.3"):
        # https://github.com/cherrypy/cherrypy/issues/1100
        # Stub the port-occupancy check out entirely; an actual failed bind
        # still surfaces later from the server itself.
        from cherrypy.process import servers
        servers.wait_for_occupied_port = lambda host, port: None
c07f9fc5 38
9f95a23c 39
# cherrypy likes to sys.exit on error. don't let it take us down too!
def os_exit_noop(*args, **kwargs):
    """No-op replacement for os._exit so cherrypy cannot kill the mgr."""
    return None


os._exit = os_exit_noop
46
c07f9fc5
FG
# Reference to the active Module instance so that other code in this file
# (e.g. the cherrypy handler classes) can reach it. Unlike the old dict
# version, rebinding this requires `global _global_instance` (Module.__init__
# does exactly that).

_global_instance = None  # type: Optional[Module]
52
def health_status_to_number(status):
    """Map a ceph health status string to a number.

    'HEALTH_OK' -> 0, 'HEALTH_WARN' -> 1, 'HEALTH_ERR' -> 2; any other
    value yields None (same as the original fall-through behaviour).
    """
    return {
        'HEALTH_OK': 0,
        'HEALTH_WARN': 1,
        'HEALTH_ERR': 2,
    }.get(status)
c07f9fc5 60
11fdf7f2
TL
61
# Cluster-level fields of the 'df' report, exported as ceph_cluster_*.
DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_used_raw_bytes']

# Per-pool fields of the 'df' report, exported as ceph_pool_*.
DF_POOL = ['max_avail', 'stored', 'stored_raw', 'objects', 'dirty',
           'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes',
           'compress_bytes_used', 'compress_under_bytes']

# Per-pool recovery-rate fields taken from osd_pool_stats.
# NOTE(review): 'num_bytes_recovered' appears twice; the second entry is
# redundant and looks like it may have been intended as
# 'num_keys_recovered' -- confirm against upstream before changing, since a
# change would add a new exported metric.
OSD_POOL_STATS = ('recovering_objects_per_sec', 'recovering_bytes_per_sec',
                  'recovering_keys_per_sec', 'num_objects_recovered',
                  'num_bytes_recovered', 'num_bytes_recovered')

# Cluster-wide OSD flags, each exported as ceph_osd_flag_<name>.
OSD_FLAGS = ('noup', 'nodown', 'noout', 'noin', 'nobackfill', 'norebalance',
             'norecover', 'noscrub', 'nodeep-scrub')

# Label names for the various *_metadata metric families below.
FS_METADATA = ('data_pools', 'fs_id', 'metadata_pool', 'name')

MDS_METADATA = ('ceph_daemon', 'fs_id', 'hostname', 'public_addr', 'rank',
                'ceph_version')

MON_METADATA = ('ceph_daemon', 'hostname',
                'public_addr', 'rank', 'ceph_version')

MGR_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

MGR_STATUS = ('ceph_daemon',)

MGR_MODULE_STATUS = ('name',)

MGR_MODULE_CAN_RUN = ('name',)

OSD_METADATA = ('back_iface', 'ceph_daemon', 'cluster_addr', 'device_class',
                'front_iface', 'hostname', 'objectstore', 'public_addr',
                'ceph_version')

# Per-OSD status fields copied straight from the osd_map entries.
OSD_STATUS = ['weight', 'up', 'in']

OSD_STATS = ['apply_latency_ms', 'commit_latency_ms']

POOL_METADATA = ('pool_id', 'name')

RGW_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

RBD_MIRROR_METADATA = ('ceph_daemon', 'id', 'instance_id', 'hostname',
                       'ceph_version')

# Label names chosen to match what the Prometheus node_exporter emits, so
# disk series can be joined across the two exporters.
DISK_OCCUPATION = ('ceph_daemon', 'device', 'db_device',
                   'wal_device', 'instance')

# Object health counters exported as ceph_num_objects_<state>.
NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']
c07f9fc5 110
c07f9fc5 111
91327a77
AA
class Metric(object):
    """One Prometheus metric family plus its currently recorded samples."""

    def __init__(self, mtype, name, desc, labels=None):
        self.mtype = mtype
        self.name = name
        self.desc = desc
        self.labelnames = labels  # tuple of label names, or None
        self.value = {}  # samples, keyed by tuple of label values

    def clear(self):
        """Forget every recorded sample (called before each scrape)."""
        self.value = {}

    def set(self, value, labelvalues=None):
        """Record *value* for *labelvalues* (a tuple; defaults to ('',))."""
        self.value[labelvalues or ('',)] = value

    def str_expfmt(self):
        """Render the metric in the Prometheus text exposition format."""

        def promethize(path):
            ''' replace illegal metric name characters '''
            result = re.sub(r'[./\s]|::', '_', path).replace('+', '_plus')

            # Hyphens usually turn into underscores, unless they are
            # trailing
            if result.endswith("-"):
                result = result[0:-1] + "_minus"
            else:
                result = result.replace("-", "_")

            return "ceph_{0}".format(result)

        def floatstr(value):
            ''' represent as Go-compatible float '''
            if value == float('inf'):
                return '+Inf'
            if value == float('-inf'):
                return '-Inf'
            if math.isnan(value):
                return 'NaN'
            return repr(float(value))

        name = promethize(self.name)
        chunks = [
            '\n# HELP {name} {desc}'.format(name=name, desc=self.desc),
            '\n# TYPE {name} {mtype}'.format(name=name, mtype=self.mtype),
        ]

        for labelvalues, value in self.value.items():
            if self.labelnames:
                pairs = zip(self.labelnames, labelvalues)
                labels = ','.join('%s="%s"' % (k, v) for k, v in pairs)
            else:
                labels = ''
            if labels:
                sample_fmt = '\n{name}{{{labels}}} {value}'
            else:
                sample_fmt = '\n{name} {value}'
            chunks.append(sample_fmt.format(name=name, labels=labels,
                                            value=floatstr(value)))
        return ''.join(chunks)
178
179
f6b5b4d7
TL
class MetricCollectionThread(threading.Thread):
    """Background worker that keeps the module's metric cache fresh.

    Loops forever: whenever a MON connection exists it runs a full
    collection, publishes the result under the module's collect_lock, and
    sleeps out whatever is left of the configured scrape interval.
    """

    def __init__(self, module):
        # type: (Module) -> None
        self.mod = module
        super(MetricCollectionThread, self).__init__(target=self.collect)

    def collect(self):
        self.mod.log.info('starting metric collection thread')
        while True:
            self.mod.log.debug('collecting cache in thread')
            if not self.mod.have_mon_connection():
                self.mod.log.error('No MON connection')
                time.sleep(self.mod.scrape_interval)
                continue

            started = time.time()
            data = self.mod.collect()
            elapsed = time.time() - started

            self.mod.log.debug('collecting cache in thread done')

            remaining = self.mod.scrape_interval - elapsed
            if remaining < 0:
                self.mod.log.warning(
                    'Collecting data took more time than configured scrape interval. '
                    'This possibly results in stale data. Please check the '
                    '`stale_cache_strategy` configuration option. '
                    'Collecting data took {:.2f} seconds but scrape interval is configured '
                    'to be {:.0f} seconds.'.format(
                        elapsed,
                        self.mod.scrape_interval,
                    )
                )
                remaining = 0

            with self.mod.collect_lock:
                self.mod.collect_cache = data
                self.mod.collect_time = elapsed

            time.sleep(remaining)
219
220
91327a77
AA
221class Module(MgrModule):
    # Commands served under `ceph prometheus ...`; dispatched by command
    # handling outside this chunk.
    COMMANDS = [
        {
            "cmd": "prometheus file_sd_config",
            "desc": "Return file_sd compatible prometheus config for mgr cluster",
            "perm": "r"
        },
    ]

    # Declared module options; rbd_stats_* values are read in get_rbd_stats
    # via get_localized_module_option.
    MODULE_OPTIONS = [
        {'name': 'server_addr'},
        {'name': 'server_port'},
        {'name': 'scrape_interval'},
        {'name': 'stale_cache_strategy'},
        {'name': 'rbd_stats_pools'},
        {'name': 'rbd_stats_pools_refresh_interval', 'type': 'int', 'default': 300},
    ]

    # Possible stale_cache_strategy values: fail the scrape, or return the
    # (possibly stale) cached data.
    STALE_CACHE_FAIL = 'fail'
    STALE_CACHE_RETURN = 'return'
241
91327a77
AA
    def __init__(self, *args, **kwargs):
        """Initialise module state and start the metric collection thread."""
        super(Module, self).__init__(*args, **kwargs)
        self.metrics = self._setup_static_metrics()
        self.shutdown_event = threading.Event()
        # Guards collect_cache/collect_time: written by
        # MetricCollectionThread, read elsewhere.
        self.collect_lock = threading.Lock()
        self.collect_time = 0.0
        # Defaults; presumably overridden from MODULE_OPTIONS by config
        # handling outside this chunk -- TODO confirm.
        self.scrape_interval = 15.0
        self.stale_cache_strategy = self.STALE_CACHE_FAIL
        self.collect_cache = None
        # Book-keeping for the dynamic per-RBD-image perf counter query
        # (see get_rbd_stats / refresh_rbd_stats_pools).
        self.rbd_stats = {
            'pools': {},
            'pools_refresh_time': 0,
            'counters_info': {
                'write_ops': {'type': self.PERFCOUNTER_COUNTER,
                              'desc': 'RBD image writes count'},
                'read_ops': {'type': self.PERFCOUNTER_COUNTER,
                             'desc': 'RBD image reads count'},
                'write_bytes': {'type': self.PERFCOUNTER_COUNTER,
                                'desc': 'RBD image bytes written'},
                'read_bytes': {'type': self.PERFCOUNTER_COUNTER,
                               'desc': 'RBD image bytes read'},
                'write_latency': {'type': self.PERFCOUNTER_LONGRUNAVG,
                                  'desc': 'RBD image writes latency (msec)'},
                'read_latency': {'type': self.PERFCOUNTER_LONGRUNAVG,
                                 'desc': 'RBD image reads latency (msec)'},
            },
        }  # type: Dict[str, Any]
        global _global_instance
        _global_instance = self
        MetricCollectionThread(_global_instance).start()
3efd9988
FG
272
    def _setup_static_metrics(self):
        """Pre-declare every fixed-name Metric this module exports.

        Returns a dict mapping metric path (e.g. 'osd_up') to its Metric
        object. Dynamic metrics (perf counters, rbd_*) are added to
        self.metrics later, outside this method.
        """
        metrics = {}
        metrics['health_status'] = Metric(
            'untyped',
            'health_status',
            'Cluster health status'
        )
        metrics['mon_quorum_status'] = Metric(
            'gauge',
            'mon_quorum_status',
            'Monitors in quorum',
            ('ceph_daemon',)
        )
        metrics['fs_metadata'] = Metric(
            'untyped',
            'fs_metadata',
            'FS Metadata',
            FS_METADATA
        )
        metrics['mds_metadata'] = Metric(
            'untyped',
            'mds_metadata',
            'MDS Metadata',
            MDS_METADATA
        )
        metrics['mon_metadata'] = Metric(
            'untyped',
            'mon_metadata',
            'MON Metadata',
            MON_METADATA
        )
        metrics['mgr_metadata'] = Metric(
            'gauge',
            'mgr_metadata',
            'MGR metadata',
            MGR_METADATA
        )
        metrics['mgr_status'] = Metric(
            'gauge',
            'mgr_status',
            'MGR status (0=standby, 1=active)',
            MGR_STATUS
        )
        metrics['mgr_module_status'] = Metric(
            'gauge',
            'mgr_module_status',
            'MGR module status (0=disabled, 1=enabled, 2=auto-enabled)',
            MGR_MODULE_STATUS
        )
        metrics['mgr_module_can_run'] = Metric(
            'gauge',
            'mgr_module_can_run',
            'MGR module runnable state i.e. can it run (0=no, 1=yes)',
            MGR_MODULE_CAN_RUN
        )
        metrics['osd_metadata'] = Metric(
            'untyped',
            'osd_metadata',
            'OSD Metadata',
            OSD_METADATA
        )

        # The reason for having this separate to OSD_METADATA is
        # so that we can stably use the same tag names that
        # the Prometheus node_exporter does
        metrics['disk_occupation'] = Metric(
            'untyped',
            'disk_occupation',
            'Associate Ceph daemon with disk used',
            DISK_OCCUPATION
        )

        metrics['pool_metadata'] = Metric(
            'untyped',
            'pool_metadata',
            'POOL Metadata',
            POOL_METADATA
        )

        metrics['rgw_metadata'] = Metric(
            'untyped',
            'rgw_metadata',
            'RGW Metadata',
            RGW_METADATA
        )

        metrics['rbd_mirror_metadata'] = Metric(
            'untyped',
            'rbd_mirror_metadata',
            'RBD Mirror Metadata',
            RBD_MIRROR_METADATA
        )

        metrics['pg_total'] = Metric(
            'gauge',
            'pg_total',
            'PG Total Count per Pool',
            ('pool_id',)
        )

        # One unlabelled metric per cluster-wide OSD flag.
        for flag in OSD_FLAGS:
            path = 'osd_flag_{}'.format(flag)
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD Flag {}'.format(flag)
            )
        # Per-daemon OSD status (weight/up/in).
        for state in OSD_STATUS:
            path = 'osd_{}'.format(state)
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD status {}'.format(state),
                ('ceph_daemon',)
            )
        # Per-daemon OSD latency stats.
        for stat in OSD_STATS:
            path = 'osd_{}'.format(stat)
            metrics[path] = Metric(
                'gauge',
                path,
                'OSD stat {}'.format(stat),
                ('ceph_daemon',)
            )
        # Per-pool recovery rates.
        for stat in OSD_POOL_STATS:
            path = 'pool_{}'.format(stat)
            metrics[path] = Metric(
                'gauge',
                path,
                "OSD pool stats: {}".format(stat),
                ('pool_id',)
            )
        # Per-pool PG state counts, one metric per known PG state.
        for state in PG_STATES:
            path = 'pg_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'PG {} per pool'.format(state),
                ('pool_id',)
            )
        # Cluster-wide 'df' fields.
        for state in DF_CLUSTER:
            path = 'cluster_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'DF {}'.format(state),
            )
        # Per-pool 'df' fields.
        for state in DF_POOL:
            path = 'pool_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'DF pool {}'.format(state),
                ('pool_id',)
            )
        # Degraded/misplaced/unfound object totals.
        for state in NUM_OBJECTS:
            path = 'num_objects_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'Number of {} objects'.format(state),
            )

        return metrics
c07f9fc5 436
f6b5b4d7 437 @profile_method()
3efd9988
FG
438 def get_health(self):
439 health = json.loads(self.get('health')['json'])
91327a77
AA
440 self.metrics['health_status'].set(
441 health_status_to_number(health['status'])
c07f9fc5
FG
442 )
443
f6b5b4d7 444 @profile_method()
11fdf7f2
TL
445 def get_pool_stats(self):
446 # retrieve pool stats to provide per pool recovery metrics
447 # (osd_pool_stats moved to mgr in Mimic)
448 pstats = self.get('osd_pool_stats')
449 for pool in pstats['pool_stats']:
450 for stat in OSD_POOL_STATS:
451 self.metrics['pool_{}'.format(stat)].set(
452 pool['recovery_rate'].get(stat, 0),
453 (pool['pool_id'],)
454 )
455
f6b5b4d7 456 @profile_method()
3efd9988
FG
457 def get_df(self):
458 # maybe get the to-be-exported metrics from a config?
459 df = self.get('df')
460 for stat in DF_CLUSTER:
91327a77 461 self.metrics['cluster_{}'.format(stat)].set(df['stats'][stat])
3efd9988
FG
462
463 for pool in df['pools']:
464 for stat in DF_POOL:
91327a77
AA
465 self.metrics['pool_{}'.format(stat)].set(
466 pool['stats'][stat],
467 (pool['id'],)
468 )
94b18763 469
f6b5b4d7 470 @profile_method()
94b18763
FG
471 def get_fs(self):
472 fs_map = self.get('fs_map')
473 servers = self.get_service_list()
9f95a23c
TL
474 self.log.debug('standbys: {}'.format(fs_map['standbys']))
475 # export standby mds metadata, default standby fs_id is '-1'
476 for standby in fs_map['standbys']:
477 id_ = standby['name']
478 host_version = servers.get((id_, 'mds'), ('', ''))
479 self.metrics['mds_metadata'].set(1, (
480 'mds.{}'.format(id_), '-1',
481 host_version[0], standby['addr'],
482 standby['rank'], host_version[1]
483 ))
94b18763
FG
484 for fs in fs_map['filesystems']:
485 # collect fs metadata
11fdf7f2
TL
486 data_pools = ",".join([str(pool)
487 for pool in fs['mdsmap']['data_pools']])
91327a77
AA
488 self.metrics['fs_metadata'].set(1, (
489 data_pools,
490 fs['id'],
491 fs['mdsmap']['metadata_pool'],
492 fs['mdsmap']['fs_name']
493 ))
28e407b8 494 self.log.debug('mdsmap: {}'.format(fs['mdsmap']))
94b18763
FG
495 for gid, daemon in fs['mdsmap']['info'].items():
496 id_ = daemon['name']
11fdf7f2 497 host_version = servers.get((id_, 'mds'), ('', ''))
91327a77
AA
498 self.metrics['mds_metadata'].set(1, (
499 'mds.{}'.format(id_), fs['id'],
500 host_version[0], daemon['addr'],
501 daemon['rank'], host_version[1]
502 ))
3efd9988 503
f6b5b4d7 504 @profile_method()
3efd9988
FG
505 def get_quorum_status(self):
506 mon_status = json.loads(self.get('mon_status')['json'])
94b18763
FG
507 servers = self.get_service_list()
508 for mon in mon_status['monmap']['mons']:
509 rank = mon['rank']
510 id_ = mon['name']
11fdf7f2 511 host_version = servers.get((id_, 'mon'), ('', ''))
91327a77
AA
512 self.metrics['mon_metadata'].set(1, (
513 'mon.{}'.format(id_), host_version[0],
f91f0fd5 514 mon['public_addr'].rsplit(':', 1)[0], rank,
91327a77
AA
515 host_version[1]
516 ))
94b18763 517 in_quorum = int(rank in mon_status['quorum'])
91327a77
AA
518 self.metrics['mon_quorum_status'].set(in_quorum, (
519 'mon.{}'.format(id_),
520 ))
3efd9988 521
    @profile_method()
    def get_mgr_status(self):
        """Export MGR daemon metadata/active-state and per-module status."""
        mgr_map = self.get('mgr_map')
        servers = self.get_service_list()

        active = mgr_map['active_name']
        standbys = [s.get('name') for s in mgr_map['standbys']]

        all_mgrs = list(standbys)
        all_mgrs.append(active)

        # module name -> can_run flag, for every available module
        all_modules = {module.get('name'):module.get('can_run') for module in mgr_map['available_modules']}

        # Release name of the active mgr; used below to look up the
        # always-on module list. Stays None if the active mgr is never seen.
        ceph_release = None
        for mgr in all_mgrs:
            host_version = servers.get((mgr, 'mgr'), ('', ''))
            if mgr == active:
                _state = 1
                # NOTE(review): if the active mgr has an empty version string
                # in the server list this raises IndexError -- confirm the
                # caller tolerates that.
                ceph_release = host_version[1].split()[-2] # e.g. nautilus
            else:
                _state = 0

            self.metrics['mgr_metadata'].set(1, (
                'mgr.{}'.format(mgr), host_version[0],
                host_version[1]
            ))
            self.metrics['mgr_status'].set(_state, (
                'mgr.{}'.format(mgr),
            ))
        # always-on modules report state 2; explicitly enabled ones state 1
        always_on_modules = mgr_map['always_on_modules'].get(ceph_release, [])
        active_modules = list(always_on_modules)
        active_modules.extend(mgr_map['modules'])

        for mod_name in all_modules.keys():

            if mod_name in always_on_modules:
                _state = 2
            elif mod_name in active_modules:
                _state = 1
            else:
                _state = 0

            _can_run = 1 if all_modules[mod_name] else 0
            self.metrics['mgr_module_status'].set(_state, (mod_name,))
            self.metrics['mgr_module_can_run'].set(_can_run, (mod_name,))
567
    @profile_method()
    def get_pg_status(self):
        """Export per-pool PG state counts (ceph_pg_<state>, ceph_pg_total)."""

        pg_summary = self.get('pg_summary')

        for pool in pg_summary['by_pool']:
            # start every known PG state at zero so absent states export as 0
            num_by_state = dict((state, 0) for state in PG_STATES)
            num_by_state['total'] = 0

            for state_name, count in pg_summary['by_pool'][pool].items():
                # a PG reports a compound state such as 'active+clean';
                # credit each component state.
                # NOTE(review): a component state not present in PG_STATES
                # raises an uncaught KeyError here -- confirm that is the
                # intended failure mode.
                for state in state_name.split('+'):
                    num_by_state[state] += count
                # total counts each PG once, regardless of how many
                # component states it reports
                num_by_state['total'] += count

            for state, num in num_by_state.items():
                try:
                    self.metrics["pg_{}".format(state)].set(num, (pool,))
                except KeyError:
                    self.log.warning("skipping pg in unknown state {}".format(state))
b32b8144 587
f6b5b4d7 588 @profile_method()
b32b8144
FG
589 def get_osd_stats(self):
590 osd_stats = self.get('osd_stats')
591 for osd in osd_stats['osd_stats']:
592 id_ = osd['osd']
593 for stat in OSD_STATS:
94b18763 594 val = osd['perf_stat'][stat]
91327a77
AA
595 self.metrics['osd_{}'.format(stat)].set(val, (
596 'osd.{}'.format(id_),
597 ))
94b18763
FG
598
599 def get_service_list(self):
600 ret = {}
601 for server in self.list_servers():
602 version = server.get('ceph_version', '')
603 host = server.get('hostname', '')
604 for service in server.get('services', []):
605 ret.update({(service['id'], service['type']): (host, version)})
606 return ret
3efd9988 607
    @profile_method()
    def get_metadata_and_osd_status(self):
        """Export OSD flags, per-OSD metadata/status/disk occupation, pool
        metadata, and RGW / rbd-mirror daemon metadata."""
        osd_map = self.get('osd_map')
        osd_flags = osd_map['flags'].split(',')
        for flag in OSD_FLAGS:
            self.metrics['osd_flag_{}'.format(flag)].set(
                int(flag in osd_flags)
            )

        osd_devices = self.get('osd_map_crush')['devices']
        servers = self.get_service_list()
        for osd in osd_map['osds']:
            # id can be used to link osd metrics and metadata
            id_ = osd['osd']
            # collect osd metadata; drop the trailing :port from addresses
            p_addr = osd['public_addr'].rsplit(':', 1)[0]
            c_addr = osd['cluster_addr'].rsplit(':', 1)[0]
            if p_addr == "-" or c_addr == "-":
                self.log.info(
                    "Missing address metadata for osd {0}, skipping occupation"
                    " and metadata records for this osd".format(id_)
                )
                continue

            # device class comes from the CRUSH map entry for this osd id
            dev_class = None
            for osd_device in osd_devices:
                if osd_device['id'] == id_:
                    dev_class = osd_device.get('class', '')
                    break

            if dev_class is None:
                self.log.info("OSD {0} is missing from CRUSH map, "
                              "skipping output".format(id_))
                continue

            host_version = servers.get((str(id_), 'osd'), ('', ''))

            # collect disk occupation metadata
            osd_metadata = self.get_metadata("osd", str(id_))
            if osd_metadata is None:
                continue

            obj_store = osd_metadata.get('osd_objectstore', '')
            f_iface = osd_metadata.get('front_iface', '')
            b_iface = osd_metadata.get('back_iface', '')

            # label order must match OSD_METADATA
            self.metrics['osd_metadata'].set(1, (
                b_iface,
                'osd.{}'.format(id_),
                c_addr,
                dev_class,
                f_iface,
                host_version[0],
                obj_store,
                p_addr,
                host_version[1]
            ))

            # collect osd status
            for state in OSD_STATUS:
                status = osd[state]
                self.metrics['osd_{}'.format(state)].set(status, (
                    'osd.{}'.format(id_),
                ))

            osd_dev_node = None
            if obj_store == "filestore":
                # collect filestore backend device
                osd_dev_node = osd_metadata.get(
                    'backend_filestore_dev_node', None)
                # collect filestore journal device
                osd_wal_dev_node = osd_metadata.get('osd_journal', '')
                osd_db_dev_node = ''
            elif obj_store == "bluestore":
                # collect bluestore backend device
                osd_dev_node = osd_metadata.get(
                    'bluestore_bdev_dev_node', None)
                # collect bluestore wal backend
                osd_wal_dev_node = osd_metadata.get('bluefs_wal_dev_node', '')
                # collect bluestore db backend
                osd_db_dev_node = osd_metadata.get('bluefs_db_dev_node', '')
            if osd_dev_node and osd_dev_node == "unknown":
                osd_dev_node = None

            osd_hostname = osd_metadata.get('hostname', None)
            if osd_dev_node and osd_hostname:
                self.log.debug("Got dev for osd {0}: {1}/{2}".format(
                    id_, osd_hostname, osd_dev_node))
                self.metrics['disk_occupation'].set(1, (
                    "osd.{0}".format(id_),
                    osd_dev_node,
                    osd_db_dev_node,
                    osd_wal_dev_node,
                    osd_hostname
                ))
            else:
                self.log.info("Missing dev node metadata for osd {0}, skipping "
                              "occupation record for this osd".format(id_))

        for pool in osd_map['pools']:
            self.metrics['pool_metadata'].set(
                1, (pool['pool'], pool['pool_name']))

        # Populate other servers metadata
        for key, value in servers.items():
            service_id, service_type = key
            if service_type == 'rgw':
                hostname, version = value
                self.metrics['rgw_metadata'].set(
                    1,
                    ('{}.{}'.format(service_type, service_id),
                     hostname, version)
                )
            elif service_type == 'rbd-mirror':
                mirror_metadata = self.get_metadata('rbd-mirror', service_id)
                if mirror_metadata is None:
                    continue
                mirror_metadata['ceph_daemon'] = '{}.{}'.format(service_type,
                                                                service_id)
                # NOTE(review): this passes a generator (not a tuple) as the
                # label-value key; it works because Metric.set only stores it
                # and str_expfmt consumes it once, but a tuple would be
                # safer -- confirm before changing.
                self.metrics['rbd_mirror_metadata'].set(
                    1, (mirror_metadata.get(k, '')
                        for k in RBD_MIRROR_METADATA)
                )
3efd9988 731
f6b5b4d7 732 @profile_method()
28e407b8
AA
733 def get_num_objects(self):
734 pg_sum = self.get('pg_summary')['pg_stats_sum']['stat_sum']
735 for obj in NUM_OBJECTS:
736 stat = 'num_objects_{}'.format(obj)
91327a77 737 self.metrics[stat].set(pg_sum[stat])
28e407b8 738
f6b5b4d7 739 @profile_method()
11fdf7f2
TL
740 def get_rbd_stats(self):
741 # Per RBD image stats is collected by registering a dynamic osd perf
742 # stats query that tells OSDs to group stats for requests associated
743 # with RBD objects by pool, namespace, and image id, which are
744 # extracted from the request object names or other attributes.
745 # The RBD object names have the following prefixes:
746 # - rbd_data.{image_id}. (data stored in the same pool as metadata)
747 # - rbd_data.{pool_id}.{image_id}. (data stored in a dedicated data pool)
748 # - journal_data.{pool_id}.{image_id}. (journal if journaling is enabled)
749 # The pool_id in the object name is the id of the pool with the image
750 # metdata, and should be used in the image spec. If there is no pool_id
751 # in the object name, the image pool is the pool where the object is
752 # located.
753
754 # Parse rbd_stats_pools option, which is a comma or space separated
755 # list of pool[/namespace] entries. If no namespace is specifed the
f6b5b4d7
TL
756 # stats are collected for every namespace in the pool. The wildcard
757 # '*' can be used to indicate all pools or namespaces
11fdf7f2 758 pools_string = self.get_localized_module_option('rbd_stats_pools', '')
f6b5b4d7
TL
759 pool_keys = []
760 for x in re.split('[\s,]+', pools_string):
761 if not x:
762 continue
763
764 s = x.split('/', 2)
11fdf7f2 765 pool_name = s[0]
f6b5b4d7
TL
766 namespace_name = None
767 if len(s) == 2:
768 namespace_name = s[1]
769
770 if pool_name == "*":
771 # collect for all pools
772 osd_map = self.get('osd_map')
773 for pool in osd_map['pools']:
774 if 'rbd' not in pool.get('application_metadata', {}):
775 continue
776 pool_keys.append((pool['pool_name'], namespace_name))
777 else:
778 pool_keys.append((pool_name, namespace_name))
779
780 pools = {} # type: Dict[str, Set[str]]
781 for pool_key in pool_keys:
782 pool_name = pool_key[0]
783 namespace_name = pool_key[1]
784 if not namespace_name or namespace_name == "*":
11fdf7f2
TL
785 # empty set means collect for all namespaces
786 pools[pool_name] = set()
787 continue
f6b5b4d7 788
11fdf7f2
TL
789 if pool_name not in pools:
790 pools[pool_name] = set()
791 elif not pools[pool_name]:
792 continue
f6b5b4d7 793 pools[pool_name].add(namespace_name)
11fdf7f2
TL
794
795 rbd_stats_pools = {}
f6b5b4d7 796 for pool_id in self.rbd_stats['pools'].keys():
11fdf7f2
TL
797 name = self.rbd_stats['pools'][pool_id]['name']
798 if name not in pools:
799 del self.rbd_stats['pools'][pool_id]
800 else:
801 rbd_stats_pools[name] = \
802 self.rbd_stats['pools'][pool_id]['ns_names']
803
804 pools_refreshed = False
805 if pools:
806 next_refresh = self.rbd_stats['pools_refresh_time'] + \
807 self.get_localized_module_option(
808 'rbd_stats_pools_refresh_interval', 300)
809 if rbd_stats_pools != pools or time.time() >= next_refresh:
810 self.refresh_rbd_stats_pools(pools)
811 pools_refreshed = True
812
813 pool_ids = list(self.rbd_stats['pools'])
814 pool_ids.sort()
815 pool_id_regex = '^(' + '|'.join([str(x) for x in pool_ids]) + ')$'
816
817 nspace_names = []
818 for pool_id, pool in self.rbd_stats['pools'].items():
819 if pool['ns_names']:
820 nspace_names.extend(pool['ns_names'])
821 else:
822 nspace_names = []
823 break
824 if nspace_names:
825 namespace_regex = '^(' + \
826 "|".join([re.escape(x)
827 for x in set(nspace_names)]) + ')$'
828 else:
829 namespace_regex = '^(.*)$'
830
831 if 'query' in self.rbd_stats and \
832 (pool_id_regex != self.rbd_stats['query']['key_descriptor'][0]['regex'] or
833 namespace_regex != self.rbd_stats['query']['key_descriptor'][1]['regex']):
834 self.remove_osd_perf_query(self.rbd_stats['query_id'])
835 del self.rbd_stats['query_id']
836 del self.rbd_stats['query']
837
838 if not self.rbd_stats['pools']:
839 return
840
841 counters_info = self.rbd_stats['counters_info']
842
843 if 'query_id' not in self.rbd_stats:
844 query = {
845 'key_descriptor': [
846 {'type': 'pool_id', 'regex': pool_id_regex},
847 {'type': 'namespace', 'regex': namespace_regex},
848 {'type': 'object_name',
849 'regex': '^(?:rbd|journal)_data\.(?:([0-9]+)\.)?([^.]+)\.'},
850 ],
851 'performance_counter_descriptors': list(counters_info),
852 }
853 query_id = self.add_osd_perf_query(query)
854 if query_id is None:
855 self.log.error('failed to add query %s' % query)
856 return
857 self.rbd_stats['query'] = query
858 self.rbd_stats['query_id'] = query_id
859
860 res = self.get_osd_perf_counters(self.rbd_stats['query_id'])
861 for c in res['counters']:
862 # if the pool id is not found in the object name use id of the
863 # pool where the object is located
864 if c['k'][2][0]:
865 pool_id = int(c['k'][2][0])
866 else:
867 pool_id = int(c['k'][0][0])
868 if pool_id not in self.rbd_stats['pools'] and not pools_refreshed:
869 self.refresh_rbd_stats_pools(pools)
870 pools_refreshed = True
871 if pool_id not in self.rbd_stats['pools']:
872 continue
873 pool = self.rbd_stats['pools'][pool_id]
874 nspace_name = c['k'][1][0]
875 if nspace_name not in pool['images']:
876 continue
877 image_id = c['k'][2][1]
878 if image_id not in pool['images'][nspace_name] and \
879 not pools_refreshed:
880 self.refresh_rbd_stats_pools(pools)
881 pool = self.rbd_stats['pools'][pool_id]
882 pools_refreshed = True
883 if image_id not in pool['images'][nspace_name]:
884 continue
885 counters = pool['images'][nspace_name][image_id]['c']
886 for i in range(len(c['c'])):
887 counters[i][0] += c['c'][i][0]
888 counters[i][1] += c['c'][i][1]
889
890 label_names = ("pool", "namespace", "image")
891 for pool_id, pool in self.rbd_stats['pools'].items():
892 pool_name = pool['name']
893 for nspace_name, images in pool['images'].items():
894 for image_id in images:
895 image_name = images[image_id]['n']
896 counters = images[image_id]['c']
897 i = 0
898 for key in counters_info:
899 counter_info = counters_info[key]
900 stattype = self._stattype_to_str(counter_info['type'])
901 labels = (pool_name, nspace_name, image_name)
902 if counter_info['type'] == self.PERFCOUNTER_COUNTER:
903 path = 'rbd_' + key
904 if path not in self.metrics:
905 self.metrics[path] = Metric(
906 stattype,
907 path,
908 counter_info['desc'],
909 label_names,
910 )
911 self.metrics[path].set(counters[i][0], labels)
912 elif counter_info['type'] == self.PERFCOUNTER_LONGRUNAVG:
913 path = 'rbd_' + key + '_sum'
914 if path not in self.metrics:
915 self.metrics[path] = Metric(
916 stattype,
917 path,
918 counter_info['desc'] + ' Total',
919 label_names,
920 )
921 self.metrics[path].set(counters[i][0], labels)
922 path = 'rbd_' + key + '_count'
923 if path not in self.metrics:
924 self.metrics[path] = Metric(
925 'counter',
926 path,
927 counter_info['desc'] + ' Count',
928 label_names,
929 )
930 self.metrics[path].set(counters[i][1], labels)
931 i += 1
932
933 def refresh_rbd_stats_pools(self, pools):
934 self.log.debug('refreshing rbd pools %s' % (pools))
935
936 rbd = RBD()
937 counters_info = self.rbd_stats['counters_info']
938 for pool_name, cfg_ns_names in pools.items():
939 try:
940 pool_id = self.rados.pool_lookup(pool_name)
941 with self.rados.open_ioctx(pool_name) as ioctx:
942 if pool_id not in self.rbd_stats['pools']:
943 self.rbd_stats['pools'][pool_id] = {'images': {}}
944 pool = self.rbd_stats['pools'][pool_id]
945 pool['name'] = pool_name
946 pool['ns_names'] = cfg_ns_names
947 if cfg_ns_names:
948 nspace_names = list(cfg_ns_names)
949 else:
950 nspace_names = [''] + rbd.namespace_list(ioctx)
951 for nspace_name in pool['images']:
952 if nspace_name not in nspace_names:
953 del pool['images'][nspace_name]
954 for nspace_name in nspace_names:
955 if (nspace_name and
956 not rbd.namespace_exists(ioctx, nspace_name)):
957 self.log.debug('unknown namespace %s for pool %s' %
958 (nspace_name, pool_name))
959 continue
960 ioctx.set_namespace(nspace_name)
961 if nspace_name not in pool['images']:
962 pool['images'][nspace_name] = {}
963 namespace = pool['images'][nspace_name]
964 images = {}
965 for image_meta in RBD().list2(ioctx):
966 image = {'n': image_meta['name']}
967 image_id = image_meta['id']
968 if image_id in namespace:
969 image['c'] = namespace[image_id]['c']
970 else:
971 image['c'] = [[0, 0] for x in counters_info]
972 images[image_id] = image
973 pool['images'][nspace_name] = images
974 except Exception as e:
975 self.log.error('failed listing pool %s: %s' % (pool_name, e))
976 self.rbd_stats['pools_refresh_time'] = time.time()
977
978 def shutdown_rbd_stats(self):
979 if 'query_id' in self.rbd_stats:
980 self.remove_osd_perf_query(self.rbd_stats['query_id'])
981 del self.rbd_stats['query_id']
982 del self.rbd_stats['query']
983 self.rbd_stats['pools'].clear()
984
e306af50
TL
985 def add_fixed_name_metrics(self):
986 """
987 Add fixed name metrics from existing ones that have details in their names
988 that should be in labels (not in name).
989 For backward compatibility, a new fixed name metric is created (instead of replacing)
990 and details are put in new labels.
991 Intended for RGW sync perf. counters but extendable as required.
992 See: https://tracker.ceph.com/issues/45311
993 """
994 new_metrics = {}
995 for metric_path in self.metrics.keys():
996 # Address RGW sync perf. counters.
997 match = re.search('^data-sync-from-(.*)\.', metric_path)
998 if match:
999 new_path = re.sub('from-([^.]*)', 'from-zone', metric_path)
1000 if new_path not in new_metrics:
1001 new_metrics[new_path] = Metric(
1002 self.metrics[metric_path].mtype,
1003 new_path,
1004 self.metrics[metric_path].desc,
1005 self.metrics[metric_path].labelnames + ('source_zone',)
1006 )
1007 for label_values, value in self.metrics[metric_path].value.items():
1008 new_metrics[new_path].set(value, label_values + (match.group(1),))
1009
1010 self.metrics.update(new_metrics)
1011
f6b5b4d7 1012 @profile_method(True)
c07f9fc5 1013 def collect(self):
91327a77
AA
1014 # Clear the metrics before scraping
1015 for k in self.metrics.keys():
1016 self.metrics[k].clear()
1017
3efd9988
FG
1018 self.get_health()
1019 self.get_df()
11fdf7f2 1020 self.get_pool_stats()
94b18763 1021 self.get_fs()
b32b8144 1022 self.get_osd_stats()
3efd9988 1023 self.get_quorum_status()
494da23a 1024 self.get_mgr_status()
3efd9988
FG
1025 self.get_metadata_and_osd_status()
1026 self.get_pg_status()
28e407b8 1027 self.get_num_objects()
3efd9988 1028
94b18763 1029 for daemon, counters in self.get_all_perf_counters().items():
3efd9988 1030 for path, counter_info in counters.items():
28e407b8 1031 # Skip histograms, they are represented by long running avgs
3efd9988 1032 stattype = self._stattype_to_str(counter_info['type'])
3efd9988
FG
1033 if not stattype or stattype == 'histogram':
1034 self.log.debug('ignoring %s, type %s' % (path, stattype))
1035 continue
1036
81eedcae
TL
1037 path, label_names, labels = self._perfpath_to_path_labels(
1038 daemon, path)
1039
28e407b8 1040 # Get the value of the counter
11fdf7f2
TL
1041 value = self._perfvalue_to_value(
1042 counter_info['type'], counter_info['value'])
28e407b8
AA
1043
1044 # Represent the long running avgs as sum/count pairs
1045 if counter_info['type'] & self.PERFCOUNTER_LONGRUNAVG:
1046 _path = path + '_sum'
91327a77
AA
1047 if _path not in self.metrics:
1048 self.metrics[_path] = Metric(
1049 stattype,
1050 _path,
1051 counter_info['description'] + ' Total',
81eedcae 1052 label_names,
91327a77 1053 )
81eedcae 1054 self.metrics[_path].set(value, labels)
28e407b8
AA
1055
1056 _path = path + '_count'
91327a77
AA
1057 if _path not in self.metrics:
1058 self.metrics[_path] = Metric(
1059 'counter',
1060 _path,
1061 counter_info['description'] + ' Count',
81eedcae 1062 label_names,
91327a77 1063 )
81eedcae 1064 self.metrics[_path].set(counter_info['count'], labels,)
28e407b8 1065 else:
91327a77
AA
1066 if path not in self.metrics:
1067 self.metrics[path] = Metric(
1068 stattype,
1069 path,
1070 counter_info['description'],
81eedcae 1071 label_names,
91327a77 1072 )
81eedcae 1073 self.metrics[path].set(value, labels)
91327a77 1074
e306af50 1075 self.add_fixed_name_metrics()
11fdf7f2
TL
1076 self.get_rbd_stats()
1077
91327a77
AA
1078 # Return formatted metrics and clear no longer used data
1079 _metrics = [m.str_expfmt() for m in self.metrics.values()]
1080 for k in self.metrics.keys():
1081 self.metrics[k].clear()
1082
1083 return ''.join(_metrics) + '\n'
c07f9fc5 1084
11fdf7f2
TL
1085 def get_file_sd_config(self):
1086 servers = self.list_servers()
1087 targets = []
1088 for server in servers:
1089 hostname = server.get('hostname', '')
1090 for service in server.get('services', []):
1091 if service['type'] != 'mgr':
1092 continue
1093 id_ = service['id']
1094 # get port for prometheus module at mgr with id_
1095 # TODO use get_config_prefix or get_config here once
1096 # https://github.com/ceph/ceph/pull/20458 is merged
1097 result = CommandResult("")
f6b5b4d7
TL
1098 assert isinstance(_global_instance, Module)
1099 _global_instance.send_command(
11fdf7f2
TL
1100 result, "mon", '',
1101 json.dumps({
1102 "prefix": "config-key get",
1103 'key': "config/mgr/mgr/prometheus/{}/server_port".format(id_),
1104 }),
1105 "")
1106 r, outb, outs = result.wait()
1107 if r != 0:
f6b5b4d7 1108 _global_instance.log.error("Failed to retrieve port for mgr {}: {}".format(id_, outs))
11fdf7f2
TL
1109 targets.append('{}:{}'.format(hostname, DEFAULT_PORT))
1110 else:
1111 port = json.loads(outb)
1112 targets.append('{}:{}'.format(hostname, port))
1113
1114 ret = [
1115 {
1116 "targets": targets,
1117 "labels": {}
1118 }
1119 ]
1120 return 0, json.dumps(ret), ""
1121
1122 def self_test(self):
1123 self.collect()
1124 self.get_file_sd_config()
1125
1126 def handle_command(self, inbuf, cmd):
1127 if cmd['prefix'] == 'prometheus file_sd_config':
1128 return self.get_file_sd_config()
3efd9988
FG
1129 else:
1130 return (-errno.EINVAL, '',
1131 "Command not found '{0}'".format(cmd['prefix']))
c07f9fc5
FG
1132
    def serve(self):
        """Entry point of the active module: configure and start the CherryPy
        HTTP server, then block until shutdown() sets the shutdown event.

        Requests to /metrics serve (and police the freshness of)
        self.collect_cache; collection itself happens elsewhere.
        """

        class Root(object):

            # collapse everything to '/'
            def _cp_dispatch(self, vpath):
                cherrypy.request.path = ''
                return self

            @cherrypy.expose
            def index(self):
                # Minimal landing page linking to the metrics endpoint.
                return '''<!DOCTYPE html>
<html>
    <head><title>Ceph Exporter</title></head>
    <body>
        <h1>Ceph Exporter</h1>
        <p><a href='/metrics'>Metrics</a></p>
    </body>
</html>'''

            @cherrypy.expose
            def metrics(self):
                # Lock the function execution: collect_lock serializes
                # readers against updates of the cache.
                assert isinstance(_global_instance, Module)
                with _global_instance.collect_lock:
                    return self._metrics(_global_instance)

            @staticmethod
            def _metrics(instance):
                # type: (Module) -> Any
                # Return cached data if available
                if not instance.collect_cache:
                    raise cherrypy.HTTPError(503, 'No cached data available yet')

                def respond():
                    # Serve the cached exposition text as plain text.
                    assert isinstance(instance, Module)
                    cherrypy.response.headers['Content-Type'] = 'text/plain'
                    return instance.collect_cache

                if instance.collect_time < instance.scrape_interval:
                    # Respond if cache isn't stale
                    return respond()

                if instance.stale_cache_strategy == instance.STALE_CACHE_RETURN:
                    # Respond even if cache is stale
                    instance.log.info(
                        'Gathering data took {:.2f} seconds, metrics are stale for {:.2f} seconds, '
                        'returning metrics from stale cache.'.format(
                            instance.collect_time,
                            instance.collect_time - instance.scrape_interval
                        )
                    )
                    return respond()

                if instance.stale_cache_strategy == instance.STALE_CACHE_FAIL:
                    # Fail if cache is stale
                    msg = (
                        'Gathering data took {:.2f} seconds, metrics are stale for {:.2f} seconds, '
                        'returning "service unavailable".'.format(
                            instance.collect_time,
                            instance.collect_time - instance.scrape_interval,
                        )
                    )
                    instance.log.error(msg)
                    raise cherrypy.HTTPError(503, msg)

        # Make the cache timeout for collecting configurable
        self.scrape_interval = float(self.get_localized_module_option('scrape_interval', 15.0))

        # Stale-cache policy: any value other than the two recognized
        # strategies (including the 'log' default) falls back to FAIL.
        self.stale_cache_strategy = self.get_localized_module_option('stale_cache_strategy', 'log')
        if self.stale_cache_strategy not in [self.STALE_CACHE_FAIL,
                                             self.STALE_CACHE_RETURN]:
            self.stale_cache_strategy = self.STALE_CACHE_FAIL

        server_addr = self.get_localized_module_option(
            'server_addr', get_default_addr())
        server_port = self.get_localized_module_option(
            'server_port', DEFAULT_PORT)
        self.log.info(
            "server_addr: %s server_port: %s" %
            (server_addr, server_port)
        )

        # Publish the URI that others may use to access the service we're
        # about to start serving
        self.set_uri('http://{0}:{1}/'.format(
            socket.getfqdn() if server_addr in ['::', '0.0.0.0'] else server_addr,
            server_port
        ))

        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })
        cherrypy.tree.mount(Root(), "/")
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info('Engine started.')
        # wait for the shutdown event
        self.shutdown_event.wait()
        self.shutdown_event.clear()
        cherrypy.engine.stop()
        self.log.info('Engine stopped.')
        self.shutdown_rbd_stats()
94b18763
FG
1238
1239 def shutdown(self):
1240 self.log.info('Stopping engine...')
91327a77 1241 self.shutdown_event.set()
94b18763
FG
1242
1243
class StandbyModule(MgrStandbyModule):
    """Endpoint served while this mgr is in standby.

    /metrics returns an empty payload; the index page links to the active
    mgr's exporter URI instead.
    """

    def __init__(self, *args, **kwargs):
        super(StandbyModule, self).__init__(*args, **kwargs)
        # Set by shutdown() to make serve() stop the web server.
        self.shutdown_event = threading.Event()

    def serve(self):
        """Run a minimal CherryPy server until shutdown is requested."""
        server_addr = self.get_localized_module_option(
            'server_addr', get_default_addr())
        server_port = self.get_localized_module_option(
            'server_port', DEFAULT_PORT)
        self.log.info("server_addr: %s server_port: %s" %
                      (server_addr, server_port))
        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })

        # Close over the module instance for the handlers below.
        mgr = self

        class Root(object):
            @cherrypy.expose
            def index(self):
                # Point humans at the active mgr's exporter.
                active_uri = mgr.get_active_uri()
                return '''<!DOCTYPE html>
<html>
    <head><title>Ceph Exporter</title></head>
    <body>
        <h1>Ceph Exporter</h1>
        <p><a href='{}metrics'>Metrics</a></p>
    </body>
</html>'''.format(active_uri)

            @cherrypy.expose
            def metrics(self):
                # A standby has nothing to report.
                cherrypy.response.headers['Content-Type'] = 'text/plain'
                return ''

        cherrypy.tree.mount(Root(), '/', {})
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info('Engine started.')
        # Block until shutdown() fires the event, then tear down.
        self.shutdown_event.wait()
        self.shutdown_event.clear()
        cherrypy.engine.stop()
        self.log.info('Engine stopped.')

    def shutdown(self):
        self.log.info("Stopping engine...")
        self.shutdown_event.set()
        self.log.info("Stopped engine")