]> git.proxmox.com Git - ceph.git/blame - ceph/src/pybind/mgr/prometheus/module.py
import quincy 17.2.0
[ceph.git] / ceph / src / pybind / mgr / prometheus / module.py
CommitLineData
c07f9fc5 1import cherrypy
adb31ebb 2from collections import defaultdict
a8e16298 3from distutils.version import StrictVersion
3efd9988 4import json
c07f9fc5
FG
5import math
6import os
11fdf7f2 7import re
91327a77
AA
8import threading
9import time
20effc67
TL
10import enum
11from mgr_module import CLIReadCommand, MgrModule, MgrStandbyModule, PG_STATES, Option, ServiceInfoT, HandleCommandResult, CLIWriteCommand
522d829b 12from mgr_util import get_default_addr, profile_method, build_url
11fdf7f2 13from rbd import RBD
adb31ebb 14from collections import namedtuple
20effc67 15import yaml
522d829b 16
20effc67
TL
17from typing import DefaultDict, Optional, Dict, Any, Set, cast, Tuple, Union, List, Callable
18
# Type aliases used throughout the metric plumbing: a metric sample is keyed
# by its tuple of label values and stores a numeric value.
LabelValues = Tuple[str, ...]
Number = Union[int, float]
MetricValue = Dict[LabelValues, Number]
c07f9fc5
FG
22
# Defaults for the Prometheus HTTP server. Can also set in config-key
# see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
# for Prometheus exporter port registry

DEFAULT_PORT = 9283

# When the CherryPy server in 3.2.2 (and later) starts it attempts to verify
# that the ports its listening on are in fact bound. When using the any address
# "::" it tries both ipv4 and ipv6, and in some environments (e.g. kubernetes)
# ipv6 isn't yet configured / supported and CherryPy throws an uncaught
# exception.
if cherrypy is not None:
    v = StrictVersion(cherrypy.__version__)
    # the issue was fixed in 3.2.3. it's present in 3.2.2 (current version on
    # centos:7) and back to at least 3.0.0.
    if StrictVersion("3.1.2") <= v < StrictVersion("3.2.3"):
        # https://github.com/cherrypy/cherrypy/issues/1100
        # Monkey-patch the port check into a no-op for the affected versions.
        from cherrypy.process import servers
        servers.wait_for_occupied_port = lambda host, port: None
c07f9fc5 42
9f95a23c 43
# cherrypy likes to sys.exit on error. don't let it take us down too!
def os_exit_noop(status: int) -> None:
    """No-op stand-in for os._exit so cherrypy failures cannot kill the mgr process."""
    pass


os._exit = os_exit_noop  # type: ignore
c07f9fc5 50
c07f9fc5
FG
# to access things in class Module from subclass Root. Because
# it's a dict, the writer doesn't need to declare 'global' for access

_global_instance = None  # type: Optional[Module]

# Identify this exporter in the HTTP Server response header instead of
# leaking the CherryPy version string.
cherrypy.config.update({
    'response.headers.server': 'Ceph-Prometheus'
})
c07f9fc5
FG
58
59
def health_status_to_number(status: str) -> int:
    """Map a ceph health status string to its numeric metric value.

    Raises ValueError for any status outside the three known states.
    """
    mapping = {
        'HEALTH_OK': 0,
        'HEALTH_WARN': 1,
        'HEALTH_ERR': 2,
    }
    if status not in mapping:
        raise ValueError(f'unknown status "{status}"')
    return mapping[status]
c07f9fc5 68
11fdf7f2
TL
69
# cluster-wide `df` fields, exported as ceph_cluster_<name>
DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_used_raw_bytes']

# per-pool `df` fields, exported as ceph_pool_<name> with a pool_id label
DF_POOL = ['max_avail', 'avail_raw', 'stored', 'stored_raw', 'objects', 'dirty',
           'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes',
           'compress_bytes_used', 'compress_under_bytes', 'bytes_used', 'percent_used']
c07f9fc5 75
11fdf7f2
TL
# Per-pool recovery statistics (from osd_pool_stats), exported as
# ceph_pool_<name>. The last entry was previously an accidental duplicate of
# 'num_bytes_recovered'; it should be 'num_keys_recovered' to match the
# recovering_keys_per_sec rate above (missing fields default to 0 at read
# time via recovery_rate.get(stat, 0)).
OSD_POOL_STATS = ('recovering_objects_per_sec', 'recovering_bytes_per_sec',
                  'recovering_keys_per_sec', 'num_objects_recovered',
                  'num_bytes_recovered', 'num_keys_recovered')
79
94b18763
FG
# osd flags exported as individual ceph_osd_flag_<name> metrics
OSD_FLAGS = ('noup', 'nodown', 'noout', 'noin', 'nobackfill', 'norebalance',
             'norecover', 'noscrub', 'nodeep-scrub')

# label names for the corresponding *_metadata metrics below
FS_METADATA = ('data_pools', 'fs_id', 'metadata_pool', 'name')

MDS_METADATA = ('ceph_daemon', 'fs_id', 'hostname', 'public_addr', 'rank',
                'ceph_version')

MON_METADATA = ('ceph_daemon', 'hostname',
                'public_addr', 'rank', 'ceph_version')

MGR_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

MGR_STATUS = ('ceph_daemon',)

MGR_MODULE_STATUS = ('name',)

MGR_MODULE_CAN_RUN = ('name',)

OSD_METADATA = ('back_iface', 'ceph_daemon', 'cluster_addr', 'device_class',
                'front_iface', 'hostname', 'objectstore', 'public_addr',
                'ceph_version')

# per-OSD status and stat fields, labelled by ceph_daemon
OSD_STATUS = ['weight', 'up', 'in']

OSD_STATS = ['apply_latency_ms', 'commit_latency_ms']

POOL_METADATA = ('pool_id', 'name', 'type', 'description', 'compression_mode')

RGW_METADATA = ('ceph_daemon', 'hostname', 'ceph_version', 'instance_id')

RBD_MIRROR_METADATA = ('ceph_daemon', 'id', 'instance_id', 'hostname',
                       'ceph_version')

# label names deliberately kept compatible with the Prometheus node_exporter
# so that ceph_disk_occupation can be joined against node metrics
DISK_OCCUPATION = ('ceph_daemon', 'device', 'db_device',
                   'wal_device', 'instance', 'devices', 'device_ids')

NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']
c07f9fc5 118
adb31ebb
TL
# (name, description) records for healthchecks that get a dedicated
# ceph_healthcheck_<name> gauge carrying a value parsed from the check message
alert_metric = namedtuple('alert_metric', 'name description')
HEALTH_CHECKS = [
    alert_metric('SLOW_OPS', 'OSD or Monitor requests taking a long time to process'),
]

# label names for the ceph_health_detail metric
HEALTHCHECK_DETAIL = ('name', 'severity')
125
126
class Severity(enum.Enum):
    """Healthcheck severities, matching the strings the cluster reports."""
    ok = "HEALTH_OK"
    warn = "HEALTH_WARN"
    error = "HEALTH_ERR"
131
132
class Format(enum.Enum):
    """Output formats accepted by the healthcheck history CLI commands."""
    plain = 'plain'
    json = 'json'
    json_pretty = 'json-pretty'
    yaml = 'yaml'
138
139
class HealthCheckEvent:
    """One tracked healthcheck: identity, severity and occurrence bookkeeping."""

    def __init__(self, name: str, severity: Severity, first_seen: float, last_seen: float, count: int, active: bool = True):
        self.name = name
        self.severity = severity  # NOTE: arrives as a plain string when loaded from JSON
        self.first_seen = first_seen  # epoch timestamp of first occurrence
        self.last_seen = last_seen  # epoch timestamp of last recorded occurrence
        self.count = count  # number of times this check has fired
        self.active = active  # whether the check is currently raised

    def as_dict(self) -> Dict[str, Any]:
        """Return the instance as a dictionary (the live __dict__, not a copy)."""
        return self.__dict__
153
154
class HealthHistory:
    """Track and persist the history of cluster healthcheck events.

    State is kept in the mon KV store under mgr/prometheus/health_history so
    that the history survives mgr restarts and failover.
    """

    kv_name = 'health_history'
    titles = "{healthcheck_name:<24} {first_seen:<20} {last_seen:<20} {count:>5} {active:^6}"
    date_format = "%Y/%m/%d %H:%M:%S"

    def __init__(self, mgr: MgrModule):
        self.mgr = mgr
        self.lock = threading.Lock()
        self.healthcheck: Dict[str, HealthCheckEvent] = {}
        self._load()

    def _load(self) -> None:
        """Load the current state from the mons KV store."""
        data = self.mgr.get_store(self.kv_name)
        if not data:
            self.reset()
            return
        try:
            healthcheck_data = json.loads(data)
        except json.JSONDecodeError:
            # use warning(): Logger.warn() is a deprecated alias
            self.mgr.log.warning(
                f"INVALID data read from mgr/prometheus/{self.kv_name}. Resetting")
            self.reset()
            return
        for k, v in healthcheck_data.items():
            self.healthcheck[k] = HealthCheckEvent(
                name=k,
                severity=v.get('severity'),
                first_seen=v.get('first_seen', 0),
                last_seen=v.get('last_seen', 0),
                count=v.get('count', 1),
                active=v.get('active', True))

    def reset(self) -> None:
        """Reset the healthcheck history."""
        with self.lock:
            self.mgr.set_store(self.kv_name, "{}")
            self.healthcheck = {}

    def save(self) -> None:
        """Save the current in-memory healthcheck history to the KV store."""
        with self.lock:
            self.mgr.set_store(self.kv_name, self.as_json())

    def check(self, health_checks: Dict[str, Any]) -> None:
        """Look at the current health checks and compare existing the history.

        Args:
            health_checks (Dict[str, Any]): current health check data
        """

        current_checks = health_checks.get('checks', {})
        changes_made = False

        # first turn off any active states we're tracking that are no longer
        # reported by the cluster
        for seen_check in self.healthcheck:
            check = self.healthcheck[seen_check]
            if check.active and seen_check not in current_checks:
                check.active = False
                changes_made = True

        # now look for any additions to track
        now = time.time()
        for name, info in current_checks.items():
            if name not in self.healthcheck:
                # this healthcheck is new, so start tracking it
                changes_made = True
                self.healthcheck[name] = HealthCheckEvent(
                    name=name,
                    severity=info.get('severity'),
                    first_seen=now,
                    last_seen=now,
                    count=1,
                    active=True
                )
            else:
                # seen it before, so update its metadata
                check = self.healthcheck[name]
                if check.active:
                    # already registered as active; note that last_seen is
                    # deliberately not refreshed while a check stays active
                    continue
                check.last_seen = now
                check.count += 1
                check.active = True
                changes_made = True

        if changes_made:
            self.save()

    def __str__(self) -> str:
        """Print the healthcheck history.

        Returns:
            str: Human readable representation of the healthcheck history
        """
        out = []

        if not self.healthcheck:
            out.append("No healthchecks have been recorded")
        else:
            out.append(self.titles.format(
                healthcheck_name="Healthcheck Name",
                first_seen="First Seen (UTC)",
                last_seen="Last seen (UTC)",
                count="Count",
                active="Active")
            )
            for k in sorted(self.healthcheck.keys()):
                check = self.healthcheck[k]
                out.append(self.titles.format(
                    healthcheck_name=check.name,
                    first_seen=time.strftime(self.date_format, time.localtime(check.first_seen)),
                    last_seen=time.strftime(self.date_format, time.localtime(check.last_seen)),
                    count=check.count,
                    active="Yes" if check.active else "No")
                )
            out.extend([f"{len(self.healthcheck)} health check(s) listed", ""])

        return "\n".join(out)

    def as_dict(self) -> Dict[str, Any]:
        """Return the history in a dictionary.

        Returns:
            Dict[str, Any]: dictionary indexed by the healthcheck name
        """
        return {name: self.healthcheck[name].as_dict() for name in self.healthcheck}

    def as_json(self, pretty: bool = False) -> str:
        """Return the healthcheck history object as a dict (JSON).

        Args:
            pretty (bool, optional): whether to json pretty print the history. Defaults to False.

        Returns:
            str: str representation of the healthcheck in JSON format
        """
        if pretty:
            return json.dumps(self.as_dict(), indent=2)
        return json.dumps(self.as_dict())

    def as_yaml(self) -> str:
        """Return the healthcheck history in yaml format.

        Returns:
            str: YAML representation of the healthcheck history
        """
        return yaml.safe_dump(self.as_dict(), explicit_start=True, default_flow_style=False)
306
c07f9fc5 307
class Metric(object):
    """A single Prometheus metric: type, name, help text and labelled samples."""

    def __init__(self, mtype: str, name: str, desc: str, labels: Optional[LabelValues] = None) -> None:
        self.mtype = mtype  # prometheus metric type: 'gauge', 'counter', 'untyped', ...
        self.name = name  # metric name without the 'ceph_' prefix
        self.desc = desc  # HELP text emitted in the exposition output
        self.labelnames = labels  # tuple if present
        self.value: Dict[LabelValues, Number] = {}

    def clear(self) -> None:
        """Drop all recorded samples."""
        self.value = {}

    def set(self, value: Number, labelvalues: Optional[LabelValues] = None) -> None:
        """Record one sample; samples are keyed by their label-value tuple."""
        # labelvalues must be a tuple
        labelvalues = labelvalues or ('',)
        self.value[labelvalues] = value

    def str_expfmt(self) -> str:
        """Render this metric in the Prometheus text exposition format."""

        def promethize(path: str) -> str:
            ''' replace illegal metric name characters '''
            result = re.sub(r'[./\s]|::', '_', path).replace('+', '_plus')

            # Hyphens usually turn into underscores, unless they are
            # trailing
            if result.endswith("-"):
                result = result[0:-1] + "_minus"
            else:
                result = result.replace("-", "_")

            return "ceph_{0}".format(result)

        def floatstr(value: float) -> str:
            ''' represent as Go-compatible float '''
            if value == float('inf'):
                return '+Inf'
            if value == float('-inf'):
                return '-Inf'
            if math.isnan(value):
                return 'NaN'
            return repr(float(value))

        name = promethize(self.name)
        # HELP/TYPE header, then one line per labelled sample
        expfmt = '''
# HELP {name} {desc}
# TYPE {name} {mtype}'''.format(
            name=name,
            desc=self.desc,
            mtype=self.mtype,
        )

        for labelvalues, value in self.value.items():
            if self.labelnames:
                labels_list = zip(self.labelnames, labelvalues)
                labels = ','.join('%s="%s"' % (k, v) for k, v in labels_list)
            else:
                labels = ''
            if labels:
                fmtstr = '\n{name}{{{labels}}} {value}'
            else:
                fmtstr = '\n{name} {value}'
            expfmt += fmtstr.format(
                name=name,
                labels=labels,
                value=floatstr(value),
            )
        return expfmt

    def group_by(
        self,
        keys: List[str],
        joins: Dict[str, Callable[[List[str]], str]],
        name: Optional[str] = None,
    ) -> "Metric":
        """
        Groups data by label names.

        Label names not passed are being removed from the resulting metric but
        by providing a join function, labels of metrics can be grouped.

        The purpose of this method is to provide a version of a metric that can
        be used in matching where otherwise multiple results would be returned.

        As grouping is possible in Prometheus, the only additional value of this
        method is the possibility to join labels when grouping. For that reason,
        passing joins is required. Please use PromQL expressions in all other
        cases.

        >>> m = Metric('type', 'name', '', labels=('label1', 'id'))
        >>> m.value = {
        ...     ('foo', 'x'): 1,
        ...     ('foo', 'y'): 1,
        ... }
        >>> m.group_by(['label1'], {'id': lambda ids: ','.join(ids)}).value
        {('foo', 'x,y'): 1}

        The functionality of group by could roughly be compared with Prometheus'

            group (ceph_disk_occupation) by (device, instance)

        with the exception that not all labels which aren't used as a condition
        to group a metric are discarded, but their values can are joined and the
        label is thereby preserved.

        This function takes the value of the first entry of a found group to be
        used for the resulting value of the grouping operation.

        >>> m = Metric('type', 'name', '', labels=('label1', 'id'))
        >>> m.value = {
        ...     ('foo', 'x'): 555,
        ...     ('foo', 'y'): 10,
        ... }
        >>> m.group_by(['label1'], {'id': lambda ids: ','.join(ids)}).value
        {('foo', 'x,y'): 555}
        """
        assert self.labelnames, "cannot match keys without label names"
        for key in keys:
            assert key in self.labelnames, "unknown key: {}".format(key)
        assert joins, "joins must not be empty"
        assert all(callable(c) for c in joins.values()), "joins must be callable"

        # group: bucket samples by the tuple of values of the grouping keys
        grouped: Dict[LabelValues, List[Tuple[Dict[str, str], Number]]] = defaultdict(list)
        for label_values, metric_value in self.value.items():
            labels = dict(zip(self.labelnames, label_values))
            if not all(k in labels for k in keys):
                continue
            group_key = tuple(labels[k] for k in keys)
            grouped[group_key].append((labels, metric_value))

        # as there is nothing specified on how to join labels that are not equal
        # and Prometheus `group` aggregation functions similarly, we simply drop
        # those labels.
        labelnames = tuple(
            label for label in self.labelnames if label in keys or label in joins
        )
        superfluous_labelnames = [
            label for label in self.labelnames if label not in labelnames
        ]

        # iterate and convert groups with more than one member into a single
        # entry
        values: MetricValue = {}
        for group in grouped.values():
            # the first member's value wins for the whole group (see docstring)
            labels, metric_value = group[0]

            for label in superfluous_labelnames:
                del labels[label]

            if len(group) > 1:
                for key, fn in joins.items():
                    labels[key] = fn(list(labels[key] for labels, _ in group))

            values[tuple(labels.values())] = metric_value

        new_metric = Metric(self.mtype, name if name else self.name, self.desc, labelnames)
        new_metric.value = values

        return new_metric
466
91327a77 467
f67539c2
TL
class MetricCounter(Metric):
    """A monotonically increasing metric; samples accumulate via add()."""

    def __init__(self,
                 name: str,
                 desc: str,
                 labels: Optional[LabelValues] = None) -> None:
        super(MetricCounter, self).__init__('counter', name, desc, labels)
        # defaultdict so the first add() for any label set starts from zero
        self.value = defaultdict(lambda: 0)

    def clear(self) -> None:
        # Counters must never reset: keep accumulated totals across scrapes.
        pass

    def set(self,
            value: Number,
            labelvalues: Optional[LabelValues] = None) -> None:
        # Direct assignment would break counter monotonicity; force add().
        raise NotImplementedError(
            'This method must not be used for instances of MetricCounter class')

    def add(self,
            value: Number,
            labelvalues: Optional[LabelValues] = None) -> None:
        """Accumulate `value` onto the sample identified by `labelvalues`."""
        # labelvalues must be a tuple
        key = labelvalues or ('',)
        self.value[key] += value
491
492
class MetricCollectionThread(threading.Thread):
    """Background thread that refreshes the module's metric cache every scrape interval."""

    def __init__(self, module: 'Module') -> None:
        self.mod = module
        self.active = True
        # event doubles as an interruptible sleep and a shutdown signal
        self.event = threading.Event()
        super(MetricCollectionThread, self).__init__(target=self.collect)

    def collect(self) -> None:
        """Main loop: collect, time the collection, publish into the cache, sleep."""
        self.mod.log.info('starting metric collection thread')
        while self.active:
            self.mod.log.debug('collecting cache in thread')
            if self.mod.have_mon_connection():
                start_time = time.time()

                try:
                    data = self.mod.collect()
                except Exception:
                    # Log any issues encountered during the data collection and continue
                    self.mod.log.exception("failed to collect metrics:")
                    self.event.wait(self.mod.scrape_interval)
                    continue

                duration = time.time() - start_time
                self.mod.log.debug('collecting cache in thread done')

                # sleep only for the remainder of the interval; warn when a
                # collection overruns it (data served will be stale)
                sleep_time = self.mod.scrape_interval - duration
                if sleep_time < 0:
                    self.mod.log.warning(
                        'Collecting data took more time than configured scrape interval. '
                        'This possibly results in stale data. Please check the '
                        '`stale_cache_strategy` configuration option. '
                        'Collecting data took {:.2f} seconds but scrape interval is configured '
                        'to be {:.0f} seconds.'.format(
                            duration,
                            self.mod.scrape_interval,
                        )
                    )
                    sleep_time = 0

                # publish the fresh result under the module's collect lock
                with self.mod.collect_lock:
                    self.mod.collect_cache = data
                    self.mod.collect_time = duration

                self.event.wait(sleep_time)
            else:
                self.mod.log.error('No MON connection')
                self.event.wait(self.mod.scrape_interval)

    def stop(self) -> None:
        """Request loop exit and wake the thread if it is sleeping."""
        self.active = False
        self.event.set()
f6b5b4d7 544
91327a77 545
f67539c2 546class Module(MgrModule):
    # Module configuration surfaced via `ceph config`.
    MODULE_OPTIONS = [
        Option(
            'server_addr',
            default=get_default_addr(),
            desc='the IPv4 or IPv6 address on which the module listens for HTTP requests',
        ),
        Option(
            'server_port',
            type='int',
            default=DEFAULT_PORT,
            desc='the port on which the module listens for HTTP requests'
        ),
        Option(
            'scrape_interval',
            type='float',
            default=15.0
        ),
        Option(
            'stale_cache_strategy',
            default='log'
        ),
        Option(
            'cache',
            type='bool',
            default=True,
        ),
        Option(
            'rbd_stats_pools',
            default=''
        ),
        Option(
            name='rbd_stats_pools_refresh_interval',
            type='int',
            default=300
        ),
        Option(
            name='standby_behaviour',
            type='str',
            default='default',
            enum_allowed=['default', 'error'],
            runtime=True
        ),
        Option(
            name='standby_error_status_code',
            type='int',
            default=500,
            min=400,
            max=599,
            runtime=True
        )
    ]

    # recognised non-default values of the stale_cache_strategy option
    STALE_CACHE_FAIL = 'fail'
    STALE_CACHE_RETURN = 'return'
601
    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Initialise metric state, cache fields and the collection thread."""
        super(Module, self).__init__(*args, **kwargs)
        self.metrics = self._setup_static_metrics()
        self.shutdown_event = threading.Event()
        self.collect_lock = threading.Lock()
        self.collect_time = 0.0
        # initial defaults for the cache behaviour; presumably refreshed from
        # MODULE_OPTIONS when configuration is (re)read — TODO confirm
        self.scrape_interval: float = 15.0
        self.cache = True
        self.stale_cache_strategy: str = self.STALE_CACHE_FAIL
        self.collect_cache: Optional[str] = None
        # per-image RBD perf counter bookkeeping and counter descriptions
        self.rbd_stats = {
            'pools': {},
            'pools_refresh_time': 0,
            'counters_info': {
                'write_ops': {'type': self.PERFCOUNTER_COUNTER,
                              'desc': 'RBD image writes count'},
                'read_ops': {'type': self.PERFCOUNTER_COUNTER,
                             'desc': 'RBD image reads count'},
                'write_bytes': {'type': self.PERFCOUNTER_COUNTER,
                                'desc': 'RBD image bytes written'},
                'read_bytes': {'type': self.PERFCOUNTER_COUNTER,
                               'desc': 'RBD image bytes read'},
                'write_latency': {'type': self.PERFCOUNTER_LONGRUNAVG,
                                  'desc': 'RBD image writes latency (msec)'},
                'read_latency': {'type': self.PERFCOUNTER_LONGRUNAVG,
                                 'desc': 'RBD image reads latency (msec)'},
            },
        }  # type: Dict[str, Any]
        global _global_instance
        _global_instance = self
        self.metrics_thread = MetricCollectionThread(_global_instance)
        self.health_history = HealthHistory(self)
3efd9988 634
    def _setup_static_metrics(self) -> Dict[str, Metric]:
        """Build the dict of all statically-declared metrics, keyed by metric name."""
        metrics = {}
        metrics['health_status'] = Metric(
            'untyped',
            'health_status',
            'Cluster health status'
        )
        metrics['mon_quorum_status'] = Metric(
            'gauge',
            'mon_quorum_status',
            'Monitors in quorum',
            ('ceph_daemon',)
        )
        metrics['fs_metadata'] = Metric(
            'untyped',
            'fs_metadata',
            'FS Metadata',
            FS_METADATA
        )
        metrics['mds_metadata'] = Metric(
            'untyped',
            'mds_metadata',
            'MDS Metadata',
            MDS_METADATA
        )
        metrics['mon_metadata'] = Metric(
            'untyped',
            'mon_metadata',
            'MON Metadata',
            MON_METADATA
        )
        metrics['mgr_metadata'] = Metric(
            'gauge',
            'mgr_metadata',
            'MGR metadata',
            MGR_METADATA
        )
        metrics['mgr_status'] = Metric(
            'gauge',
            'mgr_status',
            'MGR status (0=standby, 1=active)',
            MGR_STATUS
        )
        metrics['mgr_module_status'] = Metric(
            'gauge',
            'mgr_module_status',
            'MGR module status (0=disabled, 1=enabled, 2=auto-enabled)',
            MGR_MODULE_STATUS
        )
        metrics['mgr_module_can_run'] = Metric(
            'gauge',
            'mgr_module_can_run',
            'MGR module runnable state i.e. can it run (0=no, 1=yes)',
            MGR_MODULE_CAN_RUN
        )
        metrics['osd_metadata'] = Metric(
            'untyped',
            'osd_metadata',
            'OSD Metadata',
            OSD_METADATA
        )

        # The reason for having this separate to OSD_METADATA is
        # so that we can stably use the same tag names that
        # the Prometheus node_exporter does
        metrics['disk_occupation'] = Metric(
            'untyped',
            'disk_occupation',
            'Associate Ceph daemon with disk used',
            DISK_OCCUPATION
        )

        metrics['disk_occupation_human'] = Metric(
            'untyped',
            'disk_occupation_human',
            'Associate Ceph daemon with disk used for displaying to humans,'
            ' not for joining tables (vector matching)',
            DISK_OCCUPATION,  # label names are automatically decimated on grouping
        )

        metrics['pool_metadata'] = Metric(
            'untyped',
            'pool_metadata',
            'POOL Metadata',
            POOL_METADATA
        )

        metrics['rgw_metadata'] = Metric(
            'untyped',
            'rgw_metadata',
            'RGW Metadata',
            RGW_METADATA
        )

        metrics['rbd_mirror_metadata'] = Metric(
            'untyped',
            'rbd_mirror_metadata',
            'RBD Mirror Metadata',
            RBD_MIRROR_METADATA
        )

        metrics['pg_total'] = Metric(
            'gauge',
            'pg_total',
            'PG Total Count per Pool',
            ('pool_id',)
        )

        metrics['health_detail'] = Metric(
            'gauge',
            'health_detail',
            'healthcheck status by type (0=inactive, 1=active)',
            HEALTHCHECK_DETAIL
        )

        # one untyped metric per OSD flag
        for flag in OSD_FLAGS:
            path = 'osd_flag_{}'.format(flag)
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD Flag {}'.format(flag)
            )
        # per-daemon OSD status and stat gauges
        for state in OSD_STATUS:
            path = 'osd_{}'.format(state)
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD status {}'.format(state),
                ('ceph_daemon',)
            )
        for stat in OSD_STATS:
            path = 'osd_{}'.format(stat)
            metrics[path] = Metric(
                'gauge',
                path,
                'OSD stat {}'.format(stat),
                ('ceph_daemon',)
            )
        # per-pool recovery and PG-state gauges
        for stat in OSD_POOL_STATS:
            path = 'pool_{}'.format(stat)
            metrics[path] = Metric(
                'gauge',
                path,
                "OSD pool stats: {}".format(stat),
                ('pool_id',)
            )
        for state in PG_STATES:
            path = 'pg_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'PG {} per pool'.format(state),
                ('pool_id',)
            )
        for state in DF_CLUSTER:
            path = 'cluster_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'DF {}'.format(state),
            )
        for state in DF_POOL:
            path = 'pool_{}'.format(state)
            metrics[path] = Metric(
                # IO totals are counters; the rest are point-in-time gauges
                'counter' if state in ('rd', 'rd_bytes', 'wr', 'wr_bytes') else 'gauge',
                path,
                'DF pool {}'.format(state),
                ('pool_id',)
            )
        for state in NUM_OBJECTS:
            path = 'num_objects_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'Number of {} objects'.format(state),
            )

        # one gauge per specially-tracked healthcheck (see HEALTH_CHECKS)
        for check in HEALTH_CHECKS:
            path = 'healthcheck_{}'.format(check.name.lower())
            metrics[path] = Metric(
                'gauge',
                path,
                check.description,
            )

        return metrics
c07f9fc5 821
    @profile_method()
    def get_health(self) -> None:
        """Export overall cluster health plus per-healthcheck detail metrics."""

        def _get_value(message: str, delim: str = ' ', word_pos: int = 0) -> Tuple[int, int]:
            """Extract value from message (default is 1st field)"""
            v_str = message.split(delim)[word_pos]
            if v_str.isdigit():
                return int(v_str), 0
            # (value, error flag): non-numeric field signals a parse failure
            return 0, 1

        health = json.loads(self.get('health')['json'])
        # set overall health
        self.metrics['health_status'].set(
            health_status_to_number(health['status'])
        )

        # Examine the health to see if any health checks triggered need to
        # become a specific metric with a value from the health detail
        active_healthchecks = health.get('checks', {})
        active_names = active_healthchecks.keys()

        for check in HEALTH_CHECKS:
            path = 'healthcheck_{}'.format(check.name.lower())

            # the metric may already have been dropped on a previous scrape
            if path in self.metrics:

                if check.name in active_names:
                    check_data = active_healthchecks[check.name]
                    message = check_data['summary'].get('message', '')
                    v, err = 0, 0

                    if check.name == "SLOW_OPS":
                        # 42 slow ops, oldest one blocked for 12 sec, daemons [osd.0, osd.3] have
                        # slow ops.
                        v, err = _get_value(message)

                    if err:
                        self.log.error(
                            "healthcheck %s message format is incompatible and has been dropped",
                            check.name)
                        # drop the metric, so it's no longer emitted
                        del self.metrics[path]
                        continue
                    else:
                        self.metrics[path].set(v)
                else:
                    # health check is not active, so give it a default of 0
                    self.metrics[path].set(0)

        # update the persistent history and emit one health_detail sample per
        # known healthcheck (1 = currently active, 0 = seen before, now clear)
        self.health_history.check(health)
        for name, info in self.health_history.healthcheck.items():
            v = 1 if info.active else 0
            self.metrics['health_detail'].set(
                v, (
                    name,
                    str(info.severity))
            )
879
f6b5b4d7 880 @profile_method()
f67539c2 881 def get_pool_stats(self) -> None:
11fdf7f2
TL
882 # retrieve pool stats to provide per pool recovery metrics
883 # (osd_pool_stats moved to mgr in Mimic)
884 pstats = self.get('osd_pool_stats')
885 for pool in pstats['pool_stats']:
886 for stat in OSD_POOL_STATS:
887 self.metrics['pool_{}'.format(stat)].set(
888 pool['recovery_rate'].get(stat, 0),
889 (pool['pool_id'],)
890 )
891
f6b5b4d7 892 @profile_method()
f67539c2 893 def get_df(self) -> None:
3efd9988
FG
894 # maybe get the to-be-exported metrics from a config?
895 df = self.get('df')
896 for stat in DF_CLUSTER:
91327a77 897 self.metrics['cluster_{}'.format(stat)].set(df['stats'][stat])
3efd9988
FG
898
899 for pool in df['pools']:
900 for stat in DF_POOL:
91327a77
AA
901 self.metrics['pool_{}'.format(stat)].set(
902 pool['stats'][stat],
903 (pool['id'],)
904 )
94b18763 905
f6b5b4d7 906 @profile_method()
f67539c2 907 def get_fs(self) -> None:
94b18763
FG
908 fs_map = self.get('fs_map')
909 servers = self.get_service_list()
9f95a23c
TL
910 self.log.debug('standbys: {}'.format(fs_map['standbys']))
911 # export standby mds metadata, default standby fs_id is '-1'
912 for standby in fs_map['standbys']:
913 id_ = standby['name']
20effc67 914 host, version, _ = servers.get((id_, 'mds'), ('', '', ''))
f67539c2 915 addr, rank = standby['addr'], standby['rank']
9f95a23c
TL
916 self.metrics['mds_metadata'].set(1, (
917 'mds.{}'.format(id_), '-1',
f67539c2
TL
918 cast(str, host),
919 cast(str, addr),
920 cast(str, rank),
921 cast(str, version)
9f95a23c 922 ))
94b18763
FG
923 for fs in fs_map['filesystems']:
924 # collect fs metadata
11fdf7f2
TL
925 data_pools = ",".join([str(pool)
926 for pool in fs['mdsmap']['data_pools']])
91327a77
AA
927 self.metrics['fs_metadata'].set(1, (
928 data_pools,
929 fs['id'],
930 fs['mdsmap']['metadata_pool'],
931 fs['mdsmap']['fs_name']
932 ))
28e407b8 933 self.log.debug('mdsmap: {}'.format(fs['mdsmap']))
94b18763
FG
934 for gid, daemon in fs['mdsmap']['info'].items():
935 id_ = daemon['name']
20effc67 936 host, version, _ = servers.get((id_, 'mds'), ('', '', ''))
91327a77
AA
937 self.metrics['mds_metadata'].set(1, (
938 'mds.{}'.format(id_), fs['id'],
f67539c2
TL
939 host, daemon['addr'],
940 daemon['rank'], version
91327a77 941 ))
3efd9988 942
f6b5b4d7 943 @profile_method()
f67539c2 944 def get_quorum_status(self) -> None:
3efd9988 945 mon_status = json.loads(self.get('mon_status')['json'])
94b18763
FG
946 servers = self.get_service_list()
947 for mon in mon_status['monmap']['mons']:
948 rank = mon['rank']
949 id_ = mon['name']
20effc67 950 host_version = servers.get((id_, 'mon'), ('', '', ''))
91327a77
AA
951 self.metrics['mon_metadata'].set(1, (
952 'mon.{}'.format(id_), host_version[0],
f91f0fd5 953 mon['public_addr'].rsplit(':', 1)[0], rank,
91327a77
AA
954 host_version[1]
955 ))
94b18763 956 in_quorum = int(rank in mon_status['quorum'])
91327a77
AA
957 self.metrics['mon_quorum_status'].set(in_quorum, (
958 'mon.{}'.format(id_),
959 ))
3efd9988 960
f6b5b4d7 961 @profile_method()
f67539c2 962 def get_mgr_status(self) -> None:
494da23a
TL
963 mgr_map = self.get('mgr_map')
964 servers = self.get_service_list()
965
966 active = mgr_map['active_name']
967 standbys = [s.get('name') for s in mgr_map['standbys']]
968
969 all_mgrs = list(standbys)
970 all_mgrs.append(active)
971
f67539c2
TL
972 all_modules = {module.get('name'): module.get('can_run')
973 for module in mgr_map['available_modules']}
494da23a
TL
974
975 for mgr in all_mgrs:
20effc67 976 host, version, _ = servers.get((mgr, 'mgr'), ('', '', ''))
494da23a
TL
977 if mgr == active:
978 _state = 1
494da23a
TL
979 else:
980 _state = 0
801d1391 981
494da23a 982 self.metrics['mgr_metadata'].set(1, (
f67539c2 983 f'mgr.{mgr}', host, version
494da23a
TL
984 ))
985 self.metrics['mgr_status'].set(_state, (
f67539c2 986 f'mgr.{mgr}',))
adb31ebb 987 always_on_modules = mgr_map['always_on_modules'].get(self.release_name, [])
494da23a
TL
988 active_modules = list(always_on_modules)
989 active_modules.extend(mgr_map['modules'])
990
991 for mod_name in all_modules.keys():
992
993 if mod_name in always_on_modules:
994 _state = 2
995 elif mod_name in active_modules:
996 _state = 1
997 else:
998 _state = 0
999
1000 _can_run = 1 if all_modules[mod_name] else 0
1001 self.metrics['mgr_module_status'].set(_state, (mod_name,))
1002 self.metrics['mgr_module_can_run'].set(_can_run, (mod_name,))
1003
f6b5b4d7 1004 @profile_method()
f67539c2 1005 def get_pg_status(self) -> None:
94b18763 1006
92f5a8d4
TL
1007 pg_summary = self.get('pg_summary')
1008
1009 for pool in pg_summary['by_pool']:
adb31ebb 1010 num_by_state = defaultdict(int) # type: DefaultDict[str, int]
92f5a8d4 1011
801d1391 1012 for state_name, count in pg_summary['by_pool'][pool].items():
92f5a8d4 1013 for state in state_name.split('+'):
801d1391
TL
1014 num_by_state[state] += count
1015 num_by_state['total'] += count
1016
1017 for state, num in num_by_state.items():
1018 try:
1019 self.metrics["pg_{}".format(state)].set(num, (pool,))
1020 except KeyError:
e306af50 1021 self.log.warning("skipping pg in unknown state {}".format(state))
b32b8144 1022
f6b5b4d7 1023 @profile_method()
f67539c2 1024 def get_osd_stats(self) -> None:
b32b8144
FG
1025 osd_stats = self.get('osd_stats')
1026 for osd in osd_stats['osd_stats']:
1027 id_ = osd['osd']
1028 for stat in OSD_STATS:
94b18763 1029 val = osd['perf_stat'][stat]
91327a77
AA
1030 self.metrics['osd_{}'.format(stat)].set(val, (
1031 'osd.{}'.format(id_),
1032 ))
94b18763 1033
20effc67 1034 def get_service_list(self) -> Dict[Tuple[str, str], Tuple[str, str, str]]:
94b18763
FG
1035 ret = {}
1036 for server in self.list_servers():
f67539c2
TL
1037 version = cast(str, server.get('ceph_version', ''))
1038 host = cast(str, server.get('hostname', ''))
1039 for service in cast(List[ServiceInfoT], server.get('services', [])):
20effc67 1040 ret.update({(service['id'], service['type']): (host, version, service.get('name', ''))})
94b18763 1041 return ret
3efd9988 1042
    @profile_method()
    def get_metadata_and_osd_status(self) -> None:
        """Export OSD flag, metadata, status and disk-occupation metrics,
        plus pool/rgw/rbd-mirror metadata.

        OSDs with missing addresses or absent from the CRUSH map are skipped
        (with an info log) rather than emitted with incomplete labels.
        """
        osd_map = self.get('osd_map')
        osd_flags = osd_map['flags'].split(',')
        # One 0/1 gauge per well-known flag (noup, noout, ...).
        for flag in OSD_FLAGS:
            self.metrics['osd_flag_{}'.format(flag)].set(
                int(flag in osd_flags)
            )

        osd_devices = self.get('osd_map_crush')['devices']
        servers = self.get_service_list()
        for osd in osd_map['osds']:
            # id can be used to link osd metrics and metadata
            id_ = osd['osd']
            # collect osd metadata; addresses may include a port, keep host only
            p_addr = osd['public_addr'].rsplit(':', 1)[0]
            c_addr = osd['cluster_addr'].rsplit(':', 1)[0]
            if p_addr == "-" or c_addr == "-":
                self.log.info(
                    "Missing address metadata for osd {0}, skipping occupation"
                    " and metadata records for this osd".format(id_)
                )
                continue

            # Look up the device class (hdd/ssd/...) from the CRUSH map.
            dev_class = None
            for osd_device in osd_devices:
                if osd_device['id'] == id_:
                    dev_class = osd_device.get('class', '')
                    break

            if dev_class is None:
                self.log.info("OSD {0} is missing from CRUSH map, "
                              "skipping output".format(id_))
                continue

            host_version = servers.get((str(id_), 'osd'), ('', '', ''))

            # collect disk occupation metadata
            osd_metadata = self.get_metadata("osd", str(id_))
            if osd_metadata is None:
                continue

            obj_store = osd_metadata.get('osd_objectstore', '')
            f_iface = osd_metadata.get('front_iface', '')
            b_iface = osd_metadata.get('back_iface', '')

            self.metrics['osd_metadata'].set(1, (
                b_iface,
                'osd.{}'.format(id_),
                c_addr,
                dev_class,
                f_iface,
                host_version[0],
                obj_store,
                p_addr,
                host_version[1]
            ))

            # collect osd status (one 0/1 gauge per state in OSD_STATUS)
            for state in OSD_STATUS:
                status = osd[state]
                self.metrics['osd_{}'.format(state)].set(status, (
                    'osd.{}'.format(id_),
                ))

            # Resolve the backing device nodes; layout differs per objectstore.
            osd_dev_node = None
            osd_wal_dev_node = ''
            osd_db_dev_node = ''
            if obj_store == "filestore":
                # collect filestore backend device
                osd_dev_node = osd_metadata.get(
                    'backend_filestore_dev_node', None)
                # collect filestore journal device
                osd_wal_dev_node = osd_metadata.get('osd_journal', '')
                osd_db_dev_node = ''
            elif obj_store == "bluestore":
                # collect bluestore backend device
                osd_dev_node = osd_metadata.get(
                    'bluestore_bdev_dev_node', None)
                # collect bluestore wal backend
                osd_wal_dev_node = osd_metadata.get('bluefs_wal_dev_node', '')
                # collect bluestore db backend
                osd_db_dev_node = osd_metadata.get('bluefs_db_dev_node', '')
            if osd_dev_node and osd_dev_node == "unknown":
                osd_dev_node = None

            # fetch the devices and ids (vendor, model, serial) from the
            # osd_metadata
            osd_devs = osd_metadata.get('devices', '') or 'N/A'
            osd_dev_ids = osd_metadata.get('device_ids', '') or 'N/A'

            osd_hostname = osd_metadata.get('hostname', None)
            if osd_dev_node and osd_hostname:
                self.log.debug("Got dev for osd {0}: {1}/{2}".format(
                    id_, osd_hostname, osd_dev_node))
                self.metrics['disk_occupation'].set(1, (
                    "osd.{0}".format(id_),
                    osd_dev_node,
                    osd_db_dev_node,
                    osd_wal_dev_node,
                    osd_hostname,
                    osd_devs,
                    osd_dev_ids,
                ))
            else:
                self.log.info("Missing dev node metadata for osd {0}, skipping "
                              "occupation record for this osd".format(id_))

        # Derived metric: occupation grouped by device+instance with all
        # ceph_daemons joined into one label value.
        if 'disk_occupation' in self.metrics:
            try:
                self.metrics['disk_occupation_human'] = \
                    self.metrics['disk_occupation'].group_by(
                        ['device', 'instance'],
                        {'ceph_daemon': lambda daemons: ', '.join(daemons)},
                        name='disk_occupation_human',
                    )
            except Exception as e:
                self.log.error(e)

        ec_profiles = osd_map.get('erasure_code_profiles', {})

        def _get_pool_info(pool: Dict[str, Any]) -> Tuple[str, str]:
            # Classify a pool as replicated (type 1) or erasure (type 3) and
            # build a short human-readable description of its redundancy.
            pool_type = 'unknown'
            description = 'unknown'

            if pool['type'] == 1:
                pool_type = "replicated"
                description = f"replica:{pool['size']}"
            elif pool['type'] == 3:
                pool_type = "erasure"
                name = pool.get('erasure_code_profile', '')
                profile = ec_profiles.get(name, {})
                if profile:
                    description = f"ec:{profile['k']}+{profile['m']}"
                else:
                    description = "ec:unknown"

            return pool_type, description

        for pool in osd_map['pools']:

            compression_mode = 'none'
            pool_type, pool_description = _get_pool_info(pool)

            if 'options' in pool:
                compression_mode = pool['options'].get('compression_mode', 'none')

            self.metrics['pool_metadata'].set(
                1, (
                    pool['pool'],
                    pool['pool_name'],
                    pool_type,
                    pool_description,
                    compression_mode)
            )

        # Populate other servers metadata
        for key, value in servers.items():
            service_id, service_type = key
            if service_type == 'rgw':
                hostname, version, name = value
                self.metrics['rgw_metadata'].set(
                    1,
                    ('{}.{}'.format(service_type, name),
                     hostname, version, service_id)
                )
            elif service_type == 'rbd-mirror':
                mirror_metadata = self.get_metadata('rbd-mirror', service_id)
                if mirror_metadata is None:
                    continue
                mirror_metadata['ceph_daemon'] = '{}.{}'.format(service_type,
                                                                service_id)
                rbd_mirror_metadata = cast(LabelValues,
                                           (mirror_metadata.get(k, '')
                                            for k in RBD_MIRROR_METADATA))
                self.metrics['rbd_mirror_metadata'].set(
                    1, rbd_mirror_metadata
                )
3efd9988 1221
f6b5b4d7 1222 @profile_method()
f67539c2 1223 def get_num_objects(self) -> None:
28e407b8
AA
1224 pg_sum = self.get('pg_summary')['pg_stats_sum']['stat_sum']
1225 for obj in NUM_OBJECTS:
1226 stat = 'num_objects_{}'.format(obj)
91327a77 1227 self.metrics[stat].set(pg_sum[stat])
28e407b8 1228
f6b5b4d7 1229 @profile_method()
f67539c2 1230 def get_rbd_stats(self) -> None:
11fdf7f2
TL
1231 # Per RBD image stats is collected by registering a dynamic osd perf
1232 # stats query that tells OSDs to group stats for requests associated
1233 # with RBD objects by pool, namespace, and image id, which are
1234 # extracted from the request object names or other attributes.
1235 # The RBD object names have the following prefixes:
1236 # - rbd_data.{image_id}. (data stored in the same pool as metadata)
1237 # - rbd_data.{pool_id}.{image_id}. (data stored in a dedicated data pool)
1238 # - journal_data.{pool_id}.{image_id}. (journal if journaling is enabled)
1239 # The pool_id in the object name is the id of the pool with the image
1240 # metdata, and should be used in the image spec. If there is no pool_id
1241 # in the object name, the image pool is the pool where the object is
1242 # located.
1243
1244 # Parse rbd_stats_pools option, which is a comma or space separated
1245 # list of pool[/namespace] entries. If no namespace is specifed the
f6b5b4d7
TL
1246 # stats are collected for every namespace in the pool. The wildcard
1247 # '*' can be used to indicate all pools or namespaces
f67539c2 1248 pools_string = cast(str, self.get_localized_module_option('rbd_stats_pools'))
f6b5b4d7 1249 pool_keys = []
f67539c2 1250 for x in re.split(r'[\s,]+', pools_string):
f6b5b4d7
TL
1251 if not x:
1252 continue
1253
1254 s = x.split('/', 2)
11fdf7f2 1255 pool_name = s[0]
f6b5b4d7
TL
1256 namespace_name = None
1257 if len(s) == 2:
1258 namespace_name = s[1]
1259
1260 if pool_name == "*":
1261 # collect for all pools
1262 osd_map = self.get('osd_map')
1263 for pool in osd_map['pools']:
1264 if 'rbd' not in pool.get('application_metadata', {}):
1265 continue
1266 pool_keys.append((pool['pool_name'], namespace_name))
1267 else:
1268 pool_keys.append((pool_name, namespace_name))
1269
1270 pools = {} # type: Dict[str, Set[str]]
1271 for pool_key in pool_keys:
1272 pool_name = pool_key[0]
1273 namespace_name = pool_key[1]
1274 if not namespace_name or namespace_name == "*":
11fdf7f2
TL
1275 # empty set means collect for all namespaces
1276 pools[pool_name] = set()
1277 continue
f6b5b4d7 1278
11fdf7f2
TL
1279 if pool_name not in pools:
1280 pools[pool_name] = set()
1281 elif not pools[pool_name]:
1282 continue
f6b5b4d7 1283 pools[pool_name].add(namespace_name)
11fdf7f2
TL
1284
1285 rbd_stats_pools = {}
f6b5b4d7 1286 for pool_id in self.rbd_stats['pools'].keys():
11fdf7f2
TL
1287 name = self.rbd_stats['pools'][pool_id]['name']
1288 if name not in pools:
1289 del self.rbd_stats['pools'][pool_id]
1290 else:
1291 rbd_stats_pools[name] = \
1292 self.rbd_stats['pools'][pool_id]['ns_names']
1293
1294 pools_refreshed = False
1295 if pools:
1296 next_refresh = self.rbd_stats['pools_refresh_time'] + \
1297 self.get_localized_module_option(
1298 'rbd_stats_pools_refresh_interval', 300)
1299 if rbd_stats_pools != pools or time.time() >= next_refresh:
1300 self.refresh_rbd_stats_pools(pools)
1301 pools_refreshed = True
1302
1303 pool_ids = list(self.rbd_stats['pools'])
1304 pool_ids.sort()
1305 pool_id_regex = '^(' + '|'.join([str(x) for x in pool_ids]) + ')$'
1306
1307 nspace_names = []
1308 for pool_id, pool in self.rbd_stats['pools'].items():
1309 if pool['ns_names']:
1310 nspace_names.extend(pool['ns_names'])
1311 else:
1312 nspace_names = []
1313 break
1314 if nspace_names:
1315 namespace_regex = '^(' + \
1316 "|".join([re.escape(x)
1317 for x in set(nspace_names)]) + ')$'
1318 else:
1319 namespace_regex = '^(.*)$'
1320
f67539c2
TL
1321 if ('query' in self.rbd_stats
1322 and (pool_id_regex != self.rbd_stats['query']['key_descriptor'][0]['regex']
1323 or namespace_regex != self.rbd_stats['query']['key_descriptor'][1]['regex'])):
11fdf7f2
TL
1324 self.remove_osd_perf_query(self.rbd_stats['query_id'])
1325 del self.rbd_stats['query_id']
1326 del self.rbd_stats['query']
1327
1328 if not self.rbd_stats['pools']:
1329 return
1330
1331 counters_info = self.rbd_stats['counters_info']
1332
1333 if 'query_id' not in self.rbd_stats:
1334 query = {
1335 'key_descriptor': [
1336 {'type': 'pool_id', 'regex': pool_id_regex},
1337 {'type': 'namespace', 'regex': namespace_regex},
1338 {'type': 'object_name',
f67539c2 1339 'regex': r'^(?:rbd|journal)_data\.(?:([0-9]+)\.)?([^.]+)\.'},
11fdf7f2
TL
1340 ],
1341 'performance_counter_descriptors': list(counters_info),
1342 }
1343 query_id = self.add_osd_perf_query(query)
1344 if query_id is None:
1345 self.log.error('failed to add query %s' % query)
1346 return
1347 self.rbd_stats['query'] = query
1348 self.rbd_stats['query_id'] = query_id
1349
1350 res = self.get_osd_perf_counters(self.rbd_stats['query_id'])
f67539c2 1351 assert res
11fdf7f2
TL
1352 for c in res['counters']:
1353 # if the pool id is not found in the object name use id of the
1354 # pool where the object is located
1355 if c['k'][2][0]:
1356 pool_id = int(c['k'][2][0])
1357 else:
1358 pool_id = int(c['k'][0][0])
1359 if pool_id not in self.rbd_stats['pools'] and not pools_refreshed:
1360 self.refresh_rbd_stats_pools(pools)
1361 pools_refreshed = True
1362 if pool_id not in self.rbd_stats['pools']:
1363 continue
1364 pool = self.rbd_stats['pools'][pool_id]
1365 nspace_name = c['k'][1][0]
1366 if nspace_name not in pool['images']:
1367 continue
1368 image_id = c['k'][2][1]
1369 if image_id not in pool['images'][nspace_name] and \
1370 not pools_refreshed:
1371 self.refresh_rbd_stats_pools(pools)
1372 pool = self.rbd_stats['pools'][pool_id]
1373 pools_refreshed = True
1374 if image_id not in pool['images'][nspace_name]:
1375 continue
1376 counters = pool['images'][nspace_name][image_id]['c']
1377 for i in range(len(c['c'])):
1378 counters[i][0] += c['c'][i][0]
1379 counters[i][1] += c['c'][i][1]
1380
1381 label_names = ("pool", "namespace", "image")
1382 for pool_id, pool in self.rbd_stats['pools'].items():
1383 pool_name = pool['name']
1384 for nspace_name, images in pool['images'].items():
1385 for image_id in images:
1386 image_name = images[image_id]['n']
1387 counters = images[image_id]['c']
1388 i = 0
1389 for key in counters_info:
1390 counter_info = counters_info[key]
1391 stattype = self._stattype_to_str(counter_info['type'])
1392 labels = (pool_name, nspace_name, image_name)
1393 if counter_info['type'] == self.PERFCOUNTER_COUNTER:
1394 path = 'rbd_' + key
1395 if path not in self.metrics:
1396 self.metrics[path] = Metric(
1397 stattype,
1398 path,
1399 counter_info['desc'],
1400 label_names,
1401 )
1402 self.metrics[path].set(counters[i][0], labels)
1403 elif counter_info['type'] == self.PERFCOUNTER_LONGRUNAVG:
1404 path = 'rbd_' + key + '_sum'
1405 if path not in self.metrics:
1406 self.metrics[path] = Metric(
1407 stattype,
1408 path,
1409 counter_info['desc'] + ' Total',
1410 label_names,
1411 )
1412 self.metrics[path].set(counters[i][0], labels)
1413 path = 'rbd_' + key + '_count'
1414 if path not in self.metrics:
1415 self.metrics[path] = Metric(
1416 'counter',
1417 path,
1418 counter_info['desc'] + ' Count',
1419 label_names,
1420 )
1421 self.metrics[path].set(counters[i][1], labels)
1422 i += 1
1423
f67539c2 1424 def refresh_rbd_stats_pools(self, pools: Dict[str, Set[str]]) -> None:
11fdf7f2
TL
1425 self.log.debug('refreshing rbd pools %s' % (pools))
1426
1427 rbd = RBD()
1428 counters_info = self.rbd_stats['counters_info']
1429 for pool_name, cfg_ns_names in pools.items():
1430 try:
1431 pool_id = self.rados.pool_lookup(pool_name)
1432 with self.rados.open_ioctx(pool_name) as ioctx:
1433 if pool_id not in self.rbd_stats['pools']:
1434 self.rbd_stats['pools'][pool_id] = {'images': {}}
1435 pool = self.rbd_stats['pools'][pool_id]
1436 pool['name'] = pool_name
1437 pool['ns_names'] = cfg_ns_names
1438 if cfg_ns_names:
1439 nspace_names = list(cfg_ns_names)
1440 else:
1441 nspace_names = [''] + rbd.namespace_list(ioctx)
1442 for nspace_name in pool['images']:
1443 if nspace_name not in nspace_names:
1444 del pool['images'][nspace_name]
1445 for nspace_name in nspace_names:
f67539c2
TL
1446 if nspace_name and\
1447 not rbd.namespace_exists(ioctx, nspace_name):
11fdf7f2
TL
1448 self.log.debug('unknown namespace %s for pool %s' %
1449 (nspace_name, pool_name))
1450 continue
1451 ioctx.set_namespace(nspace_name)
1452 if nspace_name not in pool['images']:
1453 pool['images'][nspace_name] = {}
1454 namespace = pool['images'][nspace_name]
1455 images = {}
1456 for image_meta in RBD().list2(ioctx):
1457 image = {'n': image_meta['name']}
1458 image_id = image_meta['id']
1459 if image_id in namespace:
1460 image['c'] = namespace[image_id]['c']
1461 else:
1462 image['c'] = [[0, 0] for x in counters_info]
1463 images[image_id] = image
1464 pool['images'][nspace_name] = images
1465 except Exception as e:
1466 self.log.error('failed listing pool %s: %s' % (pool_name, e))
1467 self.rbd_stats['pools_refresh_time'] = time.time()
1468
f67539c2 1469 def shutdown_rbd_stats(self) -> None:
11fdf7f2
TL
1470 if 'query_id' in self.rbd_stats:
1471 self.remove_osd_perf_query(self.rbd_stats['query_id'])
1472 del self.rbd_stats['query_id']
1473 del self.rbd_stats['query']
1474 self.rbd_stats['pools'].clear()
1475
f67539c2 1476 def add_fixed_name_metrics(self) -> None:
e306af50
TL
1477 """
1478 Add fixed name metrics from existing ones that have details in their names
1479 that should be in labels (not in name).
1480 For backward compatibility, a new fixed name metric is created (instead of replacing)
1481 and details are put in new labels.
1482 Intended for RGW sync perf. counters but extendable as required.
1483 See: https://tracker.ceph.com/issues/45311
1484 """
1485 new_metrics = {}
f67539c2 1486 for metric_path, metrics in self.metrics.items():
e306af50 1487 # Address RGW sync perf. counters.
f67539c2 1488 match = re.search(r'^data-sync-from-(.*)\.', metric_path)
e306af50
TL
1489 if match:
1490 new_path = re.sub('from-([^.]*)', 'from-zone', metric_path)
1491 if new_path not in new_metrics:
1492 new_metrics[new_path] = Metric(
f67539c2 1493 metrics.mtype,
e306af50 1494 new_path,
f67539c2 1495 metrics.desc,
20effc67 1496 cast(LabelValues, metrics.labelnames) + ('source_zone',)
e306af50 1497 )
f67539c2 1498 for label_values, value in metrics.value.items():
e306af50
TL
1499 new_metrics[new_path].set(value, label_values + (match.group(1),))
1500
1501 self.metrics.update(new_metrics)
1502
f67539c2
TL
1503 def get_collect_time_metrics(self) -> None:
1504 sum_metric = self.metrics.get('prometheus_collect_duration_seconds_sum')
1505 count_metric = self.metrics.get('prometheus_collect_duration_seconds_count')
1506 if sum_metric is None:
1507 sum_metric = MetricCounter(
1508 'prometheus_collect_duration_seconds_sum',
1509 'The sum of seconds took to collect all metrics of this exporter',
1510 ('method',))
1511 self.metrics['prometheus_collect_duration_seconds_sum'] = sum_metric
1512 if count_metric is None:
1513 count_metric = MetricCounter(
1514 'prometheus_collect_duration_seconds_count',
1515 'The amount of metrics gathered for this exporter',
1516 ('method',))
20effc67 1517 self.metrics['prometheus_collect_duration_seconds_count'] = count_metric
f67539c2
TL
1518
1519 # Collect all timing data and make it available as metric, excluding the
1520 # `collect` method because it has not finished at this point and hence
1521 # there's no `_execution_duration` attribute to be found. The
1522 # `_execution_duration` attribute is added by the `profile_method`
1523 # decorator.
1524 for method_name, method in Module.__dict__.items():
1525 duration = getattr(method, '_execution_duration', None)
1526 if duration is not None:
1527 cast(MetricCounter, sum_metric).add(duration, (method_name,))
1528 cast(MetricCounter, count_metric).add(1, (method_name,))
1529
    @profile_method(True)
    def collect(self) -> str:
        """Gather every metric and render the Prometheus exposition text.

        Runs all the get_* collectors, converts daemon perf counters into
        metrics (long-running averages become _sum/_count pairs), and
        returns the concatenated exposition-format string.
        """
        # Clear the metrics before scraping
        for k in self.metrics.keys():
            self.metrics[k].clear()

        self.get_health()
        self.get_df()
        self.get_pool_stats()
        self.get_fs()
        self.get_osd_stats()
        self.get_quorum_status()
        self.get_mgr_status()
        self.get_metadata_and_osd_status()
        self.get_pg_status()
        self.get_num_objects()

        for daemon, counters in self.get_all_perf_counters().items():
            for path, counter_info in counters.items():
                # Skip histograms, they are represented by long running avgs
                stattype = self._stattype_to_str(counter_info['type'])
                if not stattype or stattype == 'histogram':
                    self.log.debug('ignoring %s, type %s' % (path, stattype))
                    continue

                # Translate the perf-counter path into a metric name + labels.
                path, label_names, labels = self._perfpath_to_path_labels(
                    daemon, path)

                # Get the value of the counter
                value = self._perfvalue_to_value(
                    counter_info['type'], counter_info['value'])

                # Represent the long running avgs as sum/count pairs
                if counter_info['type'] & self.PERFCOUNTER_LONGRUNAVG:
                    _path = path + '_sum'
                    if _path not in self.metrics:
                        self.metrics[_path] = Metric(
                            stattype,
                            _path,
                            counter_info['description'] + ' Total',
                            label_names,
                        )
                    self.metrics[_path].set(value, labels)

                    _path = path + '_count'
                    if _path not in self.metrics:
                        self.metrics[_path] = Metric(
                            'counter',
                            _path,
                            counter_info['description'] + ' Count',
                            label_names,
                        )
                    self.metrics[_path].set(counter_info['count'], labels,)
                else:
                    if path not in self.metrics:
                        self.metrics[path] = Metric(
                            stattype,
                            path,
                            counter_info['description'],
                            label_names,
                        )
                    self.metrics[path].set(value, labels)

        self.add_fixed_name_metrics()
        self.get_rbd_stats()

        # Must run last so it can see the _execution_duration of the other
        # profiled collectors from this scrape.
        self.get_collect_time_metrics()

        # Return formatted metrics and clear no longer used data
        _metrics = [m.str_expfmt() for m in self.metrics.values()]
        for k in self.metrics.keys():
            self.metrics[k].clear()

        return ''.join(_metrics) + '\n'
c07f9fc5 1604
f67539c2
TL
1605 @CLIReadCommand('prometheus file_sd_config')
1606 def get_file_sd_config(self) -> Tuple[int, str, str]:
1607 '''
1608 Return file_sd compatible prometheus config for mgr cluster
1609 '''
11fdf7f2
TL
1610 servers = self.list_servers()
1611 targets = []
1612 for server in servers:
1613 hostname = server.get('hostname', '')
f67539c2 1614 for service in cast(List[ServiceInfoT], server.get('services', [])):
11fdf7f2
TL
1615 if service['type'] != 'mgr':
1616 continue
1617 id_ = service['id']
adb31ebb
TL
1618 port = self._get_module_option('server_port', DEFAULT_PORT, id_)
1619 targets.append(f'{hostname}:{port}')
11fdf7f2
TL
1620 ret = [
1621 {
1622 "targets": targets,
1623 "labels": {}
1624 }
1625 ]
1626 return 0, json.dumps(ret), ""
1627
    def self_test(self) -> None:
        """Smoke test: run a full metrics collection and render the file_sd
        config, letting any internal error propagate to the caller."""
        self.collect()
        self.get_file_sd_config()
1631
f67539c2 1632 def serve(self) -> None:
c07f9fc5
FG
1633
1634 class Root(object):
1635
1636 # collapse everything to '/'
f67539c2 1637 def _cp_dispatch(self, vpath: str) -> 'Root':
c07f9fc5
FG
1638 cherrypy.request.path = ''
1639 return self
1640
c07f9fc5 1641 @cherrypy.expose
f67539c2 1642 def index(self) -> str:
3efd9988
FG
1643 return '''<!DOCTYPE html>
1644<html>
9f95a23c
TL
1645 <head><title>Ceph Exporter</title></head>
1646 <body>
1647 <h1>Ceph Exporter</h1>
1648 <p><a href='/metrics'>Metrics</a></p>
1649 </body>
3efd9988
FG
1650</html>'''
1651
1652 @cherrypy.expose
f67539c2 1653 def metrics(self) -> Optional[str]:
91327a77 1654 # Lock the function execution
f6b5b4d7
TL
1655 assert isinstance(_global_instance, Module)
1656 with _global_instance.collect_lock:
1657 return self._metrics(_global_instance)
91327a77 1658
11fdf7f2 1659 @staticmethod
f67539c2 1660 def _metrics(instance: 'Module') -> Optional[str]:
a4b75251
TL
1661 if not self.cache:
1662 self.log.debug('Cache disabled, collecting and returning without cache')
1663 cherrypy.response.headers['Content-Type'] = 'text/plain'
1664 return self.collect()
1665
f6b5b4d7
TL
1666 # Return cached data if available
1667 if not instance.collect_cache:
1668 raise cherrypy.HTTPError(503, 'No cached data available yet')
91327a77 1669
f67539c2 1670 def respond() -> Optional[str]:
f6b5b4d7 1671 assert isinstance(instance, Module)
91327a77
AA
1672 cherrypy.response.headers['Content-Type'] = 'text/plain'
1673 return instance.collect_cache
f6b5b4d7
TL
1674
1675 if instance.collect_time < instance.scrape_interval:
1676 # Respond if cache isn't stale
1677 return respond()
1678
1679 if instance.stale_cache_strategy == instance.STALE_CACHE_RETURN:
1680 # Respond even if cache is stale
1681 instance.log.info(
1682 'Gathering data took {:.2f} seconds, metrics are stale for {:.2f} seconds, '
1683 'returning metrics from stale cache.'.format(
1684 instance.collect_time,
1685 instance.collect_time - instance.scrape_interval
1686 )
1687 )
1688 return respond()
1689
1690 if instance.stale_cache_strategy == instance.STALE_CACHE_FAIL:
1691 # Fail if cache is stale
1692 msg = (
1693 'Gathering data took {:.2f} seconds, metrics are stale for {:.2f} seconds, '
1694 'returning "service unavailable".'.format(
1695 instance.collect_time,
1696 instance.collect_time - instance.scrape_interval,
1697 )
1698 )
1699 instance.log.error(msg)
1700 raise cherrypy.HTTPError(503, msg)
f67539c2 1701 return None
c07f9fc5 1702
91327a77 1703 # Make the cache timeout for collecting configurable
f67539c2 1704 self.scrape_interval = cast(float, self.get_localized_module_option('scrape_interval'))
f6b5b4d7 1705
f67539c2
TL
1706 self.stale_cache_strategy = cast(
1707 str, self.get_localized_module_option('stale_cache_strategy'))
f6b5b4d7
TL
1708 if self.stale_cache_strategy not in [self.STALE_CACHE_FAIL,
1709 self.STALE_CACHE_RETURN]:
1710 self.stale_cache_strategy = self.STALE_CACHE_FAIL
91327a77 1711
522d829b
TL
1712 server_addr = cast(str, self.get_localized_module_option(
1713 'server_addr', get_default_addr()))
1714 server_port = cast(int, self.get_localized_module_option(
1715 'server_port', DEFAULT_PORT))
c07f9fc5
FG
1716 self.log.info(
1717 "server_addr: %s server_port: %s" %
1718 (server_addr, server_port)
1719 )
c07f9fc5 1720
a4b75251
TL
1721 self.cache = cast(bool, self.get_localized_module_option('cache', True))
1722 if self.cache:
1723 self.log.info('Cache enabled')
1724 self.metrics_thread.start()
1725 else:
1726 self.log.info('Cache disabled')
adb31ebb 1727
a4b75251
TL
1728 cherrypy.config.update({
1729 'server.socket_host': server_addr,
1730 'server.socket_port': server_port,
1731 'engine.autoreload.on': False
1732 })
94b18763
FG
1733 # Publish the URI that others may use to access the service we're
1734 # about to start serving
b3b6e05e
TL
1735 if server_addr in ['::', '0.0.0.0']:
1736 server_addr = self.get_mgr_ip()
522d829b 1737 self.set_uri(build_url(scheme='http', host=server_addr, port=server_port, path='/'))
94b18763 1738
c07f9fc5 1739 cherrypy.tree.mount(Root(), "/")
94b18763 1740 self.log.info('Starting engine...')
c07f9fc5 1741 cherrypy.engine.start()
94b18763 1742 self.log.info('Engine started.')
91327a77
AA
1743 # wait for the shutdown event
1744 self.shutdown_event.wait()
1745 self.shutdown_event.clear()
adb31ebb
TL
1746 # tell metrics collection thread to stop collecting new metrics
1747 self.metrics_thread.stop()
91327a77
AA
1748 cherrypy.engine.stop()
1749 self.log.info('Engine stopped.')
11fdf7f2 1750 self.shutdown_rbd_stats()
adb31ebb
TL
1751 # wait for the metrics collection thread to stop
1752 self.metrics_thread.join()
94b18763 1753
    def shutdown(self) -> None:
        """Signal serve() to stop the CherryPy engine and return."""
        self.log.info('Stopping engine...')
        self.shutdown_event.set()
94b18763 1757
20effc67
TL
1758 @CLIReadCommand('healthcheck history ls')
1759 def _list_healthchecks(self, format: Format = Format.plain) -> HandleCommandResult:
1760 """List all the healthchecks being tracked
1761
1762 The format options are parsed in ceph_argparse, before they get evaluated here so
1763 we can safely assume that what we have to process is valid. ceph_argparse will throw
1764 a ValueError if the cast to our Format class fails.
1765
1766 Args:
1767 format (Format, optional): output format. Defaults to Format.plain.
1768
1769 Returns:
1770 HandleCommandResult: return code, stdout and stderr returned to the caller
1771 """
1772
1773 out = ""
1774 if format == Format.plain:
1775 out = str(self.health_history)
1776 elif format == Format.yaml:
1777 out = self.health_history.as_yaml()
1778 else:
1779 out = self.health_history.as_json(format == Format.json_pretty)
1780
1781 return HandleCommandResult(retval=0, stdout=out)
1782
    @CLIWriteCommand('healthcheck history clear')
    def _clear_healthchecks(self) -> HandleCommandResult:
        """Clear the healthcheck history"""
        # reset() drops all tracked healthcheck events; the command always
        # succeeds with retval 0.
        self.health_history.reset()
        return HandleCommandResult(retval=0, stdout="healthcheck history cleared")
1788
94b18763
FG
1789
class StandbyModule(MgrStandbyModule):
    """HTTP endpoint served by non-active mgrs.

    Depending on the 'standby_behaviour' option, the index page either links
    to the active mgr's exporter URI or returns a configurable HTTP error;
    /metrics always returns an empty body so scrapers get no duplicate data.
    """

    # Share the option declarations with the active-module class.
    MODULE_OPTIONS = Module.MODULE_OPTIONS

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super(StandbyModule, self).__init__(*args, **kwargs)
        # Set by shutdown() to unblock serve().
        self.shutdown_event = threading.Event()

    def serve(self) -> None:
        """Run a minimal CherryPy server until shutdown() is called."""
        server_addr = self.get_localized_module_option(
            'server_addr', get_default_addr())
        server_port = self.get_localized_module_option(
            'server_port', DEFAULT_PORT)
        self.log.info("server_addr: %s server_port: %s" %
                      (server_addr, server_port))
        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': server_port,
            'engine.autoreload.on': False,
            'request.show_tracebacks': False
        })

        # Captured by the nested handler class below.
        module = self

        class Root(object):
            @cherrypy.expose
            def index(self) -> str:
                standby_behaviour = module.get_module_option('standby_behaviour')
                if standby_behaviour == 'default':
                    # Point the visitor at the active mgr's exporter.
                    active_uri = module.get_active_uri()
                    return '''<!DOCTYPE html>
<html>
    <head><title>Ceph Exporter</title></head>
    <body>
        <h1>Ceph Exporter</h1>
        <p><a href='{}metrics'>Metrics</a></p>
    </body>
</html>'''.format(active_uri)
                else:
                    # 'error' behaviour: reply with the configured status code.
                    status = module.get_module_option('standby_error_status_code')
                    raise cherrypy.HTTPError(status, message="Keep on looking")

            @cherrypy.expose
            def metrics(self) -> str:
                # Standby serves no samples; empty body keeps scrapers happy.
                cherrypy.response.headers['Content-Type'] = 'text/plain'
                return ''

        cherrypy.tree.mount(Root(), '/', {})
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info('Engine started.')
        # Wait for shutdown event
        self.shutdown_event.wait()
        self.shutdown_event.clear()
        cherrypy.engine.stop()
        self.log.info('Engine stopped.')

    def shutdown(self) -> None:
        """Signal serve() to stop the CherryPy engine."""
        self.log.info("Stopping engine...")
        self.shutdown_event.set()
        self.log.info("Stopped engine")