]>
Commit | Line | Data |
---|---|---|
c07f9fc5 | 1 | import cherrypy |
adb31ebb | 2 | from collections import defaultdict |
a8e16298 | 3 | from distutils.version import StrictVersion |
3efd9988 | 4 | import json |
c07f9fc5 FG |
5 | import math |
6 | import os | |
11fdf7f2 | 7 | import re |
91327a77 AA |
8 | import threading |
9 | import time | |
20effc67 TL |
10 | import enum |
11 | from mgr_module import CLIReadCommand, MgrModule, MgrStandbyModule, PG_STATES, Option, ServiceInfoT, HandleCommandResult, CLIWriteCommand | |
522d829b | 12 | from mgr_util import get_default_addr, profile_method, build_url |
11fdf7f2 | 13 | from rbd import RBD |
adb31ebb | 14 | from collections import namedtuple |
20effc67 | 15 | import yaml |
522d829b | 16 | |
20effc67 TL |
17 | from typing import DefaultDict, Optional, Dict, Any, Set, cast, Tuple, Union, List, Callable |
18 | ||
19 | LabelValues = Tuple[str, ...] | |
20 | Number = Union[int, float] | |
21 | MetricValue = Dict[LabelValues, Number] | |
c07f9fc5 FG |
# Default listening port for the embedded Prometheus exporter; may be
# overridden via the config-key store.  9283 is the port assigned to Ceph in
# the Prometheus exporter port registry:
# https://github.com/prometheus/prometheus/wiki/Default-port-allocations

DEFAULT_PORT = 9283

# CherryPy 3.2.2 (and some earlier releases) verifies at startup that the
# ports it listens on are actually bound.  With the wildcard address "::" it
# probes both IPv4 and IPv6, and on hosts where IPv6 is not configured
# (e.g. some kubernetes environments) the probe raises an uncaught exception.
if cherrypy is not None:
    _cherrypy_version = StrictVersion(cherrypy.__version__)
    # Broken from at least 3.0.0 up to and including 3.2.2 (the version
    # shipped on centos:7); fixed in 3.2.3.
    if StrictVersion("3.1.2") <= _cherrypy_version < StrictVersion("3.2.3"):
        # https://github.com/cherrypy/cherrypy/issues/1100
        from cherrypy.process import servers
        servers.wait_for_occupied_port = lambda host, port: None
c07f9fc5 | 42 | |
# cherrypy likes to sys.exit / os._exit on error; neuter process exit so the
# mgr daemon is not taken down with it.
def os_exit_noop(status: int) -> None:
    """Ignore the requested exit status instead of terminating the process."""
    return None


os._exit = os_exit_noop  # type: ignore
# Handle to the active Module instance, so that code in subclass Root can
# reach it.  Writers rebind it via an explicit 'global' declaration.
_global_instance = None  # type: Optional[Module]

cherrypy.config.update({
    'response.headers.server': 'Ceph-Prometheus'
})
c07f9fc5 FG |
58 | |
59 | ||
def health_status_to_number(status: str) -> int:
    """Map a Ceph health status string to its numeric metric value.

    Raises:
        ValueError: if *status* is not one of the known health states.
    """
    mapping = {'HEALTH_OK': 0, 'HEALTH_WARN': 1, 'HEALTH_ERR': 2}
    if status not in mapping:
        raise ValueError(f'unknown status "{status}"')
    return mapping[status]
c07f9fc5 | 68 | |
# Cluster-wide df fields exported as ceph_cluster_* gauges.
DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_used_raw_bytes']

# Per-pool df fields exported as ceph_pool_* metrics.
DF_POOL = ['max_avail', 'avail_raw', 'stored', 'stored_raw', 'objects', 'dirty',
           'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes',
           'compress_bytes_used', 'compress_under_bytes', 'bytes_used', 'percent_used']

# Per-pool recovery statistics.  BUGFIX: the last entry was a duplicate
# 'num_bytes_recovered'; the third recovery counter is 'num_keys_recovered'
# (mirroring 'recovering_keys_per_sec' above).
OSD_POOL_STATS = ('recovering_objects_per_sec', 'recovering_bytes_per_sec',
                  'recovering_keys_per_sec', 'num_objects_recovered',
                  'num_bytes_recovered', 'num_keys_recovered')

# Cluster-wide OSD flags, each exported as a 0/1 ceph_osd_flag_* metric.
OSD_FLAGS = ('noup', 'nodown', 'noout', 'noin', 'nobackfill', 'norebalance',
             'norecover', 'noscrub', 'nodeep-scrub')

# Label sets for the *_metadata / status metrics below.
FS_METADATA = ('data_pools', 'fs_id', 'metadata_pool', 'name')

MDS_METADATA = ('ceph_daemon', 'fs_id', 'hostname', 'public_addr', 'rank',
                'ceph_version')

MON_METADATA = ('ceph_daemon', 'hostname',
                'public_addr', 'rank', 'ceph_version')

MGR_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

MGR_STATUS = ('ceph_daemon',)

MGR_MODULE_STATUS = ('name',)

MGR_MODULE_CAN_RUN = ('name',)

OSD_METADATA = ('back_iface', 'ceph_daemon', 'cluster_addr', 'device_class',
                'front_iface', 'hostname', 'objectstore', 'public_addr',
                'ceph_version')

OSD_STATUS = ['weight', 'up', 'in']

OSD_STATS = ['apply_latency_ms', 'commit_latency_ms']

POOL_METADATA = ('pool_id', 'name', 'type', 'description', 'compression_mode')

RGW_METADATA = ('ceph_daemon', 'hostname', 'ceph_version', 'instance_id')

RBD_MIRROR_METADATA = ('ceph_daemon', 'id', 'instance_id', 'hostname',
                       'ceph_version')

# Label names deliberately kept compatible with Prometheus node_exporter so
# that ceph_disk_occupation can be joined against node metrics.
DISK_OCCUPATION = ('ceph_daemon', 'device', 'db_device',
                   'wal_device', 'instance', 'devices', 'device_ids')

NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']

# Healthchecks that get their own dedicated ceph_healthcheck_* metric with a
# value parsed out of the health detail message.
alert_metric = namedtuple('alert_metric', 'name description')
HEALTH_CHECKS = [
    alert_metric('SLOW_OPS', 'OSD or Monitor requests taking a long time to process'),
]

# Labels of the generic ceph_health_detail metric.
HEALTHCHECK_DETAIL = ('name', 'severity')
125 | ||
126 | ||
class Severity(enum.Enum):
    """Ceph health severities, keyed by their wire-format strings."""

    ok = "HEALTH_OK"
    warn = "HEALTH_WARN"
    error = "HEALTH_ERR"
131 | ||
132 | ||
class Format(enum.Enum):
    """Output formats accepted by the healthcheck history CLI commands."""

    plain = 'plain'
    json = 'json'
    json_pretty = 'json-pretty'
    yaml = 'yaml'
138 | ||
139 | ||
class HealthCheckEvent:
    """Record of one healthcheck: when it was first/last seen and how often."""

    def __init__(self, name: str, severity: "Severity", first_seen: float,
                 last_seen: float, count: int, active: bool = True):
        self.name = name
        self.severity = severity
        self.first_seen = first_seen
        self.last_seen = last_seen
        self.count = count
        self.active = active

    def as_dict(self) -> Dict[str, Any]:
        """Return the instance as a dictionary."""
        return vars(self)
153 | ||
154 | ||
class HealthHistory:
    """Track healthcheck events over time, persisted in the mon KV store.

    The history survives mgr restarts/failovers by round-tripping through
    the cluster KV store under mgr/prometheus/health_history.
    """

    # KV store key holding the serialized history
    kv_name = 'health_history'
    # row template shared by the header and data rows of __str__
    titles = "{healthcheck_name:<24} {first_seen:<20} {last_seen:<20} {count:>5} {active:^6}"
    date_format = "%Y/%m/%d %H:%M:%S"

    def __init__(self, mgr: "MgrModule"):
        self.mgr = mgr
        self.lock = threading.Lock()
        self.healthcheck: Dict[str, HealthCheckEvent] = {}
        self._load()

    def _load(self) -> None:
        """Load the current state from the mons KV store."""
        data = self.mgr.get_store(self.kv_name)
        if data:
            try:
                healthcheck_data = json.loads(data)
            except json.JSONDecodeError:
                # corrupt store content: start over rather than crash
                # (log.warn is a deprecated alias of log.warning)
                self.mgr.log.warning(
                    f"INVALID data read from mgr/prometheus/{self.kv_name}. Resetting")
                self.reset()
                return
            else:
                for k, v in healthcheck_data.items():
                    self.healthcheck[k] = HealthCheckEvent(
                        name=k,
                        severity=v.get('severity'),
                        first_seen=v.get('first_seen', 0),
                        last_seen=v.get('last_seen', 0),
                        count=v.get('count', 1),
                        active=v.get('active', True))
        else:
            self.reset()

    def reset(self) -> None:
        """Reset the healthcheck history."""
        with self.lock:
            self.mgr.set_store(self.kv_name, "{}")
            self.healthcheck = {}

    def save(self) -> None:
        """Save the current in-memory healthcheck history to the KV store."""
        with self.lock:
            self.mgr.set_store(self.kv_name, self.as_json())

    def check(self, health_checks: Dict[str, Any]) -> None:
        """Look at the current health checks and compare existing the history.

        Args:
            health_checks (Dict[str, Any]): current health check data
        """

        current_checks = health_checks.get('checks', {})
        changes_made = False

        # first turn off any active states we're tracking
        for seen_check in self.healthcheck:
            check = self.healthcheck[seen_check]
            if check.active and seen_check not in current_checks:
                check.active = False
                changes_made = True

        # now look for any additions to track
        now = time.time()
        for name, info in current_checks.items():
            if name not in self.healthcheck:
                # this healthcheck is new, so start tracking it
                changes_made = True
                self.healthcheck[name] = HealthCheckEvent(
                    name=name,
                    severity=info.get('severity'),
                    first_seen=now,
                    last_seen=now,
                    count=1,
                    active=True
                )
            else:
                # seen it before, so update its metadata
                check = self.healthcheck[name]
                if check.active:
                    # check has been registered as active already, so skip
                    continue
                else:
                    check.last_seen = now
                    check.count += 1
                    check.active = True
                    changes_made = True

        if changes_made:
            self.save()

    def __str__(self) -> str:
        """Print the healthcheck history.

        Returns:
            str: Human readable representation of the healthcheck history
        """
        out = []

        if len(self.healthcheck.keys()) == 0:
            out.append("No healthchecks have been recorded")
        else:
            out.append(self.titles.format(
                healthcheck_name="Healthcheck Name",
                first_seen="First Seen (UTC)",
                last_seen="Last seen (UTC)",
                count="Count",
                active="Active")
            )
            for k in sorted(self.healthcheck.keys()):
                check = self.healthcheck[k]
                out.append(self.titles.format(
                    healthcheck_name=check.name,
                    # use gmtime so the timestamps really are UTC, as the
                    # column headings promise (was time.localtime)
                    first_seen=time.strftime(self.date_format, time.gmtime(check.first_seen)),
                    last_seen=time.strftime(self.date_format, time.gmtime(check.last_seen)),
                    count=check.count,
                    active="Yes" if check.active else "No")
                )
            out.extend([f"{len(self.healthcheck)} health check(s) listed", ""])

        return "\n".join(out)

    def as_dict(self) -> Dict[str, Any]:
        """Return the history in a dictionary.

        Returns:
            Dict[str, Any]: dictionary indexed by the healthcheck name
        """
        return {name: self.healthcheck[name].as_dict() for name in self.healthcheck}

    def as_json(self, pretty: bool = False) -> str:
        """Return the healthcheck history object as a dict (JSON).

        Args:
            pretty (bool, optional): whether to json pretty print the history. Defaults to False.

        Returns:
            str: str representation of the healthcheck in JSON format
        """
        if pretty:
            return json.dumps(self.as_dict(), indent=2)
        else:
            return json.dumps(self.as_dict())

    def as_yaml(self) -> str:
        """Return the healthcheck history in yaml format.

        Returns:
            str: YAML representation of the healthcheck history
        """
        return yaml.safe_dump(self.as_dict(), explicit_start=True, default_flow_style=False)
306 | ||
c07f9fc5 | 307 | |
class Metric(object):
    """One exporter metric: type, name, help text and labelled samples."""

    def __init__(self, mtype: str, name: str, desc: str,
                 labels: Optional["LabelValues"] = None) -> None:
        self.mtype = mtype
        self.name = name
        self.desc = desc
        self.labelnames = labels  # tuple if present
        self.value = {}  # type: Dict[LabelValues, Number]

    def clear(self) -> None:
        """Drop all recorded samples."""
        self.value = {}

    def set(self, value: "Number",
            labelvalues: Optional["LabelValues"] = None) -> None:
        """Record *value* for the given tuple of label values."""
        # labelvalues must be a tuple; unlabelled samples share the ('',) key
        self.value[labelvalues or ('',)] = value

    def str_expfmt(self) -> str:
        """Render this metric in the Prometheus exposition text format."""

        def promethize(path: str) -> str:
            """Replace characters that are illegal in Prometheus metric names."""
            cleaned = re.sub(r'[./\s]|::', '_', path).replace('+', '_plus')
            # Hyphens become underscores, except a trailing hyphen which is
            # spelled out as "_minus".
            if cleaned.endswith("-"):
                cleaned = cleaned[:-1] + "_minus"
            else:
                cleaned = cleaned.replace("-", "_")
            return f"ceph_{cleaned}"

        def floatstr(value: float) -> str:
            """Represent a float the Go-compatible way (Inf/NaN spellings)."""
            if value == float('inf'):
                return '+Inf'
            if value == float('-inf'):
                return '-Inf'
            if math.isnan(value):
                return 'NaN'
            return repr(float(value))

        name = promethize(self.name)
        expfmt = '''
# HELP {name} {desc}
# TYPE {name} {mtype}'''.format(
            name=name,
            desc=self.desc,
            mtype=self.mtype,
        )

        for labelvalues, value in self.value.items():
            if self.labelnames:
                labels = ','.join('%s="%s"' % pair
                                  for pair in zip(self.labelnames, labelvalues))
            else:
                labels = ''
            sample = floatstr(value)
            if labels:
                expfmt += f'\n{name}{{{labels}}} {sample}'
            else:
                expfmt += f'\n{name} {sample}'
        return expfmt

    def group_by(
        self,
        keys: List[str],
        joins: Dict[str, Callable[[List[str]], str]],
        name: Optional[str] = None,
    ) -> "Metric":
        """
        Groups data by label names.

        Label names not passed are being removed from the resulting metric but
        by providing a join function, labels of metrics can be grouped.

        The purpose of this method is to provide a version of a metric that can
        be used in matching where otherwise multiple results would be returned.

        As grouping is possible in Prometheus, the only additional value of this
        method is the possibility to join labels when grouping. For that reason,
        passing joins is required. Please use PromQL expressions in all other
        cases.

        >>> m = Metric('type', 'name', '', labels=('label1', 'id'))
        >>> m.value = {
        ...     ('foo', 'x'): 1,
        ...     ('foo', 'y'): 1,
        ... }
        >>> m.group_by(['label1'], {'id': lambda ids: ','.join(ids)}).value
        {('foo', 'x,y'): 1}

        The functionality of group by could roughly be compared with Prometheus'

            group (ceph_disk_occupation) by (device, instance)

        with the exception that not all labels which aren't used as a condition
        to group a metric are discarded, but their values can are joined and the
        label is thereby preserved.

        This function takes the value of the first entry of a found group to be
        used for the resulting value of the grouping operation.

        >>> m = Metric('type', 'name', '', labels=('label1', 'id'))
        >>> m.value = {
        ...     ('foo', 'x'): 555,
        ...     ('foo', 'y'): 10,
        ... }
        >>> m.group_by(['label1'], {'id': lambda ids: ','.join(ids)}).value
        {('foo', 'x,y'): 555}
        """
        assert self.labelnames, "cannot match keys without label names"
        for key in keys:
            assert key in self.labelnames, "unknown key: {}".format(key)
        assert joins, "joins must not be empty"
        assert all(callable(c) for c in joins.values()), "joins must be callable"

        # bucket samples by the value tuple of the requested keys
        grouped = defaultdict(list)  # type: DefaultDict[LabelValues, List[Tuple[Dict[str, str], Number]]]
        for label_values, metric_value in self.value.items():
            labels = dict(zip(self.labelnames, label_values))
            if not all(k in labels for k in keys):
                continue
            grouped[tuple(labels[k] for k in keys)].append((labels, metric_value))

        # Labels that are neither grouped on nor joined are dropped, mirroring
        # how Prometheus' `group` aggregation behaves.
        labelnames = tuple(
            label for label in self.labelnames if label in keys or label in joins
        )
        superfluous_labelnames = [
            label for label in self.labelnames if label not in labelnames
        ]

        # Collapse each bucket into a single sample: take the first member's
        # value, and join the join-labels across all members.
        values = {}  # type: MetricValue
        for group in grouped.values():
            labels, metric_value = group[0]

            for label in superfluous_labelnames:
                del labels[label]

            if len(group) > 1:
                for key, fn in joins.items():
                    labels[key] = fn(list(labels[key] for labels, _ in group))

            values[tuple(labels.values())] = metric_value

        new_metric = Metric(self.mtype, name if name else self.name, self.desc, labelnames)
        new_metric.value = values

        return new_metric
466 | ||
91327a77 | 467 | |
class MetricCounter(Metric):
    """A monotonic Metric: values only accumulate and survive clear()."""

    def __init__(self,
                 name: str,
                 desc: str,
                 labels: Optional["LabelValues"] = None) -> None:
        super(MetricCounter, self).__init__('counter', name, desc, labels)
        # never-seen label combinations start at zero
        self.value = defaultdict(int)

    def clear(self) -> None:
        """Intentionally a no-op: counters must not reset between scrapes."""
        pass

    def set(self,
            value: "Number",
            labelvalues: Optional["LabelValues"] = None) -> None:
        """Counters are add-only; absolute assignment is forbidden."""
        raise NotImplementedError(
            'This method must not be used for instances of MetricCounter class')

    def add(self,
            value: "Number",
            labelvalues: Optional["LabelValues"] = None) -> None:
        """Increment the counter for the given tuple of label values."""
        # labelvalues must be a tuple; unlabelled samples share the ('',) key
        self.value[labelvalues or ('',)] += value
491 | ||
492 | ||
class MetricCollectionThread(threading.Thread):
    """Background thread that periodically refreshes the module's metric cache."""

    def __init__(self, module: 'Module') -> None:
        self.mod = module
        self.active = True
        # set by stop() to interrupt a sleeping wait()
        self.event = threading.Event()
        super(MetricCollectionThread, self).__init__(target=self.collect)

    def collect(self) -> None:
        """Collection loop: scrape, publish to the cache, sleep, repeat."""
        self.mod.log.info('starting metric collection thread')
        while self.active:
            self.mod.log.debug('collecting cache in thread')
            if not self.mod.have_mon_connection():
                self.mod.log.error('No MON connection')
                self.event.wait(self.mod.scrape_interval)
                continue

            start_time = time.time()
            try:
                data = self.mod.collect()
            except Exception:
                # Log any issues encountered during the data collection and continue
                self.mod.log.exception("failed to collect metrics:")
                self.event.wait(self.mod.scrape_interval)
                continue

            duration = time.time() - start_time
            self.mod.log.debug('collecting cache in thread done')

            sleep_time = self.mod.scrape_interval - duration
            if sleep_time < 0:
                self.mod.log.warning(
                    'Collecting data took more time than configured scrape interval. '
                    'This possibly results in stale data. Please check the '
                    '`stale_cache_strategy` configuration option. '
                    'Collecting data took {:.2f} seconds but scrape interval is configured '
                    'to be {:.0f} seconds.'.format(
                        duration,
                        self.mod.scrape_interval,
                    )
                )
                sleep_time = 0

            with self.mod.collect_lock:
                self.mod.collect_cache = data
                self.mod.collect_time = duration

            self.event.wait(sleep_time)

    def stop(self) -> None:
        """Ask the collection loop to exit, waking it if it is sleeping."""
        self.active = False
        self.event.set()
f6b5b4d7 | 544 | |
91327a77 | 545 | |
class Module(MgrModule):
    """Prometheus exporter module: serves Ceph metrics over HTTP for scraping."""

    MODULE_OPTIONS = [
        Option(
            name='server_addr',
            default=get_default_addr(),
            desc='the IPv4 or IPv6 address on which the module listens for HTTP requests',
        ),
        Option(
            name='server_port',
            type='int',
            default=DEFAULT_PORT,
            desc='the port on which the module listens for HTTP requests'
        ),
        Option(
            name='scrape_interval',
            type='float',
            default=15.0
        ),
        Option(
            name='stale_cache_strategy',
            default='log'
        ),
        Option(
            name='cache',
            type='bool',
            default=True,
        ),
        Option(
            name='rbd_stats_pools',
            default=''
        ),
        Option(
            name='rbd_stats_pools_refresh_interval',
            type='int',
            default=300
        ),
        Option(
            name='standby_behaviour',
            type='str',
            default='default',
            enum_allowed=['default', 'error'],
            runtime=True
        ),
        Option(
            name='standby_error_status_code',
            type='int',
            default=500,
            min=400,
            max=599,
            runtime=True
        )
    ]

    # accepted values of the stale_cache_strategy option
    STALE_CACHE_FAIL = 'fail'
    STALE_CACHE_RETURN = 'return'

601 | ||
f67539c2 | 602 | def __init__(self, *args: Any, **kwargs: Any) -> None: |
91327a77 AA |
603 | super(Module, self).__init__(*args, **kwargs) |
604 | self.metrics = self._setup_static_metrics() | |
605 | self.shutdown_event = threading.Event() | |
f6b5b4d7 TL |
606 | self.collect_lock = threading.Lock() |
607 | self.collect_time = 0.0 | |
f67539c2 | 608 | self.scrape_interval: float = 15.0 |
a4b75251 | 609 | self.cache = True |
f67539c2 TL |
610 | self.stale_cache_strategy: str = self.STALE_CACHE_FAIL |
611 | self.collect_cache: Optional[str] = None | |
11fdf7f2 TL |
612 | self.rbd_stats = { |
613 | 'pools': {}, | |
614 | 'pools_refresh_time': 0, | |
615 | 'counters_info': { | |
616 | 'write_ops': {'type': self.PERFCOUNTER_COUNTER, | |
617 | 'desc': 'RBD image writes count'}, | |
618 | 'read_ops': {'type': self.PERFCOUNTER_COUNTER, | |
619 | 'desc': 'RBD image reads count'}, | |
620 | 'write_bytes': {'type': self.PERFCOUNTER_COUNTER, | |
621 | 'desc': 'RBD image bytes written'}, | |
622 | 'read_bytes': {'type': self.PERFCOUNTER_COUNTER, | |
623 | 'desc': 'RBD image bytes read'}, | |
624 | 'write_latency': {'type': self.PERFCOUNTER_LONGRUNAVG, | |
625 | 'desc': 'RBD image writes latency (msec)'}, | |
626 | 'read_latency': {'type': self.PERFCOUNTER_LONGRUNAVG, | |
627 | 'desc': 'RBD image reads latency (msec)'}, | |
628 | }, | |
f6b5b4d7 TL |
629 | } # type: Dict[str, Any] |
630 | global _global_instance | |
631 | _global_instance = self | |
adb31ebb | 632 | self.metrics_thread = MetricCollectionThread(_global_instance) |
20effc67 | 633 | self.health_history = HealthHistory(self) |
3efd9988 | 634 | |
f67539c2 | 635 | def _setup_static_metrics(self) -> Dict[str, Metric]: |
3efd9988 FG |
636 | metrics = {} |
637 | metrics['health_status'] = Metric( | |
638 | 'untyped', | |
639 | 'health_status', | |
640 | 'Cluster health status' | |
641 | ) | |
94b18763 | 642 | metrics['mon_quorum_status'] = Metric( |
3efd9988 | 643 | 'gauge', |
94b18763 FG |
644 | 'mon_quorum_status', |
645 | 'Monitors in quorum', | |
646 | ('ceph_daemon',) | |
647 | ) | |
648 | metrics['fs_metadata'] = Metric( | |
649 | 'untyped', | |
650 | 'fs_metadata', | |
651 | 'FS Metadata', | |
652 | FS_METADATA | |
653 | ) | |
654 | metrics['mds_metadata'] = Metric( | |
655 | 'untyped', | |
656 | 'mds_metadata', | |
657 | 'MDS Metadata', | |
658 | MDS_METADATA | |
659 | ) | |
660 | metrics['mon_metadata'] = Metric( | |
661 | 'untyped', | |
662 | 'mon_metadata', | |
663 | 'MON Metadata', | |
664 | MON_METADATA | |
3efd9988 | 665 | ) |
494da23a TL |
666 | metrics['mgr_metadata'] = Metric( |
667 | 'gauge', | |
668 | 'mgr_metadata', | |
669 | 'MGR metadata', | |
670 | MGR_METADATA | |
671 | ) | |
672 | metrics['mgr_status'] = Metric( | |
673 | 'gauge', | |
674 | 'mgr_status', | |
675 | 'MGR status (0=standby, 1=active)', | |
676 | MGR_STATUS | |
677 | ) | |
678 | metrics['mgr_module_status'] = Metric( | |
679 | 'gauge', | |
680 | 'mgr_module_status', | |
681 | 'MGR module status (0=disabled, 1=enabled, 2=auto-enabled)', | |
682 | MGR_MODULE_STATUS | |
683 | ) | |
684 | metrics['mgr_module_can_run'] = Metric( | |
685 | 'gauge', | |
686 | 'mgr_module_can_run', | |
687 | 'MGR module runnable state i.e. can it run (0=no, 1=yes)', | |
688 | MGR_MODULE_CAN_RUN | |
689 | ) | |
3efd9988 FG |
690 | metrics['osd_metadata'] = Metric( |
691 | 'untyped', | |
692 | 'osd_metadata', | |
693 | 'OSD Metadata', | |
694 | OSD_METADATA | |
695 | ) | |
c07f9fc5 | 696 | |
3efd9988 FG |
697 | # The reason for having this separate to OSD_METADATA is |
698 | # so that we can stably use the same tag names that | |
699 | # the Prometheus node_exporter does | |
700 | metrics['disk_occupation'] = Metric( | |
b32b8144 | 701 | 'untyped', |
3efd9988 FG |
702 | 'disk_occupation', |
703 | 'Associate Ceph daemon with disk used', | |
704 | DISK_OCCUPATION | |
705 | ) | |
c07f9fc5 | 706 | |
20effc67 TL |
707 | metrics['disk_occupation_human'] = Metric( |
708 | 'untyped', | |
709 | 'disk_occupation_human', | |
710 | 'Associate Ceph daemon with disk used for displaying to humans,' | |
711 | ' not for joining tables (vector matching)', | |
712 | DISK_OCCUPATION, # label names are automatically decimated on grouping | |
713 | ) | |
714 | ||
3efd9988 FG |
715 | metrics['pool_metadata'] = Metric( |
716 | 'untyped', | |
717 | 'pool_metadata', | |
718 | 'POOL Metadata', | |
719 | POOL_METADATA | |
720 | ) | |
94b18763 FG |
721 | |
722 | metrics['rgw_metadata'] = Metric( | |
723 | 'untyped', | |
724 | 'rgw_metadata', | |
725 | 'RGW Metadata', | |
726 | RGW_METADATA | |
727 | ) | |
728 | ||
11fdf7f2 TL |
729 | metrics['rbd_mirror_metadata'] = Metric( |
730 | 'untyped', | |
731 | 'rbd_mirror_metadata', | |
732 | 'RBD Mirror Metadata', | |
733 | RBD_MIRROR_METADATA | |
734 | ) | |
735 | ||
94b18763 FG |
736 | metrics['pg_total'] = Metric( |
737 | 'gauge', | |
738 | 'pg_total', | |
92f5a8d4 TL |
739 | 'PG Total Count per Pool', |
740 | ('pool_id',) | |
94b18763 FG |
741 | ) |
742 | ||
20effc67 TL |
743 | metrics['health_detail'] = Metric( |
744 | 'gauge', | |
745 | 'health_detail', | |
746 | 'healthcheck status by type (0=inactive, 1=active)', | |
747 | HEALTHCHECK_DETAIL | |
748 | ) | |
749 | ||
94b18763 FG |
750 | for flag in OSD_FLAGS: |
751 | path = 'osd_flag_{}'.format(flag) | |
752 | metrics[path] = Metric( | |
753 | 'untyped', | |
754 | path, | |
755 | 'OSD Flag {}'.format(flag) | |
756 | ) | |
3efd9988 FG |
757 | for state in OSD_STATUS: |
758 | path = 'osd_{}'.format(state) | |
3efd9988 FG |
759 | metrics[path] = Metric( |
760 | 'untyped', | |
c07f9fc5 | 761 | path, |
3efd9988 FG |
762 | 'OSD status {}'.format(state), |
763 | ('ceph_daemon',) | |
c07f9fc5 | 764 | ) |
b32b8144 FG |
765 | for stat in OSD_STATS: |
766 | path = 'osd_{}'.format(stat) | |
b32b8144 FG |
767 | metrics[path] = Metric( |
768 | 'gauge', | |
769 | path, | |
770 | 'OSD stat {}'.format(stat), | |
771 | ('ceph_daemon',) | |
772 | ) | |
11fdf7f2 TL |
773 | for stat in OSD_POOL_STATS: |
774 | path = 'pool_{}'.format(stat) | |
775 | metrics[path] = Metric( | |
776 | 'gauge', | |
777 | path, | |
9f95a23c | 778 | "OSD pool stats: {}".format(stat), |
11fdf7f2 TL |
779 | ('pool_id',) |
780 | ) | |
3efd9988 FG |
781 | for state in PG_STATES: |
782 | path = 'pg_{}'.format(state) | |
3efd9988 FG |
783 | metrics[path] = Metric( |
784 | 'gauge', | |
785 | path, | |
92f5a8d4 TL |
786 | 'PG {} per pool'.format(state), |
787 | ('pool_id',) | |
3efd9988 FG |
788 | ) |
789 | for state in DF_CLUSTER: | |
790 | path = 'cluster_{}'.format(state) | |
3efd9988 FG |
791 | metrics[path] = Metric( |
792 | 'gauge', | |
793 | path, | |
794 | 'DF {}'.format(state), | |
795 | ) | |
796 | for state in DF_POOL: | |
797 | path = 'pool_{}'.format(state) | |
3efd9988 | 798 | metrics[path] = Metric( |
20effc67 | 799 | 'counter' if state in ('rd', 'rd_bytes', 'wr', 'wr_bytes') else 'gauge', |
3efd9988 FG |
800 | path, |
801 | 'DF pool {}'.format(state), | |
802 | ('pool_id',) | |
803 | ) | |
28e407b8 AA |
804 | for state in NUM_OBJECTS: |
805 | path = 'num_objects_{}'.format(state) | |
806 | metrics[path] = Metric( | |
807 | 'gauge', | |
808 | path, | |
809 | 'Number of {} objects'.format(state), | |
810 | ) | |
3efd9988 | 811 | |
adb31ebb TL |
812 | for check in HEALTH_CHECKS: |
813 | path = 'healthcheck_{}'.format(check.name.lower()) | |
814 | metrics[path] = Metric( | |
815 | 'gauge', | |
816 | path, | |
817 | check.description, | |
818 | ) | |
819 | ||
3efd9988 | 820 | return metrics |
c07f9fc5 | 821 | |
    @profile_method()
    def get_health(self) -> None:
        """Update overall-health and per-healthcheck metrics from 'health'.

        Sets ``health_status`` from the cluster status string, sets one
        ``healthcheck_*`` gauge per entry in HEALTH_CHECKS (parsing a value
        out of the check's summary message where supported), and refreshes
        the ``health_detail`` metric from the health history.
        """

        def _get_value(message: str, delim: str = ' ', word_pos: int = 0) -> Tuple[int, int]:
            """Extract value from message (default is 1st field)

            Returns (value, error): error is 1 when the selected field is
            not a plain digit string, in which case value is 0.
            """
            v_str = message.split(delim)[word_pos]
            if v_str.isdigit():
                return int(v_str), 0
            return 0, 1

        health = json.loads(self.get('health')['json'])
        # set overall health
        self.metrics['health_status'].set(
            health_status_to_number(health['status'])
        )

        # Examine the health to see if any health checks triggered need to
        # become a specific metric with a value from the health detail
        active_healthchecks = health.get('checks', {})
        active_names = active_healthchecks.keys()

        for check in HEALTH_CHECKS:
            path = 'healthcheck_{}'.format(check.name.lower())

            # The metric may already have been deleted below on a previous
            # scrape (unparseable message), so guard on its presence.
            if path in self.metrics:

                if check.name in active_names:
                    check_data = active_healthchecks[check.name]
                    message = check_data['summary'].get('message', '')
                    v, err = 0, 0

                    if check.name == "SLOW_OPS":
                        # 42 slow ops, oldest one blocked for 12 sec, daemons [osd.0, osd.3] have
                        # slow ops.
                        v, err = _get_value(message)

                    if err:
                        self.log.error(
                            "healthcheck %s message format is incompatible and has been dropped",
                            check.name)
                        # drop the metric, so it's no longer emitted
                        del self.metrics[path]
                        continue
                    else:
                        self.metrics[path].set(v)
                else:
                    # health check is not active, so give it a default of 0
                    self.metrics[path].set(0)

        # Record current checks into the rolling health history, then emit
        # one health_detail sample (1=active, 0=inactive) per known check.
        self.health_history.check(health)
        for name, info in self.health_history.healthcheck.items():
            v = 1 if info.active else 0
            self.metrics['health_detail'].set(
                v, (
                    name,
                    str(info.severity))
            )
879 | ||
f6b5b4d7 | 880 | @profile_method() |
f67539c2 | 881 | def get_pool_stats(self) -> None: |
11fdf7f2 TL |
882 | # retrieve pool stats to provide per pool recovery metrics |
883 | # (osd_pool_stats moved to mgr in Mimic) | |
884 | pstats = self.get('osd_pool_stats') | |
885 | for pool in pstats['pool_stats']: | |
886 | for stat in OSD_POOL_STATS: | |
887 | self.metrics['pool_{}'.format(stat)].set( | |
888 | pool['recovery_rate'].get(stat, 0), | |
889 | (pool['pool_id'],) | |
890 | ) | |
891 | ||
f6b5b4d7 | 892 | @profile_method() |
f67539c2 | 893 | def get_df(self) -> None: |
3efd9988 FG |
894 | # maybe get the to-be-exported metrics from a config? |
895 | df = self.get('df') | |
896 | for stat in DF_CLUSTER: | |
91327a77 | 897 | self.metrics['cluster_{}'.format(stat)].set(df['stats'][stat]) |
3efd9988 FG |
898 | |
899 | for pool in df['pools']: | |
900 | for stat in DF_POOL: | |
91327a77 AA |
901 | self.metrics['pool_{}'.format(stat)].set( |
902 | pool['stats'][stat], | |
903 | (pool['id'],) | |
904 | ) | |
94b18763 | 905 | |
    @profile_method()
    def get_fs(self) -> None:
        """Export CephFS metadata: one fs_metadata sample per filesystem and
        one mds_metadata sample per MDS daemon (standby or active)."""
        fs_map = self.get('fs_map')
        servers = self.get_service_list()
        self.log.debug('standbys: {}'.format(fs_map['standbys']))
        # export standby mds metadata, default standby fs_id is '-1'
        for standby in fs_map['standbys']:
            id_ = standby['name']
            # (host, version, name); empty strings if the daemon is unknown
            host, version, _ = servers.get((id_, 'mds'), ('', '', ''))
            addr, rank = standby['addr'], standby['rank']
            self.metrics['mds_metadata'].set(1, (
                'mds.{}'.format(id_), '-1',
                cast(str, host),
                cast(str, addr),
                cast(str, rank),
                cast(str, version)
            ))
        for fs in fs_map['filesystems']:
            # collect fs metadata
            data_pools = ",".join([str(pool)
                                   for pool in fs['mdsmap']['data_pools']])
            self.metrics['fs_metadata'].set(1, (
                data_pools,
                fs['id'],
                fs['mdsmap']['metadata_pool'],
                fs['mdsmap']['fs_name']
            ))
            self.log.debug('mdsmap: {}'.format(fs['mdsmap']))
            # one metadata record per active MDS in this filesystem
            for gid, daemon in fs['mdsmap']['info'].items():
                id_ = daemon['name']
                host, version, _ = servers.get((id_, 'mds'), ('', '', ''))
                self.metrics['mds_metadata'].set(1, (
                    'mds.{}'.format(id_), fs['id'],
                    host, daemon['addr'],
                    daemon['rank'], version
                ))
3efd9988 | 942 | |
f6b5b4d7 | 943 | @profile_method() |
f67539c2 | 944 | def get_quorum_status(self) -> None: |
3efd9988 | 945 | mon_status = json.loads(self.get('mon_status')['json']) |
94b18763 FG |
946 | servers = self.get_service_list() |
947 | for mon in mon_status['monmap']['mons']: | |
948 | rank = mon['rank'] | |
949 | id_ = mon['name'] | |
20effc67 | 950 | host_version = servers.get((id_, 'mon'), ('', '', '')) |
91327a77 AA |
951 | self.metrics['mon_metadata'].set(1, ( |
952 | 'mon.{}'.format(id_), host_version[0], | |
f91f0fd5 | 953 | mon['public_addr'].rsplit(':', 1)[0], rank, |
91327a77 AA |
954 | host_version[1] |
955 | )) | |
94b18763 | 956 | in_quorum = int(rank in mon_status['quorum']) |
91327a77 AA |
957 | self.metrics['mon_quorum_status'].set(in_quorum, ( |
958 | 'mon.{}'.format(id_), | |
959 | )) | |
3efd9988 | 960 | |
f6b5b4d7 | 961 | @profile_method() |
f67539c2 | 962 | def get_mgr_status(self) -> None: |
494da23a TL |
963 | mgr_map = self.get('mgr_map') |
964 | servers = self.get_service_list() | |
965 | ||
966 | active = mgr_map['active_name'] | |
967 | standbys = [s.get('name') for s in mgr_map['standbys']] | |
968 | ||
969 | all_mgrs = list(standbys) | |
970 | all_mgrs.append(active) | |
971 | ||
f67539c2 TL |
972 | all_modules = {module.get('name'): module.get('can_run') |
973 | for module in mgr_map['available_modules']} | |
494da23a TL |
974 | |
975 | for mgr in all_mgrs: | |
20effc67 | 976 | host, version, _ = servers.get((mgr, 'mgr'), ('', '', '')) |
494da23a TL |
977 | if mgr == active: |
978 | _state = 1 | |
494da23a TL |
979 | else: |
980 | _state = 0 | |
801d1391 | 981 | |
494da23a | 982 | self.metrics['mgr_metadata'].set(1, ( |
f67539c2 | 983 | f'mgr.{mgr}', host, version |
494da23a TL |
984 | )) |
985 | self.metrics['mgr_status'].set(_state, ( | |
f67539c2 | 986 | f'mgr.{mgr}',)) |
adb31ebb | 987 | always_on_modules = mgr_map['always_on_modules'].get(self.release_name, []) |
494da23a TL |
988 | active_modules = list(always_on_modules) |
989 | active_modules.extend(mgr_map['modules']) | |
990 | ||
991 | for mod_name in all_modules.keys(): | |
992 | ||
993 | if mod_name in always_on_modules: | |
994 | _state = 2 | |
995 | elif mod_name in active_modules: | |
996 | _state = 1 | |
997 | else: | |
998 | _state = 0 | |
999 | ||
1000 | _can_run = 1 if all_modules[mod_name] else 0 | |
1001 | self.metrics['mgr_module_status'].set(_state, (mod_name,)) | |
1002 | self.metrics['mgr_module_can_run'].set(_can_run, (mod_name,)) | |
1003 | ||
f6b5b4d7 | 1004 | @profile_method() |
f67539c2 | 1005 | def get_pg_status(self) -> None: |
94b18763 | 1006 | |
92f5a8d4 TL |
1007 | pg_summary = self.get('pg_summary') |
1008 | ||
1009 | for pool in pg_summary['by_pool']: | |
adb31ebb | 1010 | num_by_state = defaultdict(int) # type: DefaultDict[str, int] |
92f5a8d4 | 1011 | |
801d1391 | 1012 | for state_name, count in pg_summary['by_pool'][pool].items(): |
92f5a8d4 | 1013 | for state in state_name.split('+'): |
801d1391 TL |
1014 | num_by_state[state] += count |
1015 | num_by_state['total'] += count | |
1016 | ||
1017 | for state, num in num_by_state.items(): | |
1018 | try: | |
1019 | self.metrics["pg_{}".format(state)].set(num, (pool,)) | |
1020 | except KeyError: | |
e306af50 | 1021 | self.log.warning("skipping pg in unknown state {}".format(state)) |
b32b8144 | 1022 | |
f6b5b4d7 | 1023 | @profile_method() |
f67539c2 | 1024 | def get_osd_stats(self) -> None: |
b32b8144 FG |
1025 | osd_stats = self.get('osd_stats') |
1026 | for osd in osd_stats['osd_stats']: | |
1027 | id_ = osd['osd'] | |
1028 | for stat in OSD_STATS: | |
94b18763 | 1029 | val = osd['perf_stat'][stat] |
91327a77 AA |
1030 | self.metrics['osd_{}'.format(stat)].set(val, ( |
1031 | 'osd.{}'.format(id_), | |
1032 | )) | |
94b18763 | 1033 | |
20effc67 | 1034 | def get_service_list(self) -> Dict[Tuple[str, str], Tuple[str, str, str]]: |
94b18763 FG |
1035 | ret = {} |
1036 | for server in self.list_servers(): | |
f67539c2 TL |
1037 | version = cast(str, server.get('ceph_version', '')) |
1038 | host = cast(str, server.get('hostname', '')) | |
1039 | for service in cast(List[ServiceInfoT], server.get('services', [])): | |
20effc67 | 1040 | ret.update({(service['id'], service['type']): (host, version, service.get('name', ''))}) |
94b18763 | 1041 | return ret |
3efd9988 | 1042 | |
    @profile_method()
    def get_metadata_and_osd_status(self) -> None:
        """Export OSD flags, OSD metadata/status, disk occupation, pool
        metadata, and metadata for rgw / rbd-mirror daemons.

        OSDs with missing address metadata or absent from the CRUSH map are
        skipped entirely (with an info log) rather than emitted with blanks.
        """
        osd_map = self.get('osd_map')
        osd_flags = osd_map['flags'].split(',')
        for flag in OSD_FLAGS:
            self.metrics['osd_flag_{}'.format(flag)].set(
                int(flag in osd_flags)
            )

        osd_devices = self.get('osd_map_crush')['devices']
        servers = self.get_service_list()
        for osd in osd_map['osds']:
            # id can be used to link osd metrics and metadata
            id_ = osd['osd']
            # collect osd metadata; strip the port from public/cluster addrs
            p_addr = osd['public_addr'].rsplit(':', 1)[0]
            c_addr = osd['cluster_addr'].rsplit(':', 1)[0]
            if p_addr == "-" or c_addr == "-":
                self.log.info(
                    "Missing address metadata for osd {0}, skipping occupation"
                    " and metadata records for this osd".format(id_)
                )
                continue

            # look up the device class for this OSD in the CRUSH device list
            dev_class = None
            for osd_device in osd_devices:
                if osd_device['id'] == id_:
                    dev_class = osd_device.get('class', '')
                    break

            if dev_class is None:
                self.log.info("OSD {0} is missing from CRUSH map, "
                              "skipping output".format(id_))
                continue

            host_version = servers.get((str(id_), 'osd'), ('', '', ''))

            # collect disk occupation metadata
            osd_metadata = self.get_metadata("osd", str(id_))
            if osd_metadata is None:
                continue

            obj_store = osd_metadata.get('osd_objectstore', '')
            f_iface = osd_metadata.get('front_iface', '')
            b_iface = osd_metadata.get('back_iface', '')

            self.metrics['osd_metadata'].set(1, (
                b_iface,
                'osd.{}'.format(id_),
                c_addr,
                dev_class,
                f_iface,
                host_version[0],
                obj_store,
                p_addr,
                host_version[1]
            ))

            # collect osd status (one gauge per state in OSD_STATUS)
            for state in OSD_STATUS:
                status = osd[state]
                self.metrics['osd_{}'.format(state)].set(status, (
                    'osd.{}'.format(id_),
                ))

            # resolve backing device nodes; layout differs by object store
            osd_dev_node = None
            osd_wal_dev_node = ''
            osd_db_dev_node = ''
            if obj_store == "filestore":
                # collect filestore backend device
                osd_dev_node = osd_metadata.get(
                    'backend_filestore_dev_node', None)
                # collect filestore journal device
                osd_wal_dev_node = osd_metadata.get('osd_journal', '')
                osd_db_dev_node = ''
            elif obj_store == "bluestore":
                # collect bluestore backend device
                osd_dev_node = osd_metadata.get(
                    'bluestore_bdev_dev_node', None)
                # collect bluestore wal backend
                osd_wal_dev_node = osd_metadata.get('bluefs_wal_dev_node', '')
                # collect bluestore db backend
                osd_db_dev_node = osd_metadata.get('bluefs_db_dev_node', '')
            if osd_dev_node and osd_dev_node == "unknown":
                osd_dev_node = None

            # fetch the devices and ids (vendor, model, serial) from the
            # osd_metadata
            osd_devs = osd_metadata.get('devices', '') or 'N/A'
            osd_dev_ids = osd_metadata.get('device_ids', '') or 'N/A'

            osd_hostname = osd_metadata.get('hostname', None)
            if osd_dev_node and osd_hostname:
                self.log.debug("Got dev for osd {0}: {1}/{2}".format(
                    id_, osd_hostname, osd_dev_node))
                self.metrics['disk_occupation'].set(1, (
                    "osd.{0}".format(id_),
                    osd_dev_node,
                    osd_db_dev_node,
                    osd_wal_dev_node,
                    osd_hostname,
                    osd_devs,
                    osd_dev_ids,
                ))
            else:
                self.log.info("Missing dev node metadata for osd {0}, skipping "
                              "occupation record for this osd".format(id_))

        # derive a human-friendly occupation metric grouped by device+host,
        # joining the daemons that share a device into one label value
        if 'disk_occupation' in self.metrics:
            try:
                self.metrics['disk_occupation_human'] = \
                    self.metrics['disk_occupation'].group_by(
                        ['device', 'instance'],
                        {'ceph_daemon': lambda daemons: ', '.join(daemons)},
                        name='disk_occupation_human',
                )
            except Exception as e:
                self.log.error(e)

        ec_profiles = osd_map.get('erasure_code_profiles', {})

        def _get_pool_info(pool: Dict[str, Any]) -> Tuple[str, str]:
            """Return (pool_type, description) for a pool from the osd map.

            type 1 = replicated, type 3 = erasure coded; anything else is
            reported as 'unknown'.
            """
            pool_type = 'unknown'
            description = 'unknown'

            if pool['type'] == 1:
                pool_type = "replicated"
                description = f"replica:{pool['size']}"
            elif pool['type'] == 3:
                pool_type = "erasure"
                name = pool.get('erasure_code_profile', '')
                profile = ec_profiles.get(name, {})
                if profile:
                    description = f"ec:{profile['k']}+{profile['m']}"
                else:
                    description = "ec:unknown"

            return pool_type, description

        for pool in osd_map['pools']:

            compression_mode = 'none'
            pool_type, pool_description = _get_pool_info(pool)

            if 'options' in pool:
                compression_mode = pool['options'].get('compression_mode', 'none')

            self.metrics['pool_metadata'].set(
                1, (
                    pool['pool'],
                    pool['pool_name'],
                    pool_type,
                    pool_description,
                    compression_mode)
            )

        # Populate other servers metadata
        for key, value in servers.items():
            service_id, service_type = key
            if service_type == 'rgw':
                hostname, version, name = value
                self.metrics['rgw_metadata'].set(
                    1,
                    ('{}.{}'.format(service_type, name),
                     hostname, version, service_id)
                )
            elif service_type == 'rbd-mirror':
                mirror_metadata = self.get_metadata('rbd-mirror', service_id)
                if mirror_metadata is None:
                    continue
                mirror_metadata['ceph_daemon'] = '{}.{}'.format(service_type,
                                                                service_id)
                rbd_mirror_metadata = cast(LabelValues,
                                           (mirror_metadata.get(k, '')
                                            for k in RBD_MIRROR_METADATA))
                self.metrics['rbd_mirror_metadata'].set(
                    1, rbd_mirror_metadata
                )
3efd9988 | 1221 | |
f6b5b4d7 | 1222 | @profile_method() |
f67539c2 | 1223 | def get_num_objects(self) -> None: |
28e407b8 AA |
1224 | pg_sum = self.get('pg_summary')['pg_stats_sum']['stat_sum'] |
1225 | for obj in NUM_OBJECTS: | |
1226 | stat = 'num_objects_{}'.format(obj) | |
91327a77 | 1227 | self.metrics[stat].set(pg_sum[stat]) |
28e407b8 | 1228 | |
f6b5b4d7 | 1229 | @profile_method() |
f67539c2 | 1230 | def get_rbd_stats(self) -> None: |
11fdf7f2 TL |
1231 | # Per RBD image stats is collected by registering a dynamic osd perf |
1232 | # stats query that tells OSDs to group stats for requests associated | |
1233 | # with RBD objects by pool, namespace, and image id, which are | |
1234 | # extracted from the request object names or other attributes. | |
1235 | # The RBD object names have the following prefixes: | |
1236 | # - rbd_data.{image_id}. (data stored in the same pool as metadata) | |
1237 | # - rbd_data.{pool_id}.{image_id}. (data stored in a dedicated data pool) | |
1238 | # - journal_data.{pool_id}.{image_id}. (journal if journaling is enabled) | |
1239 | # The pool_id in the object name is the id of the pool with the image | |
1240 | # metdata, and should be used in the image spec. If there is no pool_id | |
1241 | # in the object name, the image pool is the pool where the object is | |
1242 | # located. | |
1243 | ||
1244 | # Parse rbd_stats_pools option, which is a comma or space separated | |
1245 | # list of pool[/namespace] entries. If no namespace is specifed the | |
f6b5b4d7 TL |
1246 | # stats are collected for every namespace in the pool. The wildcard |
1247 | # '*' can be used to indicate all pools or namespaces | |
f67539c2 | 1248 | pools_string = cast(str, self.get_localized_module_option('rbd_stats_pools')) |
f6b5b4d7 | 1249 | pool_keys = [] |
f67539c2 | 1250 | for x in re.split(r'[\s,]+', pools_string): |
f6b5b4d7 TL |
1251 | if not x: |
1252 | continue | |
1253 | ||
1254 | s = x.split('/', 2) | |
11fdf7f2 | 1255 | pool_name = s[0] |
f6b5b4d7 TL |
1256 | namespace_name = None |
1257 | if len(s) == 2: | |
1258 | namespace_name = s[1] | |
1259 | ||
1260 | if pool_name == "*": | |
1261 | # collect for all pools | |
1262 | osd_map = self.get('osd_map') | |
1263 | for pool in osd_map['pools']: | |
1264 | if 'rbd' not in pool.get('application_metadata', {}): | |
1265 | continue | |
1266 | pool_keys.append((pool['pool_name'], namespace_name)) | |
1267 | else: | |
1268 | pool_keys.append((pool_name, namespace_name)) | |
1269 | ||
1270 | pools = {} # type: Dict[str, Set[str]] | |
1271 | for pool_key in pool_keys: | |
1272 | pool_name = pool_key[0] | |
1273 | namespace_name = pool_key[1] | |
1274 | if not namespace_name or namespace_name == "*": | |
11fdf7f2 TL |
1275 | # empty set means collect for all namespaces |
1276 | pools[pool_name] = set() | |
1277 | continue | |
f6b5b4d7 | 1278 | |
11fdf7f2 TL |
1279 | if pool_name not in pools: |
1280 | pools[pool_name] = set() | |
1281 | elif not pools[pool_name]: | |
1282 | continue | |
f6b5b4d7 | 1283 | pools[pool_name].add(namespace_name) |
11fdf7f2 TL |
1284 | |
1285 | rbd_stats_pools = {} | |
f6b5b4d7 | 1286 | for pool_id in self.rbd_stats['pools'].keys(): |
11fdf7f2 TL |
1287 | name = self.rbd_stats['pools'][pool_id]['name'] |
1288 | if name not in pools: | |
1289 | del self.rbd_stats['pools'][pool_id] | |
1290 | else: | |
1291 | rbd_stats_pools[name] = \ | |
1292 | self.rbd_stats['pools'][pool_id]['ns_names'] | |
1293 | ||
1294 | pools_refreshed = False | |
1295 | if pools: | |
1296 | next_refresh = self.rbd_stats['pools_refresh_time'] + \ | |
1297 | self.get_localized_module_option( | |
1298 | 'rbd_stats_pools_refresh_interval', 300) | |
1299 | if rbd_stats_pools != pools or time.time() >= next_refresh: | |
1300 | self.refresh_rbd_stats_pools(pools) | |
1301 | pools_refreshed = True | |
1302 | ||
1303 | pool_ids = list(self.rbd_stats['pools']) | |
1304 | pool_ids.sort() | |
1305 | pool_id_regex = '^(' + '|'.join([str(x) for x in pool_ids]) + ')$' | |
1306 | ||
1307 | nspace_names = [] | |
1308 | for pool_id, pool in self.rbd_stats['pools'].items(): | |
1309 | if pool['ns_names']: | |
1310 | nspace_names.extend(pool['ns_names']) | |
1311 | else: | |
1312 | nspace_names = [] | |
1313 | break | |
1314 | if nspace_names: | |
1315 | namespace_regex = '^(' + \ | |
1316 | "|".join([re.escape(x) | |
1317 | for x in set(nspace_names)]) + ')$' | |
1318 | else: | |
1319 | namespace_regex = '^(.*)$' | |
1320 | ||
f67539c2 TL |
1321 | if ('query' in self.rbd_stats |
1322 | and (pool_id_regex != self.rbd_stats['query']['key_descriptor'][0]['regex'] | |
1323 | or namespace_regex != self.rbd_stats['query']['key_descriptor'][1]['regex'])): | |
11fdf7f2 TL |
1324 | self.remove_osd_perf_query(self.rbd_stats['query_id']) |
1325 | del self.rbd_stats['query_id'] | |
1326 | del self.rbd_stats['query'] | |
1327 | ||
1328 | if not self.rbd_stats['pools']: | |
1329 | return | |
1330 | ||
1331 | counters_info = self.rbd_stats['counters_info'] | |
1332 | ||
1333 | if 'query_id' not in self.rbd_stats: | |
1334 | query = { | |
1335 | 'key_descriptor': [ | |
1336 | {'type': 'pool_id', 'regex': pool_id_regex}, | |
1337 | {'type': 'namespace', 'regex': namespace_regex}, | |
1338 | {'type': 'object_name', | |
f67539c2 | 1339 | 'regex': r'^(?:rbd|journal)_data\.(?:([0-9]+)\.)?([^.]+)\.'}, |
11fdf7f2 TL |
1340 | ], |
1341 | 'performance_counter_descriptors': list(counters_info), | |
1342 | } | |
1343 | query_id = self.add_osd_perf_query(query) | |
1344 | if query_id is None: | |
1345 | self.log.error('failed to add query %s' % query) | |
1346 | return | |
1347 | self.rbd_stats['query'] = query | |
1348 | self.rbd_stats['query_id'] = query_id | |
1349 | ||
1350 | res = self.get_osd_perf_counters(self.rbd_stats['query_id']) | |
f67539c2 | 1351 | assert res |
11fdf7f2 TL |
1352 | for c in res['counters']: |
1353 | # if the pool id is not found in the object name use id of the | |
1354 | # pool where the object is located | |
1355 | if c['k'][2][0]: | |
1356 | pool_id = int(c['k'][2][0]) | |
1357 | else: | |
1358 | pool_id = int(c['k'][0][0]) | |
1359 | if pool_id not in self.rbd_stats['pools'] and not pools_refreshed: | |
1360 | self.refresh_rbd_stats_pools(pools) | |
1361 | pools_refreshed = True | |
1362 | if pool_id not in self.rbd_stats['pools']: | |
1363 | continue | |
1364 | pool = self.rbd_stats['pools'][pool_id] | |
1365 | nspace_name = c['k'][1][0] | |
1366 | if nspace_name not in pool['images']: | |
1367 | continue | |
1368 | image_id = c['k'][2][1] | |
1369 | if image_id not in pool['images'][nspace_name] and \ | |
1370 | not pools_refreshed: | |
1371 | self.refresh_rbd_stats_pools(pools) | |
1372 | pool = self.rbd_stats['pools'][pool_id] | |
1373 | pools_refreshed = True | |
1374 | if image_id not in pool['images'][nspace_name]: | |
1375 | continue | |
1376 | counters = pool['images'][nspace_name][image_id]['c'] | |
1377 | for i in range(len(c['c'])): | |
1378 | counters[i][0] += c['c'][i][0] | |
1379 | counters[i][1] += c['c'][i][1] | |
1380 | ||
1381 | label_names = ("pool", "namespace", "image") | |
1382 | for pool_id, pool in self.rbd_stats['pools'].items(): | |
1383 | pool_name = pool['name'] | |
1384 | for nspace_name, images in pool['images'].items(): | |
1385 | for image_id in images: | |
1386 | image_name = images[image_id]['n'] | |
1387 | counters = images[image_id]['c'] | |
1388 | i = 0 | |
1389 | for key in counters_info: | |
1390 | counter_info = counters_info[key] | |
1391 | stattype = self._stattype_to_str(counter_info['type']) | |
1392 | labels = (pool_name, nspace_name, image_name) | |
1393 | if counter_info['type'] == self.PERFCOUNTER_COUNTER: | |
1394 | path = 'rbd_' + key | |
1395 | if path not in self.metrics: | |
1396 | self.metrics[path] = Metric( | |
1397 | stattype, | |
1398 | path, | |
1399 | counter_info['desc'], | |
1400 | label_names, | |
1401 | ) | |
1402 | self.metrics[path].set(counters[i][0], labels) | |
1403 | elif counter_info['type'] == self.PERFCOUNTER_LONGRUNAVG: | |
1404 | path = 'rbd_' + key + '_sum' | |
1405 | if path not in self.metrics: | |
1406 | self.metrics[path] = Metric( | |
1407 | stattype, | |
1408 | path, | |
1409 | counter_info['desc'] + ' Total', | |
1410 | label_names, | |
1411 | ) | |
1412 | self.metrics[path].set(counters[i][0], labels) | |
1413 | path = 'rbd_' + key + '_count' | |
1414 | if path not in self.metrics: | |
1415 | self.metrics[path] = Metric( | |
1416 | 'counter', | |
1417 | path, | |
1418 | counter_info['desc'] + ' Count', | |
1419 | label_names, | |
1420 | ) | |
1421 | self.metrics[path].set(counters[i][1], labels) | |
1422 | i += 1 | |
1423 | ||
f67539c2 | 1424 | def refresh_rbd_stats_pools(self, pools: Dict[str, Set[str]]) -> None: |
11fdf7f2 TL |
1425 | self.log.debug('refreshing rbd pools %s' % (pools)) |
1426 | ||
1427 | rbd = RBD() | |
1428 | counters_info = self.rbd_stats['counters_info'] | |
1429 | for pool_name, cfg_ns_names in pools.items(): | |
1430 | try: | |
1431 | pool_id = self.rados.pool_lookup(pool_name) | |
1432 | with self.rados.open_ioctx(pool_name) as ioctx: | |
1433 | if pool_id not in self.rbd_stats['pools']: | |
1434 | self.rbd_stats['pools'][pool_id] = {'images': {}} | |
1435 | pool = self.rbd_stats['pools'][pool_id] | |
1436 | pool['name'] = pool_name | |
1437 | pool['ns_names'] = cfg_ns_names | |
1438 | if cfg_ns_names: | |
1439 | nspace_names = list(cfg_ns_names) | |
1440 | else: | |
1441 | nspace_names = [''] + rbd.namespace_list(ioctx) | |
1442 | for nspace_name in pool['images']: | |
1443 | if nspace_name not in nspace_names: | |
1444 | del pool['images'][nspace_name] | |
1445 | for nspace_name in nspace_names: | |
f67539c2 TL |
1446 | if nspace_name and\ |
1447 | not rbd.namespace_exists(ioctx, nspace_name): | |
11fdf7f2 TL |
1448 | self.log.debug('unknown namespace %s for pool %s' % |
1449 | (nspace_name, pool_name)) | |
1450 | continue | |
1451 | ioctx.set_namespace(nspace_name) | |
1452 | if nspace_name not in pool['images']: | |
1453 | pool['images'][nspace_name] = {} | |
1454 | namespace = pool['images'][nspace_name] | |
1455 | images = {} | |
1456 | for image_meta in RBD().list2(ioctx): | |
1457 | image = {'n': image_meta['name']} | |
1458 | image_id = image_meta['id'] | |
1459 | if image_id in namespace: | |
1460 | image['c'] = namespace[image_id]['c'] | |
1461 | else: | |
1462 | image['c'] = [[0, 0] for x in counters_info] | |
1463 | images[image_id] = image | |
1464 | pool['images'][nspace_name] = images | |
1465 | except Exception as e: | |
1466 | self.log.error('failed listing pool %s: %s' % (pool_name, e)) | |
1467 | self.rbd_stats['pools_refresh_time'] = time.time() | |
1468 | ||
f67539c2 | 1469 | def shutdown_rbd_stats(self) -> None: |
11fdf7f2 TL |
1470 | if 'query_id' in self.rbd_stats: |
1471 | self.remove_osd_perf_query(self.rbd_stats['query_id']) | |
1472 | del self.rbd_stats['query_id'] | |
1473 | del self.rbd_stats['query'] | |
1474 | self.rbd_stats['pools'].clear() | |
1475 | ||
    def add_fixed_name_metrics(self) -> None:
        """
        Add fixed name metrics from existing ones that have details in their names
        that should be in labels (not in name).
        For backward compatibility, a new fixed name metric is created (instead of replacing)
        and details are put in new labels.
        Intended for RGW sync perf. counters but extendable as required.
        See: https://tracker.ceph.com/issues/45311
        """
        new_metrics = {}
        for metric_path, metrics in self.metrics.items():
            # Address RGW sync perf. counters.
            match = re.search(r'^data-sync-from-(.*)\.', metric_path)
            if match:
                # e.g. 'data-sync-from-zoneA.fetch' -> 'data-sync-from-zone.fetch'
                # with the original zone name carried in a 'source_zone' label
                new_path = re.sub('from-([^.]*)', 'from-zone', metric_path)
                if new_path not in new_metrics:
                    new_metrics[new_path] = Metric(
                        metrics.mtype,
                        new_path,
                        metrics.desc,
                        cast(LabelValues, metrics.labelnames) + ('source_zone',)
                    )
                # copy every sample across, appending the zone label value
                for label_values, value in metrics.value.items():
                    new_metrics[new_path].set(value, label_values + (match.group(1),))

        self.metrics.update(new_metrics)
1502 | ||
f67539c2 TL |
1503 | def get_collect_time_metrics(self) -> None: |
1504 | sum_metric = self.metrics.get('prometheus_collect_duration_seconds_sum') | |
1505 | count_metric = self.metrics.get('prometheus_collect_duration_seconds_count') | |
1506 | if sum_metric is None: | |
1507 | sum_metric = MetricCounter( | |
1508 | 'prometheus_collect_duration_seconds_sum', | |
1509 | 'The sum of seconds took to collect all metrics of this exporter', | |
1510 | ('method',)) | |
1511 | self.metrics['prometheus_collect_duration_seconds_sum'] = sum_metric | |
1512 | if count_metric is None: | |
1513 | count_metric = MetricCounter( | |
1514 | 'prometheus_collect_duration_seconds_count', | |
1515 | 'The amount of metrics gathered for this exporter', | |
1516 | ('method',)) | |
20effc67 | 1517 | self.metrics['prometheus_collect_duration_seconds_count'] = count_metric |
f67539c2 TL |
1518 | |
1519 | # Collect all timing data and make it available as metric, excluding the | |
1520 | # `collect` method because it has not finished at this point and hence | |
1521 | # there's no `_execution_duration` attribute to be found. The | |
1522 | # `_execution_duration` attribute is added by the `profile_method` | |
1523 | # decorator. | |
1524 | for method_name, method in Module.__dict__.items(): | |
1525 | duration = getattr(method, '_execution_duration', None) | |
1526 | if duration is not None: | |
1527 | cast(MetricCounter, sum_metric).add(duration, (method_name,)) | |
1528 | cast(MetricCounter, count_metric).add(1, (method_name,)) | |
1529 | ||
    @profile_method(True)
    def collect(self) -> str:
        """Gather every metric and return them in Prometheus exposition format.

        Runs all the get_* collectors, converts the daemon perf counters to
        metrics (long-running averages become _sum/_count pairs), and returns
        the concatenated exposition text. Metric values are cleared both
        before and after formatting so samples never leak between scrapes.
        """
        # Clear the metrics before scraping
        for k in self.metrics.keys():
            self.metrics[k].clear()

        self.get_health()
        self.get_df()
        self.get_pool_stats()
        self.get_fs()
        self.get_osd_stats()
        self.get_quorum_status()
        self.get_mgr_status()
        self.get_metadata_and_osd_status()
        self.get_pg_status()
        self.get_num_objects()

        for daemon, counters in self.get_all_perf_counters().items():
            for path, counter_info in counters.items():
                # Skip histograms, they are represented by long running avgs
                stattype = self._stattype_to_str(counter_info['type'])
                if not stattype or stattype == 'histogram':
                    self.log.debug('ignoring %s, type %s' % (path, stattype))
                    continue

                path, label_names, labels = self._perfpath_to_path_labels(
                    daemon, path)

                # Get the value of the counter
                value = self._perfvalue_to_value(
                    counter_info['type'], counter_info['value'])

                # Represent the long running avgs as sum/count pairs
                if counter_info['type'] & self.PERFCOUNTER_LONGRUNAVG:
                    _path = path + '_sum'
                    if _path not in self.metrics:
                        self.metrics[_path] = Metric(
                            stattype,
                            _path,
                            counter_info['description'] + ' Total',
                            label_names,
                        )
                    self.metrics[_path].set(value, labels)

                    _path = path + '_count'
                    if _path not in self.metrics:
                        self.metrics[_path] = Metric(
                            'counter',
                            _path,
                            counter_info['description'] + ' Count',
                            label_names,
                        )
                    self.metrics[_path].set(counter_info['count'], labels,)
                else:
                    if path not in self.metrics:
                        self.metrics[path] = Metric(
                            stattype,
                            path,
                            counter_info['description'],
                            label_names,
                        )
                    self.metrics[path].set(value, labels)

        self.add_fixed_name_metrics()
        self.get_rbd_stats()

        self.get_collect_time_metrics()

        # Return formatted metrics and clear no longer used data
        _metrics = [m.str_expfmt() for m in self.metrics.values()]
        for k in self.metrics.keys():
            self.metrics[k].clear()

        return ''.join(_metrics) + '\n'
c07f9fc5 | 1604 | |
f67539c2 TL |
1605 | @CLIReadCommand('prometheus file_sd_config') |
1606 | def get_file_sd_config(self) -> Tuple[int, str, str]: | |
1607 | ''' | |
1608 | Return file_sd compatible prometheus config for mgr cluster | |
1609 | ''' | |
11fdf7f2 TL |
1610 | servers = self.list_servers() |
1611 | targets = [] | |
1612 | for server in servers: | |
1613 | hostname = server.get('hostname', '') | |
f67539c2 | 1614 | for service in cast(List[ServiceInfoT], server.get('services', [])): |
11fdf7f2 TL |
1615 | if service['type'] != 'mgr': |
1616 | continue | |
1617 | id_ = service['id'] | |
adb31ebb TL |
1618 | port = self._get_module_option('server_port', DEFAULT_PORT, id_) |
1619 | targets.append(f'{hostname}:{port}') | |
11fdf7f2 TL |
1620 | ret = [ |
1621 | { | |
1622 | "targets": targets, | |
1623 | "labels": {} | |
1624 | } | |
1625 | ] | |
1626 | return 0, json.dumps(ret), "" | |
1627 | ||
f67539c2 | 1628 | def self_test(self) -> None: |
11fdf7f2 TL |
1629 | self.collect() |
1630 | self.get_file_sd_config() | |
1631 | ||
f67539c2 | 1632 | def serve(self) -> None: |
c07f9fc5 FG |
1633 | |
1634 | class Root(object): | |
1635 | ||
1636 | # collapse everything to '/' | |
f67539c2 | 1637 | def _cp_dispatch(self, vpath: str) -> 'Root': |
c07f9fc5 FG |
1638 | cherrypy.request.path = '' |
1639 | return self | |
1640 | ||
c07f9fc5 | 1641 | @cherrypy.expose |
f67539c2 | 1642 | def index(self) -> str: |
3efd9988 FG |
1643 | return '''<!DOCTYPE html> |
1644 | <html> | |
9f95a23c TL |
1645 | <head><title>Ceph Exporter</title></head> |
1646 | <body> | |
1647 | <h1>Ceph Exporter</h1> | |
1648 | <p><a href='/metrics'>Metrics</a></p> | |
1649 | </body> | |
3efd9988 FG |
1650 | </html>''' |
1651 | ||
1652 | @cherrypy.expose | |
f67539c2 | 1653 | def metrics(self) -> Optional[str]: |
91327a77 | 1654 | # Lock the function execution |
f6b5b4d7 TL |
1655 | assert isinstance(_global_instance, Module) |
1656 | with _global_instance.collect_lock: | |
1657 | return self._metrics(_global_instance) | |
91327a77 | 1658 | |
11fdf7f2 | 1659 | @staticmethod |
f67539c2 | 1660 | def _metrics(instance: 'Module') -> Optional[str]: |
a4b75251 TL |
1661 | if not self.cache: |
1662 | self.log.debug('Cache disabled, collecting and returning without cache') | |
1663 | cherrypy.response.headers['Content-Type'] = 'text/plain' | |
1664 | return self.collect() | |
1665 | ||
f6b5b4d7 TL |
1666 | # Return cached data if available |
1667 | if not instance.collect_cache: | |
1668 | raise cherrypy.HTTPError(503, 'No cached data available yet') | |
91327a77 | 1669 | |
f67539c2 | 1670 | def respond() -> Optional[str]: |
f6b5b4d7 | 1671 | assert isinstance(instance, Module) |
91327a77 AA |
1672 | cherrypy.response.headers['Content-Type'] = 'text/plain' |
1673 | return instance.collect_cache | |
f6b5b4d7 TL |
1674 | |
1675 | if instance.collect_time < instance.scrape_interval: | |
1676 | # Respond if cache isn't stale | |
1677 | return respond() | |
1678 | ||
1679 | if instance.stale_cache_strategy == instance.STALE_CACHE_RETURN: | |
1680 | # Respond even if cache is stale | |
1681 | instance.log.info( | |
1682 | 'Gathering data took {:.2f} seconds, metrics are stale for {:.2f} seconds, ' | |
1683 | 'returning metrics from stale cache.'.format( | |
1684 | instance.collect_time, | |
1685 | instance.collect_time - instance.scrape_interval | |
1686 | ) | |
1687 | ) | |
1688 | return respond() | |
1689 | ||
1690 | if instance.stale_cache_strategy == instance.STALE_CACHE_FAIL: | |
1691 | # Fail if cache is stale | |
1692 | msg = ( | |
1693 | 'Gathering data took {:.2f} seconds, metrics are stale for {:.2f} seconds, ' | |
1694 | 'returning "service unavailable".'.format( | |
1695 | instance.collect_time, | |
1696 | instance.collect_time - instance.scrape_interval, | |
1697 | ) | |
1698 | ) | |
1699 | instance.log.error(msg) | |
1700 | raise cherrypy.HTTPError(503, msg) | |
f67539c2 | 1701 | return None |
c07f9fc5 | 1702 | |
91327a77 | 1703 | # Make the cache timeout for collecting configurable |
f67539c2 | 1704 | self.scrape_interval = cast(float, self.get_localized_module_option('scrape_interval')) |
f6b5b4d7 | 1705 | |
f67539c2 TL |
1706 | self.stale_cache_strategy = cast( |
1707 | str, self.get_localized_module_option('stale_cache_strategy')) | |
f6b5b4d7 TL |
1708 | if self.stale_cache_strategy not in [self.STALE_CACHE_FAIL, |
1709 | self.STALE_CACHE_RETURN]: | |
1710 | self.stale_cache_strategy = self.STALE_CACHE_FAIL | |
91327a77 | 1711 | |
522d829b TL |
1712 | server_addr = cast(str, self.get_localized_module_option( |
1713 | 'server_addr', get_default_addr())) | |
1714 | server_port = cast(int, self.get_localized_module_option( | |
1715 | 'server_port', DEFAULT_PORT)) | |
c07f9fc5 FG |
1716 | self.log.info( |
1717 | "server_addr: %s server_port: %s" % | |
1718 | (server_addr, server_port) | |
1719 | ) | |
c07f9fc5 | 1720 | |
a4b75251 TL |
1721 | self.cache = cast(bool, self.get_localized_module_option('cache', True)) |
1722 | if self.cache: | |
1723 | self.log.info('Cache enabled') | |
1724 | self.metrics_thread.start() | |
1725 | else: | |
1726 | self.log.info('Cache disabled') | |
adb31ebb | 1727 | |
a4b75251 TL |
1728 | cherrypy.config.update({ |
1729 | 'server.socket_host': server_addr, | |
1730 | 'server.socket_port': server_port, | |
1731 | 'engine.autoreload.on': False | |
1732 | }) | |
94b18763 FG |
1733 | # Publish the URI that others may use to access the service we're |
1734 | # about to start serving | |
b3b6e05e TL |
1735 | if server_addr in ['::', '0.0.0.0']: |
1736 | server_addr = self.get_mgr_ip() | |
522d829b | 1737 | self.set_uri(build_url(scheme='http', host=server_addr, port=server_port, path='/')) |
94b18763 | 1738 | |
c07f9fc5 | 1739 | cherrypy.tree.mount(Root(), "/") |
94b18763 | 1740 | self.log.info('Starting engine...') |
c07f9fc5 | 1741 | cherrypy.engine.start() |
94b18763 | 1742 | self.log.info('Engine started.') |
91327a77 AA |
1743 | # wait for the shutdown event |
1744 | self.shutdown_event.wait() | |
1745 | self.shutdown_event.clear() | |
adb31ebb TL |
1746 | # tell metrics collection thread to stop collecting new metrics |
1747 | self.metrics_thread.stop() | |
91327a77 AA |
1748 | cherrypy.engine.stop() |
1749 | self.log.info('Engine stopped.') | |
11fdf7f2 | 1750 | self.shutdown_rbd_stats() |
adb31ebb TL |
1751 | # wait for the metrics collection thread to stop |
1752 | self.metrics_thread.join() | |
94b18763 | 1753 | |
    def shutdown(self) -> None:
        """Request the exporter to stop.

        Sets shutdown_event, which unblocks serve()'s shutdown_event.wait();
        serve() then performs the actual engine/thread teardown.
        """
        self.log.info('Stopping engine...')
        self.shutdown_event.set()
94b18763 | 1757 | |
20effc67 TL |
1758 | @CLIReadCommand('healthcheck history ls') |
1759 | def _list_healthchecks(self, format: Format = Format.plain) -> HandleCommandResult: | |
1760 | """List all the healthchecks being tracked | |
1761 | ||
1762 | The format options are parsed in ceph_argparse, before they get evaluated here so | |
1763 | we can safely assume that what we have to process is valid. ceph_argparse will throw | |
1764 | a ValueError if the cast to our Format class fails. | |
1765 | ||
1766 | Args: | |
1767 | format (Format, optional): output format. Defaults to Format.plain. | |
1768 | ||
1769 | Returns: | |
1770 | HandleCommandResult: return code, stdout and stderr returned to the caller | |
1771 | """ | |
1772 | ||
1773 | out = "" | |
1774 | if format == Format.plain: | |
1775 | out = str(self.health_history) | |
1776 | elif format == Format.yaml: | |
1777 | out = self.health_history.as_yaml() | |
1778 | else: | |
1779 | out = self.health_history.as_json(format == Format.json_pretty) | |
1780 | ||
1781 | return HandleCommandResult(retval=0, stdout=out) | |
1782 | ||
    @CLIWriteCommand('healthcheck history clear')
    def _clear_healthchecks(self) -> HandleCommandResult:
        """Clear the healthcheck history

        Returns:
            HandleCommandResult: retval 0 with a confirmation message on stdout
        """
        self.health_history.reset()
        return HandleCommandResult(retval=0, stdout="healthcheck history cleared")
1788 | ||
94b18763 FG |
1789 | |
class StandbyModule(MgrStandbyModule):
    """Stub exporter served while this mgr daemon is in standby mode.

    Exposes the same endpoints as the active module, but '/metrics' is empty
    and '/' either links to the active mgr's exporter or reports an error
    status, depending on the 'standby_behaviour' option.
    """

    MODULE_OPTIONS = Module.MODULE_OPTIONS

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super(StandbyModule, self).__init__(*args, **kwargs)
        # Set by shutdown() to unblock serve().
        self.shutdown_event = threading.Event()

    def serve(self) -> None:
        """Run the standby HTTP server until shutdown is requested."""
        bind_addr = self.get_localized_module_option(
            'server_addr', get_default_addr())
        bind_port = self.get_localized_module_option(
            'server_port', DEFAULT_PORT)
        self.log.info("server_addr: %s server_port: %s" %
                      (bind_addr, bind_port))
        cherrypy.config.update({
            'server.socket_host': bind_addr,
            'server.socket_port': bind_port,
            'engine.autoreload.on': False,
            'request.show_tracebacks': False
        })

        # Capture the module for use inside the handler class below.
        mgr = self

        class Root(object):
            @cherrypy.expose
            def index(self) -> str:
                if mgr.get_module_option('standby_behaviour') == 'default':
                    # Point visitors at the active mgr's exporter.
                    active_uri = mgr.get_active_uri()
                    return '''<!DOCTYPE html>
<html>
    <head><title>Ceph Exporter</title></head>
    <body>
        <h1>Ceph Exporter</h1>
        <p><a href='{}metrics'>Metrics</a></p>
    </body>
</html>'''.format(active_uri)
                # Otherwise report the configured HTTP error status.
                status = mgr.get_module_option('standby_error_status_code')
                raise cherrypy.HTTPError(status, message="Keep on looking")

            @cherrypy.expose
            def metrics(self) -> str:
                # A standby mgr exports no metrics.
                cherrypy.response.headers['Content-Type'] = 'text/plain'
                return ''

        cherrypy.tree.mount(Root(), '/', {})
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info('Engine started.')
        # Wait for shutdown event
        self.shutdown_event.wait()
        self.shutdown_event.clear()
        cherrypy.engine.stop()
        self.log.info('Engine stopped.')

    def shutdown(self) -> None:
        """Unblock serve() so it can stop the CherryPy engine."""
        self.log.info("Stopping engine...")
        self.shutdown_event.set()
        self.log.info("Stopped engine")