]>
Commit | Line | Data |
---|---|---|
c07f9fc5 | 1 | import cherrypy |
a8e16298 | 2 | from distutils.version import StrictVersion |
3efd9988 FG |
3 | import json |
4 | import errno | |
c07f9fc5 FG |
5 | import math |
6 | import os | |
11fdf7f2 | 7 | import re |
94b18763 | 8 | import socket |
91327a77 AA |
9 | import threading |
10 | import time | |
11fdf7f2 | 11 | from mgr_module import MgrModule, MgrStandbyModule, CommandResult, PG_STATES |
f6b5b4d7 | 12 | from mgr_util import get_default_addr, profile_method |
11fdf7f2 | 13 | from rbd import RBD |
f6b5b4d7 TL |
14 | try: |
15 | from typing import Optional, Dict, Any, Set | |
16 | except: | |
17 | pass | |
c07f9fc5 FG |
18 | |
19 | # Defaults for the Prometheus HTTP server. Can also set in config-key | |
20 | # see https://github.com/prometheus/prometheus/wiki/Default-port-allocations | |
21 | # for Prometheus exporter port registry | |
22 | ||
c07f9fc5 FG |
23 | DEFAULT_PORT = 9283 |
24 | ||
a8e16298 TL |
# When the CherryPy server in 3.2.2 (and later) starts it attempts to verify
# that the ports its listening on are in fact bound. When using the any address
# "::" it tries both ipv4 and ipv6, and in some environments (e.g. kubernetes)
# ipv6 isn't yet configured / supported and CherryPy throws an uncaught
# exception.
if cherrypy is not None:
    v = StrictVersion(cherrypy.__version__)
    # the issue was fixed in 3.2.3. it's present in 3.2.2 (current version on
    # centos:7) and back to at least 3.0.0.
    if StrictVersion("3.1.2") <= v < StrictVersion("3.2.3"):
        # https://github.com/cherrypy/cherrypy/issues/1100
        from cherrypy.process import servers
        # Disable the post-bind liveness probe entirely: the bind already
        # succeeded, so probing the (possibly ipv6) port only hangs or raises.
        servers.wait_for_occupied_port = lambda host, port: None
c07f9fc5 | 38 | |
9f95a23c | 39 | |
# cherrypy likes to sys.exit on error. don't let it take us down too!
def os_exit_noop(*args, **kwargs):
    """No-op stand-in for os._exit, so CherryPy error paths cannot
    terminate the whole ceph-mgr daemon this module runs inside."""
    pass


# Monkey-patch the process-level exit; any later os._exit() call in this
# process (notably from CherryPy's bus) becomes a no-op.
os._exit = os_exit_noop
46 | ||
c07f9fc5 FG |
# to access things in class Module from subclass Root. Because
# it's a dict, the writer doesn't need to declare 'global' for access
# NOTE(review): the comment above is stale — this is now a plain module-level
# reference (assigned once in Module.__init__ under a `global` statement),
# not a dict.
_global_instance = None  # type: Optional[Module]
c07f9fc5 FG |
51 | |
52 | ||
def health_status_to_number(status):
    """Map a Ceph health-status string to a numeric severity.

    Returns 0 for HEALTH_OK, 1 for HEALTH_WARN, 2 for HEALTH_ERR, and
    None for any other value (same fallthrough as the original if/elif).
    """
    return {'HEALTH_OK': 0, 'HEALTH_WARN': 1, 'HEALTH_ERR': 2}.get(status)
c07f9fc5 | 60 | |
11fdf7f2 TL |
61 | |
# Cluster-wide capacity counters exported from the 'df' dump.
DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_used_raw_bytes']

# Per-pool usage / IO counters exported from the 'df' dump.
DF_POOL = ['max_avail', 'stored', 'stored_raw', 'objects', 'dirty',
           'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes',
           'compress_bytes_used', 'compress_under_bytes']

# Per-pool recovery counters read from 'osd_pool_stats'.
# NOTE(review): 'num_bytes_recovered' appears twice; the duplicate is
# harmless (same metric written twice) but likely a typo for another
# counter (e.g. 'num_keys_recovered') — confirm against upstream.
OSD_POOL_STATS = ('recovering_objects_per_sec', 'recovering_bytes_per_sec',
                  'recovering_keys_per_sec', 'num_objects_recovered',
                  'num_bytes_recovered', 'num_bytes_recovered')

# OSD-map flags, each exported as an individual osd_flag_<name> metric.
OSD_FLAGS = ('noup', 'nodown', 'noout', 'noin', 'nobackfill', 'norebalance',
             'norecover', 'noscrub', 'nodeep-scrub')

# Label tuples for the *_metadata / *_status metric families below.
FS_METADATA = ('data_pools', 'fs_id', 'metadata_pool', 'name')

MDS_METADATA = ('ceph_daemon', 'fs_id', 'hostname', 'public_addr', 'rank',
                'ceph_version')

MON_METADATA = ('ceph_daemon', 'hostname',
                'public_addr', 'rank', 'ceph_version')

MGR_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

MGR_STATUS = ('ceph_daemon',)

MGR_MODULE_STATUS = ('name',)

MGR_MODULE_CAN_RUN = ('name',)

OSD_METADATA = ('back_iface', 'ceph_daemon', 'cluster_addr', 'device_class',
                'front_iface', 'hostname', 'objectstore', 'public_addr',
                'ceph_version')

OSD_STATUS = ['weight', 'up', 'in']

OSD_STATS = ['apply_latency_ms', 'commit_latency_ms']

POOL_METADATA = ('pool_id', 'name')

RGW_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

RBD_MIRROR_METADATA = ('ceph_daemon', 'id', 'instance_id', 'hostname',
                       'ceph_version')

# Deliberately uses the same label names as Prometheus node_exporter so
# disk_occupation can be joined against node-level disk series.
DISK_OCCUPATION = ('ceph_daemon', 'device', 'db_device',
                   'wal_device', 'instance')

# Object-health counters taken from the PG summary stat sums.
NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']
c07f9fc5 | 110 | |
c07f9fc5 | 111 | |
91327a77 AA |
class Metric(object):
    """One exporter metric family: type, name, help text, and a sample
    per label-value tuple, rendered in Prometheus exposition format."""

    def __init__(self, mtype, name, desc, labels=None):
        self.mtype = mtype        # e.g. 'gauge', 'counter', 'untyped'
        self.name = name          # raw name; sanitized/prefixed on output
        self.desc = desc          # HELP text
        self.labelnames = labels  # tuple if present
        self.value = {}           # {labelvalue-tuple: sample}

    def clear(self):
        """Drop all recorded samples (start of a collection cycle)."""
        self.value = {}

    def set(self, value, labelvalues=None):
        """Record one sample; labelvalues must be a tuple when given."""
        self.value[labelvalues or ('',)] = value

    def str_expfmt(self):
        """Render this metric (HELP/TYPE header plus all samples)."""

        def promethize(path):
            ''' replace illegal metric name characters '''
            sanitized = re.sub(r'[./\s]|::', '_', path).replace('+', '_plus')
            # Hyphens usually turn into underscores, unless they are
            # trailing
            if sanitized.endswith("-"):
                sanitized = sanitized[0:-1] + "_minus"
            else:
                sanitized = sanitized.replace("-", "_")
            return "ceph_{0}".format(sanitized)

        def floatstr(value):
            ''' represent as Go-compatible float '''
            if value == float('inf'):
                return '+Inf'
            if value == float('-inf'):
                return '-Inf'
            if math.isnan(value):
                return 'NaN'
            return repr(float(value))

        name = promethize(self.name)
        # Build as a list of fragments and join once at the end.
        chunks = [
            '\n# HELP {0} {1}'.format(name, self.desc),
            '\n# TYPE {0} {1}'.format(name, self.mtype),
        ]

        for labelvalues, value in self.value.items():
            if self.labelnames:
                labels = ','.join(
                    '%s="%s"' % pair
                    for pair in zip(self.labelnames, labelvalues)
                )
            else:
                labels = ''
            if labels:
                chunks.append(
                    '\n{0}{{{1}}} {2}'.format(name, labels, floatstr(value)))
            else:
                chunks.append('\n{0} {1}'.format(name, floatstr(value)))
        return ''.join(chunks)
178 | ||
179 | ||
f6b5b4d7 TL |
class MetricCollectionThread(threading.Thread):
    """Background thread that scrapes the mgr on a fixed cadence and
    publishes results into Module.collect_cache under collect_lock."""

    def __init__(self, module):
        # type: (Module) -> None
        self.mod = module
        super(MetricCollectionThread, self).__init__(target=self.collect)

    def collect(self):
        """Collection loop: call Module.collect() roughly every
        Module.scrape_interval seconds while a MON connection exists.

        NOTE(review): the loop has no shutdown signal — the thread runs
        for the lifetime of the process.
        """
        self.mod.log.info('starting metric collection thread')
        while True:
            self.mod.log.debug('collecting cache in thread')
            if self.mod.have_mon_connection():
                start_time = time.time()
                data = self.mod.collect()
                duration = time.time() - start_time

                self.mod.log.debug('collecting cache in thread done')

                # Sleep only for the remainder of the interval so scrapes
                # stay on cadence; warn if collection alone overran it.
                sleep_time = self.mod.scrape_interval - duration
                if sleep_time < 0:
                    self.mod.log.warning(
                        'Collecting data took more time than configured scrape interval. '
                        'This possibly results in stale data. Please check the '
                        '`stale_cache_strategy` configuration option. '
                        'Collecting data took {:.2f} seconds but scrape interval is configured '
                        'to be {:.0f} seconds.'.format(
                            duration,
                            self.mod.scrape_interval,
                        )
                    )
                    sleep_time = 0

                # Publish cache and timing atomically so the HTTP handler
                # never observes a half-updated pair.
                with self.mod.collect_lock:
                    self.mod.collect_cache = data
                    self.mod.collect_time = duration

                time.sleep(sleep_time)
            else:
                self.mod.log.error('No MON connection')
                time.sleep(self.mod.scrape_interval)
219 | ||
220 | ||
91327a77 AA |
221 | class Module(MgrModule): |
222 | COMMANDS = [ | |
223 | { | |
11fdf7f2 TL |
224 | "cmd": "prometheus file_sd_config", |
225 | "desc": "Return file_sd compatible prometheus config for mgr cluster", | |
226 | "perm": "r" | |
91327a77 AA |
227 | }, |
228 | ] | |
229 | ||
11fdf7f2 TL |
230 | MODULE_OPTIONS = [ |
231 | {'name': 'server_addr'}, | |
232 | {'name': 'server_port'}, | |
233 | {'name': 'scrape_interval'}, | |
f6b5b4d7 | 234 | {'name': 'stale_cache_strategy'}, |
11fdf7f2 | 235 | {'name': 'rbd_stats_pools'}, |
e306af50 | 236 | {'name': 'rbd_stats_pools_refresh_interval', 'type': 'int', 'default': 300}, |
91327a77 AA |
237 | ] |
238 | ||
f6b5b4d7 TL |
239 | STALE_CACHE_FAIL = 'fail' |
240 | STALE_CACHE_RETURN = 'return' | |
241 | ||
91327a77 AA |
    def __init__(self, *args, **kwargs):
        super(Module, self).__init__(*args, **kwargs)
        # Static metric definitions; sample values are filled per collection.
        self.metrics = self._setup_static_metrics()
        self.shutdown_event = threading.Event()
        # Guards collect_cache/collect_time shared with MetricCollectionThread.
        self.collect_lock = threading.Lock()
        self.collect_time = 0.0
        # Defaults; presumably overridden from module options later — the
        # code that reads the options is outside this view.
        self.scrape_interval = 15.0
        self.stale_cache_strategy = self.STALE_CACHE_FAIL
        self.collect_cache = None
        # State for per-RBD-image stats gathered via dynamic OSD perf queries
        # (see get_rbd_stats).
        self.rbd_stats = {
            'pools': {},
            'pools_refresh_time': 0,
            'counters_info': {
                'write_ops': {'type': self.PERFCOUNTER_COUNTER,
                              'desc': 'RBD image writes count'},
                'read_ops': {'type': self.PERFCOUNTER_COUNTER,
                             'desc': 'RBD image reads count'},
                'write_bytes': {'type': self.PERFCOUNTER_COUNTER,
                                'desc': 'RBD image bytes written'},
                'read_bytes': {'type': self.PERFCOUNTER_COUNTER,
                               'desc': 'RBD image bytes read'},
                'write_latency': {'type': self.PERFCOUNTER_LONGRUNAVG,
                                  'desc': 'RBD image writes latency (msec)'},
                'read_latency': {'type': self.PERFCOUNTER_LONGRUNAVG,
                                 'desc': 'RBD image reads latency (msec)'},
            },
        }  # type: Dict[str, Any]
        # Expose this instance at module level and start the collector.
        global _global_instance
        _global_instance = self
        MetricCollectionThread(_global_instance).start()
3efd9988 FG |
272 | |
273 | def _setup_static_metrics(self): | |
274 | metrics = {} | |
275 | metrics['health_status'] = Metric( | |
276 | 'untyped', | |
277 | 'health_status', | |
278 | 'Cluster health status' | |
279 | ) | |
94b18763 | 280 | metrics['mon_quorum_status'] = Metric( |
3efd9988 | 281 | 'gauge', |
94b18763 FG |
282 | 'mon_quorum_status', |
283 | 'Monitors in quorum', | |
284 | ('ceph_daemon',) | |
285 | ) | |
286 | metrics['fs_metadata'] = Metric( | |
287 | 'untyped', | |
288 | 'fs_metadata', | |
289 | 'FS Metadata', | |
290 | FS_METADATA | |
291 | ) | |
292 | metrics['mds_metadata'] = Metric( | |
293 | 'untyped', | |
294 | 'mds_metadata', | |
295 | 'MDS Metadata', | |
296 | MDS_METADATA | |
297 | ) | |
298 | metrics['mon_metadata'] = Metric( | |
299 | 'untyped', | |
300 | 'mon_metadata', | |
301 | 'MON Metadata', | |
302 | MON_METADATA | |
3efd9988 | 303 | ) |
494da23a TL |
304 | metrics['mgr_metadata'] = Metric( |
305 | 'gauge', | |
306 | 'mgr_metadata', | |
307 | 'MGR metadata', | |
308 | MGR_METADATA | |
309 | ) | |
310 | metrics['mgr_status'] = Metric( | |
311 | 'gauge', | |
312 | 'mgr_status', | |
313 | 'MGR status (0=standby, 1=active)', | |
314 | MGR_STATUS | |
315 | ) | |
316 | metrics['mgr_module_status'] = Metric( | |
317 | 'gauge', | |
318 | 'mgr_module_status', | |
319 | 'MGR module status (0=disabled, 1=enabled, 2=auto-enabled)', | |
320 | MGR_MODULE_STATUS | |
321 | ) | |
322 | metrics['mgr_module_can_run'] = Metric( | |
323 | 'gauge', | |
324 | 'mgr_module_can_run', | |
325 | 'MGR module runnable state i.e. can it run (0=no, 1=yes)', | |
326 | MGR_MODULE_CAN_RUN | |
327 | ) | |
3efd9988 FG |
328 | metrics['osd_metadata'] = Metric( |
329 | 'untyped', | |
330 | 'osd_metadata', | |
331 | 'OSD Metadata', | |
332 | OSD_METADATA | |
333 | ) | |
c07f9fc5 | 334 | |
3efd9988 FG |
335 | # The reason for having this separate to OSD_METADATA is |
336 | # so that we can stably use the same tag names that | |
337 | # the Prometheus node_exporter does | |
338 | metrics['disk_occupation'] = Metric( | |
b32b8144 | 339 | 'untyped', |
3efd9988 FG |
340 | 'disk_occupation', |
341 | 'Associate Ceph daemon with disk used', | |
342 | DISK_OCCUPATION | |
343 | ) | |
c07f9fc5 | 344 | |
3efd9988 FG |
345 | metrics['pool_metadata'] = Metric( |
346 | 'untyped', | |
347 | 'pool_metadata', | |
348 | 'POOL Metadata', | |
349 | POOL_METADATA | |
350 | ) | |
94b18763 FG |
351 | |
352 | metrics['rgw_metadata'] = Metric( | |
353 | 'untyped', | |
354 | 'rgw_metadata', | |
355 | 'RGW Metadata', | |
356 | RGW_METADATA | |
357 | ) | |
358 | ||
11fdf7f2 TL |
359 | metrics['rbd_mirror_metadata'] = Metric( |
360 | 'untyped', | |
361 | 'rbd_mirror_metadata', | |
362 | 'RBD Mirror Metadata', | |
363 | RBD_MIRROR_METADATA | |
364 | ) | |
365 | ||
94b18763 FG |
366 | metrics['pg_total'] = Metric( |
367 | 'gauge', | |
368 | 'pg_total', | |
92f5a8d4 TL |
369 | 'PG Total Count per Pool', |
370 | ('pool_id',) | |
94b18763 FG |
371 | ) |
372 | ||
373 | for flag in OSD_FLAGS: | |
374 | path = 'osd_flag_{}'.format(flag) | |
375 | metrics[path] = Metric( | |
376 | 'untyped', | |
377 | path, | |
378 | 'OSD Flag {}'.format(flag) | |
379 | ) | |
3efd9988 FG |
380 | for state in OSD_STATUS: |
381 | path = 'osd_{}'.format(state) | |
3efd9988 FG |
382 | metrics[path] = Metric( |
383 | 'untyped', | |
c07f9fc5 | 384 | path, |
3efd9988 FG |
385 | 'OSD status {}'.format(state), |
386 | ('ceph_daemon',) | |
c07f9fc5 | 387 | ) |
b32b8144 FG |
388 | for stat in OSD_STATS: |
389 | path = 'osd_{}'.format(stat) | |
b32b8144 FG |
390 | metrics[path] = Metric( |
391 | 'gauge', | |
392 | path, | |
393 | 'OSD stat {}'.format(stat), | |
394 | ('ceph_daemon',) | |
395 | ) | |
11fdf7f2 TL |
396 | for stat in OSD_POOL_STATS: |
397 | path = 'pool_{}'.format(stat) | |
398 | metrics[path] = Metric( | |
399 | 'gauge', | |
400 | path, | |
9f95a23c | 401 | "OSD pool stats: {}".format(stat), |
11fdf7f2 TL |
402 | ('pool_id',) |
403 | ) | |
3efd9988 FG |
404 | for state in PG_STATES: |
405 | path = 'pg_{}'.format(state) | |
3efd9988 FG |
406 | metrics[path] = Metric( |
407 | 'gauge', | |
408 | path, | |
92f5a8d4 TL |
409 | 'PG {} per pool'.format(state), |
410 | ('pool_id',) | |
3efd9988 FG |
411 | ) |
412 | for state in DF_CLUSTER: | |
413 | path = 'cluster_{}'.format(state) | |
3efd9988 FG |
414 | metrics[path] = Metric( |
415 | 'gauge', | |
416 | path, | |
417 | 'DF {}'.format(state), | |
418 | ) | |
419 | for state in DF_POOL: | |
420 | path = 'pool_{}'.format(state) | |
3efd9988 FG |
421 | metrics[path] = Metric( |
422 | 'gauge', | |
423 | path, | |
424 | 'DF pool {}'.format(state), | |
425 | ('pool_id',) | |
426 | ) | |
28e407b8 AA |
427 | for state in NUM_OBJECTS: |
428 | path = 'num_objects_{}'.format(state) | |
429 | metrics[path] = Metric( | |
430 | 'gauge', | |
431 | path, | |
432 | 'Number of {} objects'.format(state), | |
433 | ) | |
3efd9988 FG |
434 | |
435 | return metrics | |
c07f9fc5 | 436 | |
f6b5b4d7 | 437 | @profile_method() |
3efd9988 FG |
438 | def get_health(self): |
439 | health = json.loads(self.get('health')['json']) | |
91327a77 AA |
440 | self.metrics['health_status'].set( |
441 | health_status_to_number(health['status']) | |
c07f9fc5 FG |
442 | ) |
443 | ||
f6b5b4d7 | 444 | @profile_method() |
11fdf7f2 TL |
445 | def get_pool_stats(self): |
446 | # retrieve pool stats to provide per pool recovery metrics | |
447 | # (osd_pool_stats moved to mgr in Mimic) | |
448 | pstats = self.get('osd_pool_stats') | |
449 | for pool in pstats['pool_stats']: | |
450 | for stat in OSD_POOL_STATS: | |
451 | self.metrics['pool_{}'.format(stat)].set( | |
452 | pool['recovery_rate'].get(stat, 0), | |
453 | (pool['pool_id'],) | |
454 | ) | |
455 | ||
f6b5b4d7 | 456 | @profile_method() |
3efd9988 FG |
457 | def get_df(self): |
458 | # maybe get the to-be-exported metrics from a config? | |
459 | df = self.get('df') | |
460 | for stat in DF_CLUSTER: | |
91327a77 | 461 | self.metrics['cluster_{}'.format(stat)].set(df['stats'][stat]) |
3efd9988 FG |
462 | |
463 | for pool in df['pools']: | |
464 | for stat in DF_POOL: | |
91327a77 AA |
465 | self.metrics['pool_{}'.format(stat)].set( |
466 | pool['stats'][stat], | |
467 | (pool['id'],) | |
468 | ) | |
94b18763 | 469 | |
f6b5b4d7 | 470 | @profile_method() |
94b18763 FG |
471 | def get_fs(self): |
472 | fs_map = self.get('fs_map') | |
473 | servers = self.get_service_list() | |
9f95a23c TL |
474 | self.log.debug('standbys: {}'.format(fs_map['standbys'])) |
475 | # export standby mds metadata, default standby fs_id is '-1' | |
476 | for standby in fs_map['standbys']: | |
477 | id_ = standby['name'] | |
478 | host_version = servers.get((id_, 'mds'), ('', '')) | |
479 | self.metrics['mds_metadata'].set(1, ( | |
480 | 'mds.{}'.format(id_), '-1', | |
481 | host_version[0], standby['addr'], | |
482 | standby['rank'], host_version[1] | |
483 | )) | |
94b18763 FG |
484 | for fs in fs_map['filesystems']: |
485 | # collect fs metadata | |
11fdf7f2 TL |
486 | data_pools = ",".join([str(pool) |
487 | for pool in fs['mdsmap']['data_pools']]) | |
91327a77 AA |
488 | self.metrics['fs_metadata'].set(1, ( |
489 | data_pools, | |
490 | fs['id'], | |
491 | fs['mdsmap']['metadata_pool'], | |
492 | fs['mdsmap']['fs_name'] | |
493 | )) | |
28e407b8 | 494 | self.log.debug('mdsmap: {}'.format(fs['mdsmap'])) |
94b18763 FG |
495 | for gid, daemon in fs['mdsmap']['info'].items(): |
496 | id_ = daemon['name'] | |
11fdf7f2 | 497 | host_version = servers.get((id_, 'mds'), ('', '')) |
91327a77 AA |
498 | self.metrics['mds_metadata'].set(1, ( |
499 | 'mds.{}'.format(id_), fs['id'], | |
500 | host_version[0], daemon['addr'], | |
501 | daemon['rank'], host_version[1] | |
502 | )) | |
3efd9988 | 503 | |
f6b5b4d7 | 504 | @profile_method() |
3efd9988 FG |
505 | def get_quorum_status(self): |
506 | mon_status = json.loads(self.get('mon_status')['json']) | |
94b18763 FG |
507 | servers = self.get_service_list() |
508 | for mon in mon_status['monmap']['mons']: | |
509 | rank = mon['rank'] | |
510 | id_ = mon['name'] | |
11fdf7f2 | 511 | host_version = servers.get((id_, 'mon'), ('', '')) |
91327a77 AA |
512 | self.metrics['mon_metadata'].set(1, ( |
513 | 'mon.{}'.format(id_), host_version[0], | |
f91f0fd5 | 514 | mon['public_addr'].rsplit(':', 1)[0], rank, |
91327a77 AA |
515 | host_version[1] |
516 | )) | |
94b18763 | 517 | in_quorum = int(rank in mon_status['quorum']) |
91327a77 AA |
518 | self.metrics['mon_quorum_status'].set(in_quorum, ( |
519 | 'mon.{}'.format(id_), | |
520 | )) | |
3efd9988 | 521 | |
f6b5b4d7 | 522 | @profile_method() |
494da23a TL |
523 | def get_mgr_status(self): |
524 | mgr_map = self.get('mgr_map') | |
525 | servers = self.get_service_list() | |
526 | ||
527 | active = mgr_map['active_name'] | |
528 | standbys = [s.get('name') for s in mgr_map['standbys']] | |
529 | ||
530 | all_mgrs = list(standbys) | |
531 | all_mgrs.append(active) | |
532 | ||
533 | all_modules = {module.get('name'):module.get('can_run') for module in mgr_map['available_modules']} | |
534 | ||
eafe8130 | 535 | ceph_release = None |
494da23a TL |
536 | for mgr in all_mgrs: |
537 | host_version = servers.get((mgr, 'mgr'), ('', '')) | |
538 | if mgr == active: | |
539 | _state = 1 | |
540 | ceph_release = host_version[1].split()[-2] # e.g. nautilus | |
541 | else: | |
542 | _state = 0 | |
801d1391 | 543 | |
494da23a TL |
544 | self.metrics['mgr_metadata'].set(1, ( |
545 | 'mgr.{}'.format(mgr), host_version[0], | |
546 | host_version[1] | |
547 | )) | |
548 | self.metrics['mgr_status'].set(_state, ( | |
801d1391 | 549 | 'mgr.{}'.format(mgr), |
494da23a | 550 | )) |
eafe8130 | 551 | always_on_modules = mgr_map['always_on_modules'].get(ceph_release, []) |
494da23a TL |
552 | active_modules = list(always_on_modules) |
553 | active_modules.extend(mgr_map['modules']) | |
554 | ||
555 | for mod_name in all_modules.keys(): | |
556 | ||
557 | if mod_name in always_on_modules: | |
558 | _state = 2 | |
559 | elif mod_name in active_modules: | |
560 | _state = 1 | |
561 | else: | |
562 | _state = 0 | |
563 | ||
564 | _can_run = 1 if all_modules[mod_name] else 0 | |
565 | self.metrics['mgr_module_status'].set(_state, (mod_name,)) | |
566 | self.metrics['mgr_module_can_run'].set(_can_run, (mod_name,)) | |
567 | ||
f6b5b4d7 | 568 | @profile_method() |
3efd9988 | 569 | def get_pg_status(self): |
94b18763 | 570 | |
92f5a8d4 TL |
571 | pg_summary = self.get('pg_summary') |
572 | ||
573 | for pool in pg_summary['by_pool']: | |
801d1391 TL |
574 | num_by_state = dict((state, 0) for state in PG_STATES) |
575 | num_by_state['total'] = 0 | |
92f5a8d4 | 576 | |
801d1391 | 577 | for state_name, count in pg_summary['by_pool'][pool].items(): |
92f5a8d4 | 578 | for state in state_name.split('+'): |
801d1391 TL |
579 | num_by_state[state] += count |
580 | num_by_state['total'] += count | |
581 | ||
582 | for state, num in num_by_state.items(): | |
583 | try: | |
584 | self.metrics["pg_{}".format(state)].set(num, (pool,)) | |
585 | except KeyError: | |
e306af50 | 586 | self.log.warning("skipping pg in unknown state {}".format(state)) |
b32b8144 | 587 | |
f6b5b4d7 | 588 | @profile_method() |
b32b8144 FG |
589 | def get_osd_stats(self): |
590 | osd_stats = self.get('osd_stats') | |
591 | for osd in osd_stats['osd_stats']: | |
592 | id_ = osd['osd'] | |
593 | for stat in OSD_STATS: | |
94b18763 | 594 | val = osd['perf_stat'][stat] |
91327a77 AA |
595 | self.metrics['osd_{}'.format(stat)].set(val, ( |
596 | 'osd.{}'.format(id_), | |
597 | )) | |
94b18763 FG |
598 | |
599 | def get_service_list(self): | |
600 | ret = {} | |
601 | for server in self.list_servers(): | |
602 | version = server.get('ceph_version', '') | |
603 | host = server.get('hostname', '') | |
604 | for service in server.get('services', []): | |
605 | ret.update({(service['id'], service['type']): (host, version)}) | |
606 | return ret | |
3efd9988 | 607 | |
    @profile_method()
    def get_metadata_and_osd_status(self):
        """Export OSD flags, per-OSD metadata/status, disk-occupation
        records, pool metadata, and rgw / rbd-mirror daemon metadata."""
        osd_map = self.get('osd_map')
        # One 0/1 metric per known flag, derived from the comma-separated
        # flag string in the OSD map.
        osd_flags = osd_map['flags'].split(',')
        for flag in OSD_FLAGS:
            self.metrics['osd_flag_{}'.format(flag)].set(
                int(flag in osd_flags)
            )

        osd_devices = self.get('osd_map_crush')['devices']
        servers = self.get_service_list()
        for osd in osd_map['osds']:
            # id can be used to link osd metrics and metadata
            id_ = osd['osd']
            # collect osd metadata
            # rsplit keeps IPv6 addresses intact; only the ':port' tail goes.
            p_addr = osd['public_addr'].rsplit(':', 1)[0]
            c_addr = osd['cluster_addr'].rsplit(':', 1)[0]
            if p_addr == "-" or c_addr == "-":
                self.log.info(
                    "Missing address metadata for osd {0}, skipping occupation"
                    " and metadata records for this osd".format(id_)
                )
                continue

            # Device class comes from the CRUSH map, not the OSD map.
            dev_class = None
            for osd_device in osd_devices:
                if osd_device['id'] == id_:
                    dev_class = osd_device.get('class', '')
                    break

            if dev_class is None:
                self.log.info("OSD {0} is missing from CRUSH map, "
                              "skipping output".format(id_))
                continue

            host_version = servers.get((str(id_), 'osd'), ('', ''))

            # collect disk occupation metadata
            osd_metadata = self.get_metadata("osd", str(id_))
            if osd_metadata is None:
                continue

            obj_store = osd_metadata.get('osd_objectstore', '')
            f_iface = osd_metadata.get('front_iface', '')
            b_iface = osd_metadata.get('back_iface', '')

            # Label order must match OSD_METADATA.
            self.metrics['osd_metadata'].set(1, (
                b_iface,
                'osd.{}'.format(id_),
                c_addr,
                dev_class,
                f_iface,
                host_version[0],
                obj_store,
                p_addr,
                host_version[1]
            ))

            # collect osd status
            for state in OSD_STATUS:
                status = osd[state]
                self.metrics['osd_{}'.format(state)].set(status, (
                    'osd.{}'.format(id_),
                ))

            # Backend device names depend on the objectstore flavour; for
            # any other objectstore value osd_dev_node stays None and the
            # occupation record is skipped below.
            osd_dev_node = None
            if obj_store == "filestore":
                # collect filestore backend device
                osd_dev_node = osd_metadata.get(
                    'backend_filestore_dev_node', None)
                # collect filestore journal device
                osd_wal_dev_node = osd_metadata.get('osd_journal', '')
                osd_db_dev_node = ''
            elif obj_store == "bluestore":
                # collect bluestore backend device
                osd_dev_node = osd_metadata.get(
                    'bluestore_bdev_dev_node', None)
                # collect bluestore wal backend
                osd_wal_dev_node = osd_metadata.get('bluefs_wal_dev_node', '')
                # collect bluestore db backend
                osd_db_dev_node = osd_metadata.get('bluefs_db_dev_node', '')
            if osd_dev_node and osd_dev_node == "unknown":
                osd_dev_node = None

            osd_hostname = osd_metadata.get('hostname', None)
            if osd_dev_node and osd_hostname:
                self.log.debug("Got dev for osd {0}: {1}/{2}".format(
                    id_, osd_hostname, osd_dev_node))
                self.metrics['disk_occupation'].set(1, (
                    "osd.{0}".format(id_),
                    osd_dev_node,
                    osd_db_dev_node,
                    osd_wal_dev_node,
                    osd_hostname
                ))
            else:
                self.log.info("Missing dev node metadata for osd {0}, skipping "
                              "occupation record for this osd".format(id_))

        for pool in osd_map['pools']:
            self.metrics['pool_metadata'].set(
                1, (pool['pool'], pool['pool_name']))

        # Populate other servers metadata
        for key, value in servers.items():
            service_id, service_type = key
            if service_type == 'rgw':
                hostname, version = value
                self.metrics['rgw_metadata'].set(
                    1,
                    ('{}.{}'.format(service_type, service_id),
                     hostname, version)
                )
            elif service_type == 'rbd-mirror':
                mirror_metadata = self.get_metadata('rbd-mirror', service_id)
                if mirror_metadata is None:
                    continue
                mirror_metadata['ceph_daemon'] = '{}.{}'.format(service_type,
                                                                service_id)
                # NOTE(review): this passes a generator expression, not a
                # tuple, as labelvalues; Metric.set documents that it must
                # be a tuple — likely should be tuple(...) — confirm.
                self.metrics['rbd_mirror_metadata'].set(
                    1, (mirror_metadata.get(k, '')
                        for k in RBD_MIRROR_METADATA)
                )
3efd9988 | 731 | |
f6b5b4d7 | 732 | @profile_method() |
28e407b8 AA |
733 | def get_num_objects(self): |
734 | pg_sum = self.get('pg_summary')['pg_stats_sum']['stat_sum'] | |
735 | for obj in NUM_OBJECTS: | |
736 | stat = 'num_objects_{}'.format(obj) | |
91327a77 | 737 | self.metrics[stat].set(pg_sum[stat]) |
28e407b8 | 738 | |
f6b5b4d7 | 739 | @profile_method() |
11fdf7f2 TL |
740 | def get_rbd_stats(self): |
741 | # Per RBD image stats is collected by registering a dynamic osd perf | |
742 | # stats query that tells OSDs to group stats for requests associated | |
743 | # with RBD objects by pool, namespace, and image id, which are | |
744 | # extracted from the request object names or other attributes. | |
745 | # The RBD object names have the following prefixes: | |
746 | # - rbd_data.{image_id}. (data stored in the same pool as metadata) | |
747 | # - rbd_data.{pool_id}.{image_id}. (data stored in a dedicated data pool) | |
748 | # - journal_data.{pool_id}.{image_id}. (journal if journaling is enabled) | |
749 | # The pool_id in the object name is the id of the pool with the image | |
750 | # metdata, and should be used in the image spec. If there is no pool_id | |
751 | # in the object name, the image pool is the pool where the object is | |
752 | # located. | |
753 | ||
754 | # Parse rbd_stats_pools option, which is a comma or space separated | |
755 | # list of pool[/namespace] entries. If no namespace is specifed the | |
f6b5b4d7 TL |
756 | # stats are collected for every namespace in the pool. The wildcard |
757 | # '*' can be used to indicate all pools or namespaces | |
11fdf7f2 | 758 | pools_string = self.get_localized_module_option('rbd_stats_pools', '') |
f6b5b4d7 TL |
759 | pool_keys = [] |
760 | for x in re.split('[\s,]+', pools_string): | |
761 | if not x: | |
762 | continue | |
763 | ||
764 | s = x.split('/', 2) | |
11fdf7f2 | 765 | pool_name = s[0] |
f6b5b4d7 TL |
766 | namespace_name = None |
767 | if len(s) == 2: | |
768 | namespace_name = s[1] | |
769 | ||
770 | if pool_name == "*": | |
771 | # collect for all pools | |
772 | osd_map = self.get('osd_map') | |
773 | for pool in osd_map['pools']: | |
774 | if 'rbd' not in pool.get('application_metadata', {}): | |
775 | continue | |
776 | pool_keys.append((pool['pool_name'], namespace_name)) | |
777 | else: | |
778 | pool_keys.append((pool_name, namespace_name)) | |
779 | ||
780 | pools = {} # type: Dict[str, Set[str]] | |
781 | for pool_key in pool_keys: | |
782 | pool_name = pool_key[0] | |
783 | namespace_name = pool_key[1] | |
784 | if not namespace_name or namespace_name == "*": | |
11fdf7f2 TL |
785 | # empty set means collect for all namespaces |
786 | pools[pool_name] = set() | |
787 | continue | |
f6b5b4d7 | 788 | |
11fdf7f2 TL |
789 | if pool_name not in pools: |
790 | pools[pool_name] = set() | |
791 | elif not pools[pool_name]: | |
792 | continue | |
f6b5b4d7 | 793 | pools[pool_name].add(namespace_name) |
11fdf7f2 TL |
794 | |
795 | rbd_stats_pools = {} | |
f6b5b4d7 | 796 | for pool_id in self.rbd_stats['pools'].keys(): |
11fdf7f2 TL |
797 | name = self.rbd_stats['pools'][pool_id]['name'] |
798 | if name not in pools: | |
799 | del self.rbd_stats['pools'][pool_id] | |
800 | else: | |
801 | rbd_stats_pools[name] = \ | |
802 | self.rbd_stats['pools'][pool_id]['ns_names'] | |
803 | ||
804 | pools_refreshed = False | |
805 | if pools: | |
806 | next_refresh = self.rbd_stats['pools_refresh_time'] + \ | |
807 | self.get_localized_module_option( | |
808 | 'rbd_stats_pools_refresh_interval', 300) | |
809 | if rbd_stats_pools != pools or time.time() >= next_refresh: | |
810 | self.refresh_rbd_stats_pools(pools) | |
811 | pools_refreshed = True | |
812 | ||
813 | pool_ids = list(self.rbd_stats['pools']) | |
814 | pool_ids.sort() | |
815 | pool_id_regex = '^(' + '|'.join([str(x) for x in pool_ids]) + ')$' | |
816 | ||
817 | nspace_names = [] | |
818 | for pool_id, pool in self.rbd_stats['pools'].items(): | |
819 | if pool['ns_names']: | |
820 | nspace_names.extend(pool['ns_names']) | |
821 | else: | |
822 | nspace_names = [] | |
823 | break | |
824 | if nspace_names: | |
825 | namespace_regex = '^(' + \ | |
826 | "|".join([re.escape(x) | |
827 | for x in set(nspace_names)]) + ')$' | |
828 | else: | |
829 | namespace_regex = '^(.*)$' | |
830 | ||
831 | if 'query' in self.rbd_stats and \ | |
832 | (pool_id_regex != self.rbd_stats['query']['key_descriptor'][0]['regex'] or | |
833 | namespace_regex != self.rbd_stats['query']['key_descriptor'][1]['regex']): | |
834 | self.remove_osd_perf_query(self.rbd_stats['query_id']) | |
835 | del self.rbd_stats['query_id'] | |
836 | del self.rbd_stats['query'] | |
837 | ||
838 | if not self.rbd_stats['pools']: | |
839 | return | |
840 | ||
841 | counters_info = self.rbd_stats['counters_info'] | |
842 | ||
843 | if 'query_id' not in self.rbd_stats: | |
844 | query = { | |
845 | 'key_descriptor': [ | |
846 | {'type': 'pool_id', 'regex': pool_id_regex}, | |
847 | {'type': 'namespace', 'regex': namespace_regex}, | |
848 | {'type': 'object_name', | |
849 | 'regex': '^(?:rbd|journal)_data\.(?:([0-9]+)\.)?([^.]+)\.'}, | |
850 | ], | |
851 | 'performance_counter_descriptors': list(counters_info), | |
852 | } | |
853 | query_id = self.add_osd_perf_query(query) | |
854 | if query_id is None: | |
855 | self.log.error('failed to add query %s' % query) | |
856 | return | |
857 | self.rbd_stats['query'] = query | |
858 | self.rbd_stats['query_id'] = query_id | |
859 | ||
860 | res = self.get_osd_perf_counters(self.rbd_stats['query_id']) | |
861 | for c in res['counters']: | |
862 | # if the pool id is not found in the object name use id of the | |
863 | # pool where the object is located | |
864 | if c['k'][2][0]: | |
865 | pool_id = int(c['k'][2][0]) | |
866 | else: | |
867 | pool_id = int(c['k'][0][0]) | |
868 | if pool_id not in self.rbd_stats['pools'] and not pools_refreshed: | |
869 | self.refresh_rbd_stats_pools(pools) | |
870 | pools_refreshed = True | |
871 | if pool_id not in self.rbd_stats['pools']: | |
872 | continue | |
873 | pool = self.rbd_stats['pools'][pool_id] | |
874 | nspace_name = c['k'][1][0] | |
875 | if nspace_name not in pool['images']: | |
876 | continue | |
877 | image_id = c['k'][2][1] | |
878 | if image_id not in pool['images'][nspace_name] and \ | |
879 | not pools_refreshed: | |
880 | self.refresh_rbd_stats_pools(pools) | |
881 | pool = self.rbd_stats['pools'][pool_id] | |
882 | pools_refreshed = True | |
883 | if image_id not in pool['images'][nspace_name]: | |
884 | continue | |
885 | counters = pool['images'][nspace_name][image_id]['c'] | |
886 | for i in range(len(c['c'])): | |
887 | counters[i][0] += c['c'][i][0] | |
888 | counters[i][1] += c['c'][i][1] | |
889 | ||
890 | label_names = ("pool", "namespace", "image") | |
891 | for pool_id, pool in self.rbd_stats['pools'].items(): | |
892 | pool_name = pool['name'] | |
893 | for nspace_name, images in pool['images'].items(): | |
894 | for image_id in images: | |
895 | image_name = images[image_id]['n'] | |
896 | counters = images[image_id]['c'] | |
897 | i = 0 | |
898 | for key in counters_info: | |
899 | counter_info = counters_info[key] | |
900 | stattype = self._stattype_to_str(counter_info['type']) | |
901 | labels = (pool_name, nspace_name, image_name) | |
902 | if counter_info['type'] == self.PERFCOUNTER_COUNTER: | |
903 | path = 'rbd_' + key | |
904 | if path not in self.metrics: | |
905 | self.metrics[path] = Metric( | |
906 | stattype, | |
907 | path, | |
908 | counter_info['desc'], | |
909 | label_names, | |
910 | ) | |
911 | self.metrics[path].set(counters[i][0], labels) | |
912 | elif counter_info['type'] == self.PERFCOUNTER_LONGRUNAVG: | |
913 | path = 'rbd_' + key + '_sum' | |
914 | if path not in self.metrics: | |
915 | self.metrics[path] = Metric( | |
916 | stattype, | |
917 | path, | |
918 | counter_info['desc'] + ' Total', | |
919 | label_names, | |
920 | ) | |
921 | self.metrics[path].set(counters[i][0], labels) | |
922 | path = 'rbd_' + key + '_count' | |
923 | if path not in self.metrics: | |
924 | self.metrics[path] = Metric( | |
925 | 'counter', | |
926 | path, | |
927 | counter_info['desc'] + ' Count', | |
928 | label_names, | |
929 | ) | |
930 | self.metrics[path].set(counters[i][1], labels) | |
931 | i += 1 | |
932 | ||
933 | def refresh_rbd_stats_pools(self, pools): | |
934 | self.log.debug('refreshing rbd pools %s' % (pools)) | |
935 | ||
936 | rbd = RBD() | |
937 | counters_info = self.rbd_stats['counters_info'] | |
938 | for pool_name, cfg_ns_names in pools.items(): | |
939 | try: | |
940 | pool_id = self.rados.pool_lookup(pool_name) | |
941 | with self.rados.open_ioctx(pool_name) as ioctx: | |
942 | if pool_id not in self.rbd_stats['pools']: | |
943 | self.rbd_stats['pools'][pool_id] = {'images': {}} | |
944 | pool = self.rbd_stats['pools'][pool_id] | |
945 | pool['name'] = pool_name | |
946 | pool['ns_names'] = cfg_ns_names | |
947 | if cfg_ns_names: | |
948 | nspace_names = list(cfg_ns_names) | |
949 | else: | |
950 | nspace_names = [''] + rbd.namespace_list(ioctx) | |
951 | for nspace_name in pool['images']: | |
952 | if nspace_name not in nspace_names: | |
953 | del pool['images'][nspace_name] | |
954 | for nspace_name in nspace_names: | |
955 | if (nspace_name and | |
956 | not rbd.namespace_exists(ioctx, nspace_name)): | |
957 | self.log.debug('unknown namespace %s for pool %s' % | |
958 | (nspace_name, pool_name)) | |
959 | continue | |
960 | ioctx.set_namespace(nspace_name) | |
961 | if nspace_name not in pool['images']: | |
962 | pool['images'][nspace_name] = {} | |
963 | namespace = pool['images'][nspace_name] | |
964 | images = {} | |
965 | for image_meta in RBD().list2(ioctx): | |
966 | image = {'n': image_meta['name']} | |
967 | image_id = image_meta['id'] | |
968 | if image_id in namespace: | |
969 | image['c'] = namespace[image_id]['c'] | |
970 | else: | |
971 | image['c'] = [[0, 0] for x in counters_info] | |
972 | images[image_id] = image | |
973 | pool['images'][nspace_name] = images | |
974 | except Exception as e: | |
975 | self.log.error('failed listing pool %s: %s' % (pool_name, e)) | |
976 | self.rbd_stats['pools_refresh_time'] = time.time() | |
977 | ||
978 | def shutdown_rbd_stats(self): | |
979 | if 'query_id' in self.rbd_stats: | |
980 | self.remove_osd_perf_query(self.rbd_stats['query_id']) | |
981 | del self.rbd_stats['query_id'] | |
982 | del self.rbd_stats['query'] | |
983 | self.rbd_stats['pools'].clear() | |
984 | ||
e306af50 TL |
985 | def add_fixed_name_metrics(self): |
986 | """ | |
987 | Add fixed name metrics from existing ones that have details in their names | |
988 | that should be in labels (not in name). | |
989 | For backward compatibility, a new fixed name metric is created (instead of replacing) | |
990 | and details are put in new labels. | |
991 | Intended for RGW sync perf. counters but extendable as required. | |
992 | See: https://tracker.ceph.com/issues/45311 | |
993 | """ | |
994 | new_metrics = {} | |
995 | for metric_path in self.metrics.keys(): | |
996 | # Address RGW sync perf. counters. | |
997 | match = re.search('^data-sync-from-(.*)\.', metric_path) | |
998 | if match: | |
999 | new_path = re.sub('from-([^.]*)', 'from-zone', metric_path) | |
1000 | if new_path not in new_metrics: | |
1001 | new_metrics[new_path] = Metric( | |
1002 | self.metrics[metric_path].mtype, | |
1003 | new_path, | |
1004 | self.metrics[metric_path].desc, | |
1005 | self.metrics[metric_path].labelnames + ('source_zone',) | |
1006 | ) | |
1007 | for label_values, value in self.metrics[metric_path].value.items(): | |
1008 | new_metrics[new_path].set(value, label_values + (match.group(1),)) | |
1009 | ||
1010 | self.metrics.update(new_metrics) | |
1011 | ||
    @profile_method(True)
    def collect(self):
        """Gather all metrics and return them in Prometheus exposition format.

        Clears the per-scrape metric values, refreshes every metric family
        (health, df, pool stats, fs, osd, quorum, mgr, metadata/osd status,
        pg status, object counts), folds in all per-daemon perf counters,
        fixed-name metrics and RBD stats, then renders and returns the
        concatenated text.
        """
        # Clear the metrics before scraping
        for k in self.metrics.keys():
            self.metrics[k].clear()

        self.get_health()
        self.get_df()
        self.get_pool_stats()
        self.get_fs()
        self.get_osd_stats()
        self.get_quorum_status()
        self.get_mgr_status()
        self.get_metadata_and_osd_status()
        self.get_pg_status()
        self.get_num_objects()

        for daemon, counters in self.get_all_perf_counters().items():
            for path, counter_info in counters.items():
                # Skip histograms, they are represented by long running avgs
                stattype = self._stattype_to_str(counter_info['type'])
                if not stattype or stattype == 'histogram':
                    self.log.debug('ignoring %s, type %s' % (path, stattype))
                    continue

                # Rewrite the perf-counter path into a metric path plus
                # label names/values for this daemon.
                path, label_names, labels = self._perfpath_to_path_labels(
                    daemon, path)

                # Get the value of the counter
                value = self._perfvalue_to_value(
                    counter_info['type'], counter_info['value'])

                # Represent the long running avgs as sum/count pairs
                if counter_info['type'] & self.PERFCOUNTER_LONGRUNAVG:
                    _path = path + '_sum'
                    if _path not in self.metrics:
                        self.metrics[_path] = Metric(
                            stattype,
                            _path,
                            counter_info['description'] + ' Total',
                            label_names,
                        )
                    self.metrics[_path].set(value, labels)

                    _path = path + '_count'
                    if _path not in self.metrics:
                        self.metrics[_path] = Metric(
                            'counter',
                            _path,
                            counter_info['description'] + ' Count',
                            label_names,
                        )
                    self.metrics[_path].set(counter_info['count'], labels,)
                else:
                    if path not in self.metrics:
                        self.metrics[path] = Metric(
                            stattype,
                            path,
                            counter_info['description'],
                            label_names,
                        )
                    self.metrics[path].set(value, labels)

        self.add_fixed_name_metrics()
        self.get_rbd_stats()

        # Return formatted metrics and clear no longer used data
        _metrics = [m.str_expfmt() for m in self.metrics.values()]
        for k in self.metrics.keys():
            self.metrics[k].clear()

        return ''.join(_metrics) + '\n'
c07f9fc5 | 1084 | |
11fdf7f2 TL |
1085 | def get_file_sd_config(self): |
1086 | servers = self.list_servers() | |
1087 | targets = [] | |
1088 | for server in servers: | |
1089 | hostname = server.get('hostname', '') | |
1090 | for service in server.get('services', []): | |
1091 | if service['type'] != 'mgr': | |
1092 | continue | |
1093 | id_ = service['id'] | |
1094 | # get port for prometheus module at mgr with id_ | |
1095 | # TODO use get_config_prefix or get_config here once | |
1096 | # https://github.com/ceph/ceph/pull/20458 is merged | |
1097 | result = CommandResult("") | |
f6b5b4d7 TL |
1098 | assert isinstance(_global_instance, Module) |
1099 | _global_instance.send_command( | |
11fdf7f2 TL |
1100 | result, "mon", '', |
1101 | json.dumps({ | |
1102 | "prefix": "config-key get", | |
1103 | 'key': "config/mgr/mgr/prometheus/{}/server_port".format(id_), | |
1104 | }), | |
1105 | "") | |
1106 | r, outb, outs = result.wait() | |
1107 | if r != 0: | |
f6b5b4d7 | 1108 | _global_instance.log.error("Failed to retrieve port for mgr {}: {}".format(id_, outs)) |
11fdf7f2 TL |
1109 | targets.append('{}:{}'.format(hostname, DEFAULT_PORT)) |
1110 | else: | |
1111 | port = json.loads(outb) | |
1112 | targets.append('{}:{}'.format(hostname, port)) | |
1113 | ||
1114 | ret = [ | |
1115 | { | |
1116 | "targets": targets, | |
1117 | "labels": {} | |
1118 | } | |
1119 | ] | |
1120 | return 0, json.dumps(ret), "" | |
1121 | ||
    def self_test(self):
        """Smoke-test the module: run a full metrics collection and the
        file_sd config generation, letting any exception propagate."""
        self.collect()
        self.get_file_sd_config()
1125 | ||
1126 | def handle_command(self, inbuf, cmd): | |
1127 | if cmd['prefix'] == 'prometheus file_sd_config': | |
1128 | return self.get_file_sd_config() | |
3efd9988 FG |
1129 | else: |
1130 | return (-errno.EINVAL, '', | |
1131 | "Command not found '{0}'".format(cmd['prefix'])) | |
c07f9fc5 FG |
1132 | |
    def serve(self):
        """Run the metrics HTTP server until shutdown() is signalled.

        Reads the scrape-interval and stale-cache options, publishes the
        exporter URI, mounts the CherryPy Root app and blocks on
        shutdown_event; on wakeup it stops the engine and tears down the
        RBD stats query.
        """

        class Root(object):

            # collapse everything to '/'
            def _cp_dispatch(self, vpath):
                cherrypy.request.path = ''
                return self

            @cherrypy.expose
            def index(self):
                return '''<!DOCTYPE html>
<html>
    <head><title>Ceph Exporter</title></head>
    <body>
        <h1>Ceph Exporter</h1>
        <p><a href='/metrics'>Metrics</a></p>
    </body>
</html>'''

            @cherrypy.expose
            def metrics(self):
                # Lock the function execution
                assert isinstance(_global_instance, Module)
                with _global_instance.collect_lock:
                    return self._metrics(_global_instance)

            @staticmethod
            def _metrics(instance):
                # type: (Module) -> Any
                # Return cached data if available
                if not instance.collect_cache:
                    raise cherrypy.HTTPError(503, 'No cached data available yet')

                def respond():
                    assert isinstance(instance, Module)
                    cherrypy.response.headers['Content-Type'] = 'text/plain'
                    return instance.collect_cache

                if instance.collect_time < instance.scrape_interval:
                    # Respond if cache isn't stale
                    return respond()

                if instance.stale_cache_strategy == instance.STALE_CACHE_RETURN:
                    # Respond even if cache is stale
                    instance.log.info(
                        'Gathering data took {:.2f} seconds, metrics are stale for {:.2f} seconds, '
                        'returning metrics from stale cache.'.format(
                            instance.collect_time,
                            instance.collect_time - instance.scrape_interval
                        )
                    )
                    return respond()

                if instance.stale_cache_strategy == instance.STALE_CACHE_FAIL:
                    # Fail if cache is stale
                    msg = (
                        'Gathering data took {:.2f} seconds, metrics are stale for {:.2f} seconds, '
                        'returning "service unavailable".'.format(
                            instance.collect_time,
                            instance.collect_time - instance.scrape_interval,
                        )
                    )
                    instance.log.error(msg)
                    raise cherrypy.HTTPError(503, msg)

        # Make the cache timeout for collecting configurable
        self.scrape_interval = float(self.get_localized_module_option('scrape_interval', 15.0))

        # Fall back to the fail strategy for any unrecognized setting.
        self.stale_cache_strategy = self.get_localized_module_option('stale_cache_strategy', 'log')
        if self.stale_cache_strategy not in [self.STALE_CACHE_FAIL,
                                             self.STALE_CACHE_RETURN]:
            self.stale_cache_strategy = self.STALE_CACHE_FAIL

        server_addr = self.get_localized_module_option(
            'server_addr', get_default_addr())
        server_port = self.get_localized_module_option(
            'server_port', DEFAULT_PORT)
        self.log.info(
            "server_addr: %s server_port: %s" %
            (server_addr, server_port)
        )

        # Publish the URI that others may use to access the service we're
        # about to start serving
        self.set_uri('http://{0}:{1}/'.format(
            socket.getfqdn() if server_addr in ['::', '0.0.0.0'] else server_addr,
            server_port
        ))

        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })
        cherrypy.tree.mount(Root(), "/")
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info('Engine started.')
        # wait for the shutdown event
        self.shutdown_event.wait()
        self.shutdown_event.clear()
        cherrypy.engine.stop()
        self.log.info('Engine stopped.')
        self.shutdown_rbd_stats()
94b18763 FG |
1238 | |
    def shutdown(self):
        """Signal serve() to stop the CherryPy engine and return."""
        self.log.info('Stopping engine...')
        self.shutdown_event.set()
94b18763 FG |
1242 | |
1243 | ||
class StandbyModule(MgrStandbyModule):
    """Standby-mode prometheus module.

    Serves a trivial endpoint that points visitors at the active mgr's
    exporter and answers /metrics with an empty body so scrapers do not
    error out while this mgr is on standby.
    """

    def __init__(self, *args, **kwargs):
        super(StandbyModule, self).__init__(*args, **kwargs)
        # Set by shutdown() to make serve() return.
        self.shutdown_event = threading.Event()

    def serve(self):
        """Run a minimal CherryPy server until shutdown() is called."""
        addr = self.get_localized_module_option(
            'server_addr', get_default_addr())
        port = self.get_localized_module_option(
            'server_port', DEFAULT_PORT)
        self.log.info("server_addr: %s server_port: %s" % (addr, port))
        cherrypy.config.update({
            'server.socket_host': addr,
            'server.socket_port': int(port),
            'engine.autoreload.on': False
        })

        standby = self

        class Root(object):
            @cherrypy.expose
            def index(self):
                # Link visitors to the active mgr's metrics endpoint.
                active_uri = standby.get_active_uri()
                return '''<!DOCTYPE html>
<html>
    <head><title>Ceph Exporter</title></head>
    <body>
        <h1>Ceph Exporter</h1>
        <p><a href='{}metrics'>Metrics</a></p>
    </body>
</html>'''.format(active_uri)

            @cherrypy.expose
            def metrics(self):
                # Empty payload: a standby mgr exports no metrics.
                cherrypy.response.headers['Content-Type'] = 'text/plain'
                return ''

        cherrypy.tree.mount(Root(), '/', {})
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info('Engine started.')
        # Wait for shutdown event
        self.shutdown_event.wait()
        self.shutdown_event.clear()
        cherrypy.engine.stop()
        self.log.info('Engine stopped.')

    def shutdown(self):
        """Wake serve() so it can stop the engine and exit."""
        self.log.info("Stopping engine...")
        self.shutdown_event.set()
        self.log.info("Stopped engine")