]>
Commit | Line | Data |
---|---|---|
c07f9fc5 | 1 | import cherrypy |
a8e16298 | 2 | from distutils.version import StrictVersion |
3efd9988 FG |
3 | import json |
4 | import errno | |
c07f9fc5 FG |
5 | import math |
6 | import os | |
11fdf7f2 | 7 | import re |
94b18763 | 8 | import socket |
91327a77 AA |
9 | import threading |
10 | import time | |
11fdf7f2 | 11 | from mgr_module import MgrModule, MgrStandbyModule, CommandResult, PG_STATES |
f6b5b4d7 | 12 | from mgr_util import get_default_addr, profile_method |
11fdf7f2 | 13 | from rbd import RBD |
f6b5b4d7 TL |
14 | try: |
15 | from typing import Optional, Dict, Any, Set | |
16 | except: | |
17 | pass | |
c07f9fc5 FG |
18 | |
19 | # Defaults for the Prometheus HTTP server. Can also set in config-key | |
20 | # see https://github.com/prometheus/prometheus/wiki/Default-port-allocations | |
21 | # for Prometheus exporter port registry | |
22 | ||
c07f9fc5 FG |
23 | DEFAULT_PORT = 9283 |
24 | ||
a8e16298 TL |
# When the CherryPy server in 3.2.2 (and later) starts it attempts to verify
# that the ports its listening on are in fact bound. When using the any address
# "::" it tries both ipv4 and ipv6, and in some environments (e.g. kubernetes)
# ipv6 isn't yet configured / supported and CherryPy throws an uncaught
# exception.
if cherrypy is not None:
    v = StrictVersion(cherrypy.__version__)
    # the issue was fixed in 3.2.3. it's present in 3.2.2 (current version on
    # centos:7) and back to at least 3.0.0.
    if StrictVersion("3.1.2") <= v < StrictVersion("3.2.3"):
        # https://github.com/cherrypy/cherrypy/issues/1100
        # Monkey-patch the check away: accept the port as bound without
        # waiting, so startup does not hang/crash on ipv6-less hosts.
        from cherrypy.process import servers
        servers.wait_for_occupied_port = lambda host, port: None
c07f9fc5 | 38 | |
9f95a23c | 39 | |
# cherrypy likes to sys.exit on error. don't let it take us down too!
def os_exit_noop(*args, **kwargs):
    # Accept (and ignore) any arguments os._exit would receive; a cherrypy
    # error path must not be able to terminate the whole ceph-mgr process.
    pass


# Replace the real os._exit for the lifetime of this module.
os._exit = os_exit_noop
46 | ||
c07f9fc5 FG |
# Reference to the single active Module instance, so the cherrypy handler
# classes (defined elsewhere in this file) can reach the module's state
# without each handler holding its own reference.

_global_instance = None  # type: Optional[Module]
c07f9fc5 FG |
51 | |
52 | ||
def health_status_to_number(status):
    """Translate a Ceph health status string into a numeric severity.

    Returns 0 for HEALTH_OK, 1 for HEALTH_WARN, 2 for HEALTH_ERR and
    None for any unrecognised status (mirroring an if/elif fallthrough).
    """
    return {
        'HEALTH_OK': 0,
        'HEALTH_WARN': 1,
        'HEALTH_ERR': 2,
    }.get(status)
c07f9fc5 | 60 | |
11fdf7f2 TL |
61 | |
# Cluster-wide and per-pool fields exported verbatim from the 'df' dump.
DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_used_raw_bytes']

DF_POOL = ['max_avail', 'stored', 'stored_raw', 'objects', 'dirty',
           'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes']

# Per-pool recovery counters exported from 'osd_pool_stats'.
# FIX: the last entry was a duplicate 'num_bytes_recovered', which silently
# dropped the keys-recovered counter; it is 'num_keys_recovered' to complete
# the objects/bytes/keys triple.
OSD_POOL_STATS = ('recovering_objects_per_sec', 'recovering_bytes_per_sec',
                  'recovering_keys_per_sec', 'num_objects_recovered',
                  'num_bytes_recovered', 'num_keys_recovered')

# Cluster-wide OSD flags, each exported as its own boolean-style metric.
OSD_FLAGS = ('noup', 'nodown', 'noout', 'noin', 'nobackfill', 'norebalance',
             'norecover', 'noscrub', 'nodeep-scrub')

# Label names for the *_metadata / *_status metrics below.  The collection
# methods fill label values positionally, so the order here matters.
FS_METADATA = ('data_pools', 'fs_id', 'metadata_pool', 'name')

MDS_METADATA = ('ceph_daemon', 'fs_id', 'hostname', 'public_addr', 'rank',
                'ceph_version')

MON_METADATA = ('ceph_daemon', 'hostname',
                'public_addr', 'rank', 'ceph_version')

MGR_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

MGR_STATUS = ('ceph_daemon',)

MGR_MODULE_STATUS = ('name',)

MGR_MODULE_CAN_RUN = ('name',)

OSD_METADATA = ('back_iface', 'ceph_daemon', 'cluster_addr', 'device_class',
                'front_iface', 'hostname', 'objectstore', 'public_addr',
                'ceph_version')

OSD_STATUS = ['weight', 'up', 'in']

OSD_STATS = ['apply_latency_ms', 'commit_latency_ms']

POOL_METADATA = ('pool_id', 'name')

RGW_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

RBD_MIRROR_METADATA = ('ceph_daemon', 'id', 'instance_id', 'hostname',
                       'ceph_version')

# Kept separate from OSD_METADATA so the label names line up with what the
# Prometheus node_exporter emits for disks.
DISK_OCCUPATION = ('ceph_daemon', 'device', 'db_device',
                   'wal_device', 'instance')

NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']
c07f9fc5 | 109 | |
c07f9fc5 | 110 | |
91327a77 AA |
class Metric(object):
    """One Prometheus metric family: type, name, help text and samples.

    Samples are stored in ``self.value`` keyed by the tuple of label
    values; ``str_expfmt`` renders the family in the Prometheus text
    exposition format.
    """

    def __init__(self, mtype, name, desc, labels=None):
        self.mtype = mtype
        self.name = name
        self.desc = desc
        self.labelnames = labels  # tuple if present
        self.value = {}  # label-value tuple -> sample value

    def clear(self):
        """Forget all recorded samples."""
        self.value = {}

    def set(self, value, labelvalues=None):
        """Record one sample. labelvalues must be a tuple."""
        key = labelvalues if labelvalues else ('',)
        self.value[key] = value

    def str_expfmt(self):
        """Render this metric family in Prometheus exposition format."""

        def _prom_name(path):
            # replace illegal metric name characters
            fixed = re.sub(r'[./\s]|::', '_', path).replace('+', '_plus')

            # A *trailing* hyphen becomes "_minus"; otherwise every hyphen
            # turns into an underscore.  (In the trailing-hyphen branch any
            # interior hyphens are deliberately left untouched, matching
            # the names Ceph has historically exported.)
            if fixed.endswith("-"):
                fixed = fixed[0:-1] + "_minus"
            else:
                fixed = fixed.replace("-", "_")

            return "ceph_{0}".format(fixed)

        def _go_float(value):
            # represent as Go-compatible float
            if value == float('inf'):
                return '+Inf'
            if value == float('-inf'):
                return '-Inf'
            if math.isnan(value):
                return 'NaN'
            return repr(float(value))

        name = _prom_name(self.name)
        chunks = ['''
# HELP {name} {desc}
# TYPE {name} {mtype}'''.format(name=name, desc=self.desc, mtype=self.mtype)]

        for labelvalues, value in self.value.items():
            labels = ''
            if self.labelnames:
                pairs = zip(self.labelnames, labelvalues)
                labels = ','.join('%s="%s"' % (k, v) for k, v in pairs)
            if labels:
                chunks.append('\n{0}{{{1}}} {2}'.format(
                    name, labels, _go_float(value)))
            else:
                chunks.append('\n{0} {1}'.format(name, _go_float(value)))
        return ''.join(chunks)
177 | ||
178 | ||
f6b5b4d7 TL |
class MetricCollectionThread(threading.Thread):
    """Background thread that periodically collects metrics from the mgr.

    Fresh data is published into ``module.collect_cache`` /
    ``module.collect_time`` under ``module.collect_lock`` so that HTTP
    scrapes can be answered from the cache rather than triggering a
    collection of their own.
    """
    def __init__(self, module):
        # type: (Module) -> None
        self.mod = module
        # run() will invoke self.collect (the loop below)
        super(MetricCollectionThread, self).__init__(target=self.collect)

    def collect(self):
        """Collection loop: gather, publish, sleep; never returns."""
        self.mod.log.info('starting metric collection thread')
        while True:
            self.mod.log.debug('collecting cache in thread')
            if self.mod.have_mon_connection():
                start_time = time.time()
                data = self.mod.collect()
                duration = time.time() - start_time

                self.mod.log.debug('collecting cache in thread done')

                # Sleep only for whatever is left of the scrape interval.
                sleep_time = self.mod.scrape_interval - duration
                if sleep_time < 0:
                    # Collection overran the interval: warn and start the
                    # next collection immediately.
                    self.mod.log.warning(
                        'Collecting data took more time than configured scrape interval. '
                        'This possibly results in stale data. Please check the '
                        '`stale_cache_strategy` configuration option. '
                        'Collecting data took {:.2f} seconds but scrape interval is configured '
                        'to be {:.0f} seconds.'.format(
                            duration,
                            self.mod.scrape_interval,
                        )
                    )
                    sleep_time = 0

                # Publish under the lock shared with the HTTP handler.
                with self.mod.collect_lock:
                    self.mod.collect_cache = data
                    self.mod.collect_time = duration

                time.sleep(sleep_time)
            else:
                self.mod.log.error('No MON connection')
                time.sleep(self.mod.scrape_interval)
218 | ||
219 | ||
91327a77 AA |
class Module(MgrModule):
    """ceph-mgr module exporting cluster metrics in Prometheus format."""

    COMMANDS = [
        {
            "cmd": "prometheus file_sd_config",
            "desc": "Return file_sd compatible prometheus config for mgr cluster",
            "perm": "r"
        },
    ]

    MODULE_OPTIONS = [
        {'name': 'server_addr'},
        {'name': 'server_port'},
        {'name': 'scrape_interval'},
        {'name': 'stale_cache_strategy'},
        {'name': 'rbd_stats_pools'},
        {'name': 'rbd_stats_pools_refresh_interval', 'type': 'int', 'default': 300},
    ]

    # What to do when a scrape arrives and the cached data is older than
    # the scrape interval: fail the scrape, or return the stale data anyway.
    STALE_CACHE_FAIL = 'fail'
    STALE_CACHE_RETURN = 'return'
240 | ||
91327a77 AA |
    def __init__(self, *args, **kwargs):
        super(Module, self).__init__(*args, **kwargs)
        # All statically-declared Metric objects, keyed by metric name.
        self.metrics = self._setup_static_metrics()
        self.shutdown_event = threading.Event()
        # Guards collect_cache/collect_time, shared with MetricCollectionThread.
        self.collect_lock = threading.Lock()
        self.collect_time = 0.0
        # Defaults; refreshed from module options when serving scrapes.
        self.scrape_interval = 15.0
        self.stale_cache_strategy = self.STALE_CACHE_FAIL
        self.collect_cache = None
        # State for the dynamic per-RBD-image OSD perf counter query.
        self.rbd_stats = {
            'pools': {},
            'pools_refresh_time': 0,
            'counters_info': {
                'write_ops': {'type': self.PERFCOUNTER_COUNTER,
                              'desc': 'RBD image writes count'},
                'read_ops': {'type': self.PERFCOUNTER_COUNTER,
                             'desc': 'RBD image reads count'},
                'write_bytes': {'type': self.PERFCOUNTER_COUNTER,
                                'desc': 'RBD image bytes written'},
                'read_bytes': {'type': self.PERFCOUNTER_COUNTER,
                               'desc': 'RBD image bytes read'},
                'write_latency': {'type': self.PERFCOUNTER_LONGRUNAVG,
                                  'desc': 'RBD image writes latency (msec)'},
                'read_latency': {'type': self.PERFCOUNTER_LONGRUNAVG,
                                 'desc': 'RBD image reads latency (msec)'},
            },
        }  # type: Dict[str, Any]
        # Publish this instance for the cherrypy handlers, then start the
        # background collection loop.
        global _global_instance
        _global_instance = self
        MetricCollectionThread(_global_instance).start()
3efd9988 FG |
271 | |
    def _setup_static_metrics(self):
        """Declare every statically-known metric this module exports.

        Returns a dict mapping metric name -> Metric.  The label tuples
        used here must stay in sync with the *_METADATA / *_STATUS
        constants at the top of the file, because the collection methods
        set label values positionally.
        """
        metrics = {}
        metrics['health_status'] = Metric(
            'untyped',
            'health_status',
            'Cluster health status'
        )
        metrics['mon_quorum_status'] = Metric(
            'gauge',
            'mon_quorum_status',
            'Monitors in quorum',
            ('ceph_daemon',)
        )
        metrics['fs_metadata'] = Metric(
            'untyped',
            'fs_metadata',
            'FS Metadata',
            FS_METADATA
        )
        metrics['mds_metadata'] = Metric(
            'untyped',
            'mds_metadata',
            'MDS Metadata',
            MDS_METADATA
        )
        metrics['mon_metadata'] = Metric(
            'untyped',
            'mon_metadata',
            'MON Metadata',
            MON_METADATA
        )
        metrics['mgr_metadata'] = Metric(
            'gauge',
            'mgr_metadata',
            'MGR metadata',
            MGR_METADATA
        )
        metrics['mgr_status'] = Metric(
            'gauge',
            'mgr_status',
            'MGR status (0=standby, 1=active)',
            MGR_STATUS
        )
        metrics['mgr_module_status'] = Metric(
            'gauge',
            'mgr_module_status',
            'MGR module status (0=disabled, 1=enabled, 2=auto-enabled)',
            MGR_MODULE_STATUS
        )
        metrics['mgr_module_can_run'] = Metric(
            'gauge',
            'mgr_module_can_run',
            'MGR module runnable state i.e. can it run (0=no, 1=yes)',
            MGR_MODULE_CAN_RUN
        )
        metrics['osd_metadata'] = Metric(
            'untyped',
            'osd_metadata',
            'OSD Metadata',
            OSD_METADATA
        )

        # The reason for having this separate to OSD_METADATA is
        # so that we can stably use the same tag names that
        # the Prometheus node_exporter does
        metrics['disk_occupation'] = Metric(
            'untyped',
            'disk_occupation',
            'Associate Ceph daemon with disk used',
            DISK_OCCUPATION
        )

        metrics['pool_metadata'] = Metric(
            'untyped',
            'pool_metadata',
            'POOL Metadata',
            POOL_METADATA
        )

        metrics['rgw_metadata'] = Metric(
            'untyped',
            'rgw_metadata',
            'RGW Metadata',
            RGW_METADATA
        )

        metrics['rbd_mirror_metadata'] = Metric(
            'untyped',
            'rbd_mirror_metadata',
            'RBD Mirror Metadata',
            RBD_MIRROR_METADATA
        )

        metrics['pg_total'] = Metric(
            'gauge',
            'pg_total',
            'PG Total Count per Pool',
            ('pool_id',)
        )

        # One boolean-style metric per cluster-wide OSD flag.
        for flag in OSD_FLAGS:
            path = 'osd_flag_{}'.format(flag)
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD Flag {}'.format(flag)
            )
        # Per-OSD status fields (weight/up/in), labelled by daemon.
        for state in OSD_STATUS:
            path = 'osd_{}'.format(state)
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD status {}'.format(state),
                ('ceph_daemon',)
            )
        for stat in OSD_STATS:
            path = 'osd_{}'.format(stat)
            metrics[path] = Metric(
                'gauge',
                path,
                'OSD stat {}'.format(stat),
                ('ceph_daemon',)
            )
        # Per-pool recovery-rate counters.
        for stat in OSD_POOL_STATS:
            path = 'pool_{}'.format(stat)
            metrics[path] = Metric(
                'gauge',
                path,
                "OSD pool stats: {}".format(stat),
                ('pool_id',)
            )
        # One gauge per PG state, labelled by pool.
        for state in PG_STATES:
            path = 'pg_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'PG {} per pool'.format(state),
                ('pool_id',)
            )
        for state in DF_CLUSTER:
            path = 'cluster_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'DF {}'.format(state),
            )
        for state in DF_POOL:
            path = 'pool_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'DF pool {}'.format(state),
                ('pool_id',)
            )
        for state in NUM_OBJECTS:
            path = 'num_objects_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'Number of {} objects'.format(state),
            )

        return metrics
c07f9fc5 | 435 | |
f6b5b4d7 | 436 | @profile_method() |
3efd9988 FG |
437 | def get_health(self): |
438 | health = json.loads(self.get('health')['json']) | |
91327a77 AA |
439 | self.metrics['health_status'].set( |
440 | health_status_to_number(health['status']) | |
c07f9fc5 FG |
441 | ) |
442 | ||
f6b5b4d7 | 443 | @profile_method() |
11fdf7f2 TL |
444 | def get_pool_stats(self): |
445 | # retrieve pool stats to provide per pool recovery metrics | |
446 | # (osd_pool_stats moved to mgr in Mimic) | |
447 | pstats = self.get('osd_pool_stats') | |
448 | for pool in pstats['pool_stats']: | |
449 | for stat in OSD_POOL_STATS: | |
450 | self.metrics['pool_{}'.format(stat)].set( | |
451 | pool['recovery_rate'].get(stat, 0), | |
452 | (pool['pool_id'],) | |
453 | ) | |
454 | ||
f6b5b4d7 | 455 | @profile_method() |
3efd9988 FG |
456 | def get_df(self): |
457 | # maybe get the to-be-exported metrics from a config? | |
458 | df = self.get('df') | |
459 | for stat in DF_CLUSTER: | |
91327a77 | 460 | self.metrics['cluster_{}'.format(stat)].set(df['stats'][stat]) |
3efd9988 FG |
461 | |
462 | for pool in df['pools']: | |
463 | for stat in DF_POOL: | |
91327a77 AA |
464 | self.metrics['pool_{}'.format(stat)].set( |
465 | pool['stats'][stat], | |
466 | (pool['id'],) | |
467 | ) | |
94b18763 | 468 | |
    @profile_method()
    def get_fs(self):
        """Export CephFS filesystem and MDS daemon metadata.

        Emits one fs_metadata record per filesystem and one mds_metadata
        record per MDS daemon; standby daemons use the placeholder
        fs_id '-1'.
        """
        fs_map = self.get('fs_map')
        servers = self.get_service_list()
        self.log.debug('standbys: {}'.format(fs_map['standbys']))
        # export standby mds metadata, default standby fs_id is '-1'
        for standby in fs_map['standbys']:
            id_ = standby['name']
            # (hostname, ceph_version); empty strings when unknown
            host_version = servers.get((id_, 'mds'), ('', ''))
            self.metrics['mds_metadata'].set(1, (
                'mds.{}'.format(id_), '-1',
                host_version[0], standby['addr'],
                standby['rank'], host_version[1]
            ))
        for fs in fs_map['filesystems']:
            # collect fs metadata
            data_pools = ",".join([str(pool)
                                   for pool in fs['mdsmap']['data_pools']])
            # label order must match FS_METADATA
            self.metrics['fs_metadata'].set(1, (
                data_pools,
                fs['id'],
                fs['mdsmap']['metadata_pool'],
                fs['mdsmap']['fs_name']
            ))
            self.log.debug('mdsmap: {}'.format(fs['mdsmap']))
            for gid, daemon in fs['mdsmap']['info'].items():
                id_ = daemon['name']
                host_version = servers.get((id_, 'mds'), ('', ''))
                # label order must match MDS_METADATA
                self.metrics['mds_metadata'].set(1, (
                    'mds.{}'.format(id_), fs['id'],
                    host_version[0], daemon['addr'],
                    daemon['rank'], host_version[1]
                ))
3efd9988 | 502 | |
f6b5b4d7 | 503 | @profile_method() |
3efd9988 FG |
504 | def get_quorum_status(self): |
505 | mon_status = json.loads(self.get('mon_status')['json']) | |
94b18763 FG |
506 | servers = self.get_service_list() |
507 | for mon in mon_status['monmap']['mons']: | |
508 | rank = mon['rank'] | |
509 | id_ = mon['name'] | |
11fdf7f2 | 510 | host_version = servers.get((id_, 'mon'), ('', '')) |
91327a77 AA |
511 | self.metrics['mon_metadata'].set(1, ( |
512 | 'mon.{}'.format(id_), host_version[0], | |
513 | mon['public_addr'].split(':')[0], rank, | |
514 | host_version[1] | |
515 | )) | |
94b18763 | 516 | in_quorum = int(rank in mon_status['quorum']) |
91327a77 AA |
517 | self.metrics['mon_quorum_status'].set(in_quorum, ( |
518 | 'mon.{}'.format(id_), | |
519 | )) | |
3efd9988 | 520 | |
f6b5b4d7 | 521 | @profile_method() |
494da23a TL |
522 | def get_mgr_status(self): |
523 | mgr_map = self.get('mgr_map') | |
524 | servers = self.get_service_list() | |
525 | ||
526 | active = mgr_map['active_name'] | |
527 | standbys = [s.get('name') for s in mgr_map['standbys']] | |
528 | ||
529 | all_mgrs = list(standbys) | |
530 | all_mgrs.append(active) | |
531 | ||
532 | all_modules = {module.get('name'):module.get('can_run') for module in mgr_map['available_modules']} | |
533 | ||
eafe8130 | 534 | ceph_release = None |
494da23a TL |
535 | for mgr in all_mgrs: |
536 | host_version = servers.get((mgr, 'mgr'), ('', '')) | |
537 | if mgr == active: | |
538 | _state = 1 | |
539 | ceph_release = host_version[1].split()[-2] # e.g. nautilus | |
540 | else: | |
541 | _state = 0 | |
801d1391 | 542 | |
494da23a TL |
543 | self.metrics['mgr_metadata'].set(1, ( |
544 | 'mgr.{}'.format(mgr), host_version[0], | |
545 | host_version[1] | |
546 | )) | |
547 | self.metrics['mgr_status'].set(_state, ( | |
801d1391 | 548 | 'mgr.{}'.format(mgr), |
494da23a | 549 | )) |
eafe8130 | 550 | always_on_modules = mgr_map['always_on_modules'].get(ceph_release, []) |
494da23a TL |
551 | active_modules = list(always_on_modules) |
552 | active_modules.extend(mgr_map['modules']) | |
553 | ||
554 | for mod_name in all_modules.keys(): | |
555 | ||
556 | if mod_name in always_on_modules: | |
557 | _state = 2 | |
558 | elif mod_name in active_modules: | |
559 | _state = 1 | |
560 | else: | |
561 | _state = 0 | |
562 | ||
563 | _can_run = 1 if all_modules[mod_name] else 0 | |
564 | self.metrics['mgr_module_status'].set(_state, (mod_name,)) | |
565 | self.metrics['mgr_module_can_run'].set(_can_run, (mod_name,)) | |
566 | ||
f6b5b4d7 | 567 | @profile_method() |
3efd9988 | 568 | def get_pg_status(self): |
94b18763 | 569 | |
92f5a8d4 TL |
570 | pg_summary = self.get('pg_summary') |
571 | ||
572 | for pool in pg_summary['by_pool']: | |
801d1391 TL |
573 | num_by_state = dict((state, 0) for state in PG_STATES) |
574 | num_by_state['total'] = 0 | |
92f5a8d4 | 575 | |
801d1391 | 576 | for state_name, count in pg_summary['by_pool'][pool].items(): |
92f5a8d4 | 577 | for state in state_name.split('+'): |
801d1391 TL |
578 | num_by_state[state] += count |
579 | num_by_state['total'] += count | |
580 | ||
581 | for state, num in num_by_state.items(): | |
582 | try: | |
583 | self.metrics["pg_{}".format(state)].set(num, (pool,)) | |
584 | except KeyError: | |
e306af50 | 585 | self.log.warning("skipping pg in unknown state {}".format(state)) |
b32b8144 | 586 | |
f6b5b4d7 | 587 | @profile_method() |
b32b8144 FG |
588 | def get_osd_stats(self): |
589 | osd_stats = self.get('osd_stats') | |
590 | for osd in osd_stats['osd_stats']: | |
591 | id_ = osd['osd'] | |
592 | for stat in OSD_STATS: | |
94b18763 | 593 | val = osd['perf_stat'][stat] |
91327a77 AA |
594 | self.metrics['osd_{}'.format(stat)].set(val, ( |
595 | 'osd.{}'.format(id_), | |
596 | )) | |
94b18763 FG |
597 | |
598 | def get_service_list(self): | |
599 | ret = {} | |
600 | for server in self.list_servers(): | |
601 | version = server.get('ceph_version', '') | |
602 | host = server.get('hostname', '') | |
603 | for service in server.get('services', []): | |
604 | ret.update({(service['id'], service['type']): (host, version)}) | |
605 | return ret | |
3efd9988 | 606 | |
f6b5b4d7 | 607 | @profile_method() |
3efd9988 FG |
608 | def get_metadata_and_osd_status(self): |
609 | osd_map = self.get('osd_map') | |
94b18763 FG |
610 | osd_flags = osd_map['flags'].split(',') |
611 | for flag in OSD_FLAGS: | |
91327a77 AA |
612 | self.metrics['osd_flag_{}'.format(flag)].set( |
613 | int(flag in osd_flags) | |
614 | ) | |
94b18763 | 615 | |
3efd9988 | 616 | osd_devices = self.get('osd_map_crush')['devices'] |
94b18763 | 617 | servers = self.get_service_list() |
3efd9988 | 618 | for osd in osd_map['osds']: |
94b18763 | 619 | # id can be used to link osd metrics and metadata |
3efd9988 | 620 | id_ = osd['osd'] |
94b18763 | 621 | # collect osd metadata |
3efd9988 FG |
622 | p_addr = osd['public_addr'].split(':')[0] |
623 | c_addr = osd['cluster_addr'].split(':')[0] | |
94b18763 FG |
624 | if p_addr == "-" or c_addr == "-": |
625 | self.log.info( | |
626 | "Missing address metadata for osd {0}, skipping occupation" | |
627 | " and metadata records for this osd".format(id_) | |
628 | ) | |
629 | continue | |
630 | ||
631 | dev_class = None | |
632 | for osd_device in osd_devices: | |
633 | if osd_device['id'] == id_: | |
634 | dev_class = osd_device.get('class', '') | |
635 | break | |
636 | ||
637 | if dev_class is None: | |
9f95a23c TL |
638 | self.log.info("OSD {0} is missing from CRUSH map, " |
639 | "skipping output".format(id_)) | |
94b18763 FG |
640 | continue |
641 | ||
11fdf7f2 | 642 | host_version = servers.get((str(id_), 'osd'), ('', '')) |
94b18763 | 643 | |
a8e16298 TL |
644 | # collect disk occupation metadata |
645 | osd_metadata = self.get_metadata("osd", str(id_)) | |
646 | if osd_metadata is None: | |
647 | continue | |
648 | ||
649 | obj_store = osd_metadata.get('osd_objectstore', '') | |
650 | f_iface = osd_metadata.get('front_iface', '') | |
651 | b_iface = osd_metadata.get('back_iface', '') | |
652 | ||
91327a77 | 653 | self.metrics['osd_metadata'].set(1, ( |
a8e16298 | 654 | b_iface, |
28e407b8 | 655 | 'osd.{}'.format(id_), |
3efd9988 | 656 | c_addr, |
94b18763 | 657 | dev_class, |
a8e16298 | 658 | f_iface, |
28e407b8 | 659 | host_version[0], |
a8e16298 TL |
660 | obj_store, |
661 | p_addr, | |
662 | host_version[1] | |
3efd9988 | 663 | )) |
94b18763 FG |
664 | |
665 | # collect osd status | |
3efd9988 FG |
666 | for state in OSD_STATUS: |
667 | status = osd[state] | |
91327a77 AA |
668 | self.metrics['osd_{}'.format(state)].set(status, ( |
669 | 'osd.{}'.format(id_), | |
670 | )) | |
3efd9988 | 671 | |
92f5a8d4 | 672 | osd_dev_node = None |
a8e16298 | 673 | if obj_store == "filestore": |
11fdf7f2 TL |
674 | # collect filestore backend device |
675 | osd_dev_node = osd_metadata.get( | |
676 | 'backend_filestore_dev_node', None) | |
677 | # collect filestore journal device | |
f64942e4 AA |
678 | osd_wal_dev_node = osd_metadata.get('osd_journal', '') |
679 | osd_db_dev_node = '' | |
a8e16298 | 680 | elif obj_store == "bluestore": |
11fdf7f2 TL |
681 | # collect bluestore backend device |
682 | osd_dev_node = osd_metadata.get( | |
683 | 'bluestore_bdev_dev_node', None) | |
684 | # collect bluestore wal backend | |
f64942e4 | 685 | osd_wal_dev_node = osd_metadata.get('bluefs_wal_dev_node', '') |
11fdf7f2 | 686 | # collect bluestore db backend |
f64942e4 AA |
687 | osd_db_dev_node = osd_metadata.get('bluefs_db_dev_node', '') |
688 | if osd_dev_node and osd_dev_node == "unknown": | |
689 | osd_dev_node = None | |
690 | ||
3efd9988 FG |
691 | osd_hostname = osd_metadata.get('hostname', None) |
692 | if osd_dev_node and osd_hostname: | |
693 | self.log.debug("Got dev for osd {0}: {1}/{2}".format( | |
694 | id_, osd_hostname, osd_dev_node)) | |
91327a77 | 695 | self.metrics['disk_occupation'].set(1, ( |
28e407b8 | 696 | "osd.{0}".format(id_), |
3efd9988 | 697 | osd_dev_node, |
f64942e4 AA |
698 | osd_db_dev_node, |
699 | osd_wal_dev_node, | |
28e407b8 | 700 | osd_hostname |
3efd9988 FG |
701 | )) |
702 | else: | |
703 | self.log.info("Missing dev node metadata for osd {0}, skipping " | |
11fdf7f2 | 704 | "occupation record for this osd".format(id_)) |
3efd9988 FG |
705 | |
706 | for pool in osd_map['pools']: | |
11fdf7f2 TL |
707 | self.metrics['pool_metadata'].set( |
708 | 1, (pool['pool'], pool['pool_name'])) | |
94b18763 | 709 | |
11fdf7f2 | 710 | # Populate other servers metadata |
94b18763 FG |
711 | for key, value in servers.items(): |
712 | service_id, service_type = key | |
11fdf7f2 TL |
713 | if service_type == 'rgw': |
714 | hostname, version = value | |
715 | self.metrics['rgw_metadata'].set( | |
716 | 1, | |
9f95a23c TL |
717 | ('{}.{}'.format(service_type, service_id), |
718 | hostname, version) | |
11fdf7f2 TL |
719 | ) |
720 | elif service_type == 'rbd-mirror': | |
721 | mirror_metadata = self.get_metadata('rbd-mirror', service_id) | |
722 | if mirror_metadata is None: | |
723 | continue | |
724 | mirror_metadata['ceph_daemon'] = '{}.{}'.format(service_type, | |
725 | service_id) | |
726 | self.metrics['rbd_mirror_metadata'].set( | |
727 | 1, (mirror_metadata.get(k, '') | |
728 | for k in RBD_MIRROR_METADATA) | |
729 | ) | |
3efd9988 | 730 | |
f6b5b4d7 | 731 | @profile_method() |
28e407b8 AA |
732 | def get_num_objects(self): |
733 | pg_sum = self.get('pg_summary')['pg_stats_sum']['stat_sum'] | |
734 | for obj in NUM_OBJECTS: | |
735 | stat = 'num_objects_{}'.format(obj) | |
91327a77 | 736 | self.metrics[stat].set(pg_sum[stat]) |
28e407b8 | 737 | |
f6b5b4d7 | 738 | @profile_method() |
11fdf7f2 TL |
739 | def get_rbd_stats(self): |
740 | # Per RBD image stats is collected by registering a dynamic osd perf | |
741 | # stats query that tells OSDs to group stats for requests associated | |
742 | # with RBD objects by pool, namespace, and image id, which are | |
743 | # extracted from the request object names or other attributes. | |
744 | # The RBD object names have the following prefixes: | |
745 | # - rbd_data.{image_id}. (data stored in the same pool as metadata) | |
746 | # - rbd_data.{pool_id}.{image_id}. (data stored in a dedicated data pool) | |
747 | # - journal_data.{pool_id}.{image_id}. (journal if journaling is enabled) | |
748 | # The pool_id in the object name is the id of the pool with the image | |
749 | # metdata, and should be used in the image spec. If there is no pool_id | |
750 | # in the object name, the image pool is the pool where the object is | |
751 | # located. | |
752 | ||
753 | # Parse rbd_stats_pools option, which is a comma or space separated | |
754 | # list of pool[/namespace] entries. If no namespace is specifed the | |
f6b5b4d7 TL |
755 | # stats are collected for every namespace in the pool. The wildcard |
756 | # '*' can be used to indicate all pools or namespaces | |
11fdf7f2 | 757 | pools_string = self.get_localized_module_option('rbd_stats_pools', '') |
f6b5b4d7 TL |
758 | pool_keys = [] |
759 | for x in re.split('[\s,]+', pools_string): | |
760 | if not x: | |
761 | continue | |
762 | ||
763 | s = x.split('/', 2) | |
11fdf7f2 | 764 | pool_name = s[0] |
f6b5b4d7 TL |
765 | namespace_name = None |
766 | if len(s) == 2: | |
767 | namespace_name = s[1] | |
768 | ||
769 | if pool_name == "*": | |
770 | # collect for all pools | |
771 | osd_map = self.get('osd_map') | |
772 | for pool in osd_map['pools']: | |
773 | if 'rbd' not in pool.get('application_metadata', {}): | |
774 | continue | |
775 | pool_keys.append((pool['pool_name'], namespace_name)) | |
776 | else: | |
777 | pool_keys.append((pool_name, namespace_name)) | |
778 | ||
779 | pools = {} # type: Dict[str, Set[str]] | |
780 | for pool_key in pool_keys: | |
781 | pool_name = pool_key[0] | |
782 | namespace_name = pool_key[1] | |
783 | if not namespace_name or namespace_name == "*": | |
11fdf7f2 TL |
784 | # empty set means collect for all namespaces |
785 | pools[pool_name] = set() | |
786 | continue | |
f6b5b4d7 | 787 | |
11fdf7f2 TL |
788 | if pool_name not in pools: |
789 | pools[pool_name] = set() | |
790 | elif not pools[pool_name]: | |
791 | continue | |
f6b5b4d7 | 792 | pools[pool_name].add(namespace_name) |
11fdf7f2 TL |
793 | |
794 | rbd_stats_pools = {} | |
f6b5b4d7 | 795 | for pool_id in self.rbd_stats['pools'].keys(): |
11fdf7f2 TL |
796 | name = self.rbd_stats['pools'][pool_id]['name'] |
797 | if name not in pools: | |
798 | del self.rbd_stats['pools'][pool_id] | |
799 | else: | |
800 | rbd_stats_pools[name] = \ | |
801 | self.rbd_stats['pools'][pool_id]['ns_names'] | |
802 | ||
803 | pools_refreshed = False | |
804 | if pools: | |
805 | next_refresh = self.rbd_stats['pools_refresh_time'] + \ | |
806 | self.get_localized_module_option( | |
807 | 'rbd_stats_pools_refresh_interval', 300) | |
808 | if rbd_stats_pools != pools or time.time() >= next_refresh: | |
809 | self.refresh_rbd_stats_pools(pools) | |
810 | pools_refreshed = True | |
811 | ||
812 | pool_ids = list(self.rbd_stats['pools']) | |
813 | pool_ids.sort() | |
814 | pool_id_regex = '^(' + '|'.join([str(x) for x in pool_ids]) + ')$' | |
815 | ||
816 | nspace_names = [] | |
817 | for pool_id, pool in self.rbd_stats['pools'].items(): | |
818 | if pool['ns_names']: | |
819 | nspace_names.extend(pool['ns_names']) | |
820 | else: | |
821 | nspace_names = [] | |
822 | break | |
823 | if nspace_names: | |
824 | namespace_regex = '^(' + \ | |
825 | "|".join([re.escape(x) | |
826 | for x in set(nspace_names)]) + ')$' | |
827 | else: | |
828 | namespace_regex = '^(.*)$' | |
829 | ||
830 | if 'query' in self.rbd_stats and \ | |
831 | (pool_id_regex != self.rbd_stats['query']['key_descriptor'][0]['regex'] or | |
832 | namespace_regex != self.rbd_stats['query']['key_descriptor'][1]['regex']): | |
833 | self.remove_osd_perf_query(self.rbd_stats['query_id']) | |
834 | del self.rbd_stats['query_id'] | |
835 | del self.rbd_stats['query'] | |
836 | ||
837 | if not self.rbd_stats['pools']: | |
838 | return | |
839 | ||
840 | counters_info = self.rbd_stats['counters_info'] | |
841 | ||
842 | if 'query_id' not in self.rbd_stats: | |
843 | query = { | |
844 | 'key_descriptor': [ | |
845 | {'type': 'pool_id', 'regex': pool_id_regex}, | |
846 | {'type': 'namespace', 'regex': namespace_regex}, | |
847 | {'type': 'object_name', | |
848 | 'regex': '^(?:rbd|journal)_data\.(?:([0-9]+)\.)?([^.]+)\.'}, | |
849 | ], | |
850 | 'performance_counter_descriptors': list(counters_info), | |
851 | } | |
852 | query_id = self.add_osd_perf_query(query) | |
853 | if query_id is None: | |
854 | self.log.error('failed to add query %s' % query) | |
855 | return | |
856 | self.rbd_stats['query'] = query | |
857 | self.rbd_stats['query_id'] = query_id | |
858 | ||
859 | res = self.get_osd_perf_counters(self.rbd_stats['query_id']) | |
860 | for c in res['counters']: | |
861 | # if the pool id is not found in the object name use id of the | |
862 | # pool where the object is located | |
863 | if c['k'][2][0]: | |
864 | pool_id = int(c['k'][2][0]) | |
865 | else: | |
866 | pool_id = int(c['k'][0][0]) | |
867 | if pool_id not in self.rbd_stats['pools'] and not pools_refreshed: | |
868 | self.refresh_rbd_stats_pools(pools) | |
869 | pools_refreshed = True | |
870 | if pool_id not in self.rbd_stats['pools']: | |
871 | continue | |
872 | pool = self.rbd_stats['pools'][pool_id] | |
873 | nspace_name = c['k'][1][0] | |
874 | if nspace_name not in pool['images']: | |
875 | continue | |
876 | image_id = c['k'][2][1] | |
877 | if image_id not in pool['images'][nspace_name] and \ | |
878 | not pools_refreshed: | |
879 | self.refresh_rbd_stats_pools(pools) | |
880 | pool = self.rbd_stats['pools'][pool_id] | |
881 | pools_refreshed = True | |
882 | if image_id not in pool['images'][nspace_name]: | |
883 | continue | |
884 | counters = pool['images'][nspace_name][image_id]['c'] | |
885 | for i in range(len(c['c'])): | |
886 | counters[i][0] += c['c'][i][0] | |
887 | counters[i][1] += c['c'][i][1] | |
888 | ||
889 | label_names = ("pool", "namespace", "image") | |
890 | for pool_id, pool in self.rbd_stats['pools'].items(): | |
891 | pool_name = pool['name'] | |
892 | for nspace_name, images in pool['images'].items(): | |
893 | for image_id in images: | |
894 | image_name = images[image_id]['n'] | |
895 | counters = images[image_id]['c'] | |
896 | i = 0 | |
897 | for key in counters_info: | |
898 | counter_info = counters_info[key] | |
899 | stattype = self._stattype_to_str(counter_info['type']) | |
900 | labels = (pool_name, nspace_name, image_name) | |
901 | if counter_info['type'] == self.PERFCOUNTER_COUNTER: | |
902 | path = 'rbd_' + key | |
903 | if path not in self.metrics: | |
904 | self.metrics[path] = Metric( | |
905 | stattype, | |
906 | path, | |
907 | counter_info['desc'], | |
908 | label_names, | |
909 | ) | |
910 | self.metrics[path].set(counters[i][0], labels) | |
911 | elif counter_info['type'] == self.PERFCOUNTER_LONGRUNAVG: | |
912 | path = 'rbd_' + key + '_sum' | |
913 | if path not in self.metrics: | |
914 | self.metrics[path] = Metric( | |
915 | stattype, | |
916 | path, | |
917 | counter_info['desc'] + ' Total', | |
918 | label_names, | |
919 | ) | |
920 | self.metrics[path].set(counters[i][0], labels) | |
921 | path = 'rbd_' + key + '_count' | |
922 | if path not in self.metrics: | |
923 | self.metrics[path] = Metric( | |
924 | 'counter', | |
925 | path, | |
926 | counter_info['desc'] + ' Count', | |
927 | label_names, | |
928 | ) | |
929 | self.metrics[path].set(counters[i][1], labels) | |
930 | i += 1 | |
931 | ||
932 | def refresh_rbd_stats_pools(self, pools): | |
933 | self.log.debug('refreshing rbd pools %s' % (pools)) | |
934 | ||
935 | rbd = RBD() | |
936 | counters_info = self.rbd_stats['counters_info'] | |
937 | for pool_name, cfg_ns_names in pools.items(): | |
938 | try: | |
939 | pool_id = self.rados.pool_lookup(pool_name) | |
940 | with self.rados.open_ioctx(pool_name) as ioctx: | |
941 | if pool_id not in self.rbd_stats['pools']: | |
942 | self.rbd_stats['pools'][pool_id] = {'images': {}} | |
943 | pool = self.rbd_stats['pools'][pool_id] | |
944 | pool['name'] = pool_name | |
945 | pool['ns_names'] = cfg_ns_names | |
946 | if cfg_ns_names: | |
947 | nspace_names = list(cfg_ns_names) | |
948 | else: | |
949 | nspace_names = [''] + rbd.namespace_list(ioctx) | |
950 | for nspace_name in pool['images']: | |
951 | if nspace_name not in nspace_names: | |
952 | del pool['images'][nspace_name] | |
953 | for nspace_name in nspace_names: | |
954 | if (nspace_name and | |
955 | not rbd.namespace_exists(ioctx, nspace_name)): | |
956 | self.log.debug('unknown namespace %s for pool %s' % | |
957 | (nspace_name, pool_name)) | |
958 | continue | |
959 | ioctx.set_namespace(nspace_name) | |
960 | if nspace_name not in pool['images']: | |
961 | pool['images'][nspace_name] = {} | |
962 | namespace = pool['images'][nspace_name] | |
963 | images = {} | |
964 | for image_meta in RBD().list2(ioctx): | |
965 | image = {'n': image_meta['name']} | |
966 | image_id = image_meta['id'] | |
967 | if image_id in namespace: | |
968 | image['c'] = namespace[image_id]['c'] | |
969 | else: | |
970 | image['c'] = [[0, 0] for x in counters_info] | |
971 | images[image_id] = image | |
972 | pool['images'][nspace_name] = images | |
973 | except Exception as e: | |
974 | self.log.error('failed listing pool %s: %s' % (pool_name, e)) | |
975 | self.rbd_stats['pools_refresh_time'] = time.time() | |
976 | ||
977 | def shutdown_rbd_stats(self): | |
978 | if 'query_id' in self.rbd_stats: | |
979 | self.remove_osd_perf_query(self.rbd_stats['query_id']) | |
980 | del self.rbd_stats['query_id'] | |
981 | del self.rbd_stats['query'] | |
982 | self.rbd_stats['pools'].clear() | |
983 | ||
e306af50 TL |
984 | def add_fixed_name_metrics(self): |
985 | """ | |
986 | Add fixed name metrics from existing ones that have details in their names | |
987 | that should be in labels (not in name). | |
988 | For backward compatibility, a new fixed name metric is created (instead of replacing) | |
989 | and details are put in new labels. | |
990 | Intended for RGW sync perf. counters but extendable as required. | |
991 | See: https://tracker.ceph.com/issues/45311 | |
992 | """ | |
993 | new_metrics = {} | |
994 | for metric_path in self.metrics.keys(): | |
995 | # Address RGW sync perf. counters. | |
996 | match = re.search('^data-sync-from-(.*)\.', metric_path) | |
997 | if match: | |
998 | new_path = re.sub('from-([^.]*)', 'from-zone', metric_path) | |
999 | if new_path not in new_metrics: | |
1000 | new_metrics[new_path] = Metric( | |
1001 | self.metrics[metric_path].mtype, | |
1002 | new_path, | |
1003 | self.metrics[metric_path].desc, | |
1004 | self.metrics[metric_path].labelnames + ('source_zone',) | |
1005 | ) | |
1006 | for label_values, value in self.metrics[metric_path].value.items(): | |
1007 | new_metrics[new_path].set(value, label_values + (match.group(1),)) | |
1008 | ||
1009 | self.metrics.update(new_metrics) | |
1010 | ||
f6b5b4d7 | 1011 | @profile_method(True) |
c07f9fc5 | 1012 | def collect(self): |
91327a77 AA |
1013 | # Clear the metrics before scraping |
1014 | for k in self.metrics.keys(): | |
1015 | self.metrics[k].clear() | |
1016 | ||
3efd9988 FG |
1017 | self.get_health() |
1018 | self.get_df() | |
11fdf7f2 | 1019 | self.get_pool_stats() |
94b18763 | 1020 | self.get_fs() |
b32b8144 | 1021 | self.get_osd_stats() |
3efd9988 | 1022 | self.get_quorum_status() |
494da23a | 1023 | self.get_mgr_status() |
3efd9988 FG |
1024 | self.get_metadata_and_osd_status() |
1025 | self.get_pg_status() | |
28e407b8 | 1026 | self.get_num_objects() |
3efd9988 | 1027 | |
94b18763 | 1028 | for daemon, counters in self.get_all_perf_counters().items(): |
3efd9988 | 1029 | for path, counter_info in counters.items(): |
28e407b8 | 1030 | # Skip histograms, they are represented by long running avgs |
3efd9988 | 1031 | stattype = self._stattype_to_str(counter_info['type']) |
3efd9988 FG |
1032 | if not stattype or stattype == 'histogram': |
1033 | self.log.debug('ignoring %s, type %s' % (path, stattype)) | |
1034 | continue | |
1035 | ||
81eedcae TL |
1036 | path, label_names, labels = self._perfpath_to_path_labels( |
1037 | daemon, path) | |
1038 | ||
28e407b8 | 1039 | # Get the value of the counter |
11fdf7f2 TL |
1040 | value = self._perfvalue_to_value( |
1041 | counter_info['type'], counter_info['value']) | |
28e407b8 AA |
1042 | |
1043 | # Represent the long running avgs as sum/count pairs | |
1044 | if counter_info['type'] & self.PERFCOUNTER_LONGRUNAVG: | |
1045 | _path = path + '_sum' | |
91327a77 AA |
1046 | if _path not in self.metrics: |
1047 | self.metrics[_path] = Metric( | |
1048 | stattype, | |
1049 | _path, | |
1050 | counter_info['description'] + ' Total', | |
81eedcae | 1051 | label_names, |
91327a77 | 1052 | ) |
81eedcae | 1053 | self.metrics[_path].set(value, labels) |
28e407b8 AA |
1054 | |
1055 | _path = path + '_count' | |
91327a77 AA |
1056 | if _path not in self.metrics: |
1057 | self.metrics[_path] = Metric( | |
1058 | 'counter', | |
1059 | _path, | |
1060 | counter_info['description'] + ' Count', | |
81eedcae | 1061 | label_names, |
91327a77 | 1062 | ) |
81eedcae | 1063 | self.metrics[_path].set(counter_info['count'], labels,) |
28e407b8 | 1064 | else: |
91327a77 AA |
1065 | if path not in self.metrics: |
1066 | self.metrics[path] = Metric( | |
1067 | stattype, | |
1068 | path, | |
1069 | counter_info['description'], | |
81eedcae | 1070 | label_names, |
91327a77 | 1071 | ) |
81eedcae | 1072 | self.metrics[path].set(value, labels) |
91327a77 | 1073 | |
e306af50 | 1074 | self.add_fixed_name_metrics() |
11fdf7f2 TL |
1075 | self.get_rbd_stats() |
1076 | ||
91327a77 AA |
1077 | # Return formatted metrics and clear no longer used data |
1078 | _metrics = [m.str_expfmt() for m in self.metrics.values()] | |
1079 | for k in self.metrics.keys(): | |
1080 | self.metrics[k].clear() | |
1081 | ||
1082 | return ''.join(_metrics) + '\n' | |
c07f9fc5 | 1083 | |
11fdf7f2 TL |
1084 | def get_file_sd_config(self): |
1085 | servers = self.list_servers() | |
1086 | targets = [] | |
1087 | for server in servers: | |
1088 | hostname = server.get('hostname', '') | |
1089 | for service in server.get('services', []): | |
1090 | if service['type'] != 'mgr': | |
1091 | continue | |
1092 | id_ = service['id'] | |
1093 | # get port for prometheus module at mgr with id_ | |
1094 | # TODO use get_config_prefix or get_config here once | |
1095 | # https://github.com/ceph/ceph/pull/20458 is merged | |
1096 | result = CommandResult("") | |
f6b5b4d7 TL |
1097 | assert isinstance(_global_instance, Module) |
1098 | _global_instance.send_command( | |
11fdf7f2 TL |
1099 | result, "mon", '', |
1100 | json.dumps({ | |
1101 | "prefix": "config-key get", | |
1102 | 'key': "config/mgr/mgr/prometheus/{}/server_port".format(id_), | |
1103 | }), | |
1104 | "") | |
1105 | r, outb, outs = result.wait() | |
1106 | if r != 0: | |
f6b5b4d7 | 1107 | _global_instance.log.error("Failed to retrieve port for mgr {}: {}".format(id_, outs)) |
11fdf7f2 TL |
1108 | targets.append('{}:{}'.format(hostname, DEFAULT_PORT)) |
1109 | else: | |
1110 | port = json.loads(outb) | |
1111 | targets.append('{}:{}'.format(hostname, port)) | |
1112 | ||
1113 | ret = [ | |
1114 | { | |
1115 | "targets": targets, | |
1116 | "labels": {} | |
1117 | } | |
1118 | ] | |
1119 | return 0, json.dumps(ret), "" | |
1120 | ||
    def self_test(self):
        # Exercise the two externally visible entry points once; any
        # exception simply propagates to the caller (used by the mgr
        # self-test machinery to smoke-test this module).
        self.collect()
        self.get_file_sd_config()
1124 | ||
1125 | def handle_command(self, inbuf, cmd): | |
1126 | if cmd['prefix'] == 'prometheus file_sd_config': | |
1127 | return self.get_file_sd_config() | |
3efd9988 FG |
1128 | else: |
1129 | return (-errno.EINVAL, '', | |
1130 | "Command not found '{0}'".format(cmd['prefix'])) | |
c07f9fc5 FG |
1131 | |
    def serve(self):
        """Run the HTTP endpoint of the active mgr until shutdown() fires.

        Mounts a tiny CherryPy app exposing '/' (landing page) and
        '/metrics' (cached exposition data), then blocks on
        self.shutdown_event.
        """

        class Root(object):

            # collapse everything to '/'
            def _cp_dispatch(self, vpath):
                cherrypy.request.path = ''
                return self

            @cherrypy.expose
            def index(self):
                # Minimal landing page pointing scrapers/humans at /metrics.
                return '''<!DOCTYPE html>
<html>
    <head><title>Ceph Exporter</title></head>
    <body>
        <h1>Ceph Exporter</h1>
        <p><a href='/metrics'>Metrics</a></p>
    </body>
</html>'''

            @cherrypy.expose
            def metrics(self):
                # Lock the function execution
                # (serialize concurrent scrapes against the collector).
                assert isinstance(_global_instance, Module)
                with _global_instance.collect_lock:
                    return self._metrics(_global_instance)

            @staticmethod
            def _metrics(instance):
                # type: (Module) -> Any
                # Return cached data if available
                if not instance.collect_cache:
                    raise cherrypy.HTTPError(503, 'No cached data available yet')

                def respond():
                    assert isinstance(instance, Module)
                    cherrypy.response.headers['Content-Type'] = 'text/plain'
                    return instance.collect_cache

                if instance.collect_time < instance.scrape_interval:
                    # Respond if cache isn't stale
                    return respond()

                if instance.stale_cache_strategy == instance.STALE_CACHE_RETURN:
                    # Respond even if cache is stale
                    instance.log.info(
                        'Gathering data took {:.2f} seconds, metrics are stale for {:.2f} seconds, '
                        'returning metrics from stale cache.'.format(
                            instance.collect_time,
                            instance.collect_time - instance.scrape_interval
                        )
                    )
                    return respond()

                if instance.stale_cache_strategy == instance.STALE_CACHE_FAIL:
                    # Fail if cache is stale
                    msg = (
                        'Gathering data took {:.2f} seconds, metrics are stale for {:.2f} seconds, '
                        'returning "service unavailable".'.format(
                            instance.collect_time,
                            instance.collect_time - instance.scrape_interval,
                        )
                    )
                    instance.log.error(msg)
                    raise cherrypy.HTTPError(503, msg)

        # Make the cache timeout for collecting configurable
        self.scrape_interval = float(self.get_localized_module_option('scrape_interval', 15.0))

        # Unknown strategy values fall back to the fail-hard behaviour.
        self.stale_cache_strategy = self.get_localized_module_option('stale_cache_strategy', 'log')
        if self.stale_cache_strategy not in [self.STALE_CACHE_FAIL,
                                             self.STALE_CACHE_RETURN]:
            self.stale_cache_strategy = self.STALE_CACHE_FAIL

        server_addr = self.get_localized_module_option(
            'server_addr', get_default_addr())
        server_port = self.get_localized_module_option(
            'server_port', DEFAULT_PORT)
        self.log.info(
            "server_addr: %s server_port: %s" %
            (server_addr, server_port)
        )

        # Publish the URI that others may use to access the service we're
        # about to start serving
        self.set_uri('http://{0}:{1}/'.format(
            socket.getfqdn() if server_addr in ['::', '0.0.0.0'] else server_addr,
            server_port
        ))

        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })
        cherrypy.tree.mount(Root(), "/")
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info('Engine started.')
        # wait for the shutdown event
        self.shutdown_event.wait()
        self.shutdown_event.clear()
        cherrypy.engine.stop()
        self.log.info('Engine stopped.')
        self.shutdown_rbd_stats()
94b18763 FG |
1237 | |
    def shutdown(self):
        # Wake up serve(), which performs the actual engine stop and the
        # RBD stats teardown on its own thread.
        self.log.info('Stopping engine...')
        self.shutdown_event.set()
94b18763 FG |
1241 | |
1242 | ||
class StandbyModule(MgrStandbyModule):
    """HTTP endpoint served while this mgr is a standby.

    '/' redirects browsers to the active mgr's exporter URI; '/metrics'
    returns an empty body so Prometheus scrapes of a standby succeed
    without duplicating the active mgr's data.
    """

    def __init__(self, *args, **kwargs):
        super(StandbyModule, self).__init__(*args, **kwargs)
        # Signalled by shutdown() to make serve() return.
        self.shutdown_event = threading.Event()

    def serve(self):
        """Run the standby HTTP endpoint until shutdown is requested."""
        server_addr = self.get_localized_module_option(
            'server_addr', get_default_addr())
        server_port = self.get_localized_module_option(
            'server_port', DEFAULT_PORT)
        self.log.info("server_addr: %s server_port: %s" %
                      (server_addr, server_port))
        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })

        # Captured by the Root handlers below to query the active mgr URI.
        module = self

        class Root(object):
            @cherrypy.expose
            def index(self):
                active_uri = module.get_active_uri()
                return '''<!DOCTYPE html>
<html>
    <head><title>Ceph Exporter</title></head>
    <body>
        <h1>Ceph Exporter</h1>
        <p><a href='{}metrics'>Metrics</a></p>
    </body>
</html>'''.format(active_uri)

            @cherrypy.expose
            def metrics(self):
                # Standby mgrs expose no metrics of their own.
                cherrypy.response.headers['Content-Type'] = 'text/plain'
                return ''

        cherrypy.tree.mount(Root(), '/', {})
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info('Engine started.')
        # Wait for shutdown event
        self.shutdown_event.wait()
        self.shutdown_event.clear()
        cherrypy.engine.stop()
        self.log.info('Engine stopped.')

    def shutdown(self):
        # Wake up serve(); it performs the actual CherryPy engine stop.
        self.log.info("Stopping engine...")
        self.shutdown_event.set()
        self.log.info("Stopped engine")