# ceph/src/pybind/mgr/prometheus/module.py
import cherrypy
from distutils.version import StrictVersion
import json
import errno
import math
import os
import re
import socket
import threading
import time
from mgr_module import MgrModule, MgrStandbyModule, CommandResult, PG_STATES
from mgr_util import get_default_addr
from rbd import RBD

# Defaults for the Prometheus HTTP server.  Can also be set in config-key;
# see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
# for the Prometheus exporter port registry

DEFAULT_PORT = 9283

# When the CherryPy server in 3.2.2 (and later) starts it attempts to verify
# that the ports it is listening on are in fact bound. When using the any
# address "::" it tries both ipv4 and ipv6, and in some environments (e.g.
# kubernetes) ipv6 isn't yet configured / supported and CherryPy throws an
# uncaught exception.
if cherrypy is not None:
    v = StrictVersion(cherrypy.__version__)
    # the issue was fixed in 3.2.3. it's present in 3.2.2 (current version on
    # centos:7) and back to at least 3.0.0.
    if StrictVersion("3.1.2") <= v < StrictVersion("3.2.3"):
        # https://github.com/cherrypy/cherrypy/issues/1100
        from cherrypy.process import servers
        servers.wait_for_occupied_port = lambda host, port: None

# cherrypy likes to sys.exit on error.  don't let it take us down too!
def os_exit_noop(*args, **kwargs):
    pass


os._exit = os_exit_noop

# to access things in class Module from subclass Root. Because
# it's a dict, the writer doesn't need to declare 'global' for access

_global_instance = {'plugin': None}


def global_instance():
    assert _global_instance['plugin'] is not None
    return _global_instance['plugin']


def health_status_to_number(status):
    if status == 'HEALTH_OK':
        return 0
    elif status == 'HEALTH_WARN':
        return 1
    elif status == 'HEALTH_ERR':
        return 2
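
# The numeric mapping above (0=OK, 1=WARN, 2=ERR) is what the exported
# ceph_health_status metric carries, so alert rules can simply threshold on
# it, e.g. `ceph_health_status >= 1` to fire on HEALTH_WARN or worse.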

DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_used_raw_bytes']

DF_POOL = ['max_avail', 'stored', 'stored_raw', 'objects', 'dirty',
           'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes']

OSD_POOL_STATS = ('recovering_objects_per_sec', 'recovering_bytes_per_sec',
                  'recovering_keys_per_sec', 'num_objects_recovered',
                  'num_bytes_recovered', 'num_keys_recovered')

OSD_FLAGS = ('noup', 'nodown', 'noout', 'noin', 'nobackfill', 'norebalance',
             'norecover', 'noscrub', 'nodeep-scrub')

FS_METADATA = ('data_pools', 'fs_id', 'metadata_pool', 'name')

MDS_METADATA = ('ceph_daemon', 'fs_id', 'hostname', 'public_addr', 'rank',
                'ceph_version')

MON_METADATA = ('ceph_daemon', 'hostname',
                'public_addr', 'rank', 'ceph_version')

MGR_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

MGR_STATUS = ('ceph_daemon',)

MGR_MODULE_STATUS = ('name',)

MGR_MODULE_CAN_RUN = ('name',)

OSD_METADATA = ('back_iface', 'ceph_daemon', 'cluster_addr', 'device_class',
                'front_iface', 'hostname', 'objectstore', 'public_addr',
                'ceph_version')

OSD_STATUS = ['weight', 'up', 'in']

OSD_STATS = ['apply_latency_ms', 'commit_latency_ms']

POOL_METADATA = ('pool_id', 'name')

RGW_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

RBD_MIRROR_METADATA = ('ceph_daemon', 'id', 'instance_id', 'hostname',
                       'ceph_version')

DISK_OCCUPATION = ('ceph_daemon', 'device', 'db_device',
                   'wal_device', 'instance')

NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']


class Metric(object):
    def __init__(self, mtype, name, desc, labels=None):
        self.mtype = mtype
        self.name = name
        self.desc = desc
        self.labelnames = labels    # tuple if present
        self.value = {}             # indexed by label values

    def clear(self):
        self.value = {}

    def set(self, value, labelvalues=None):
        # labelvalues must be a tuple
        labelvalues = labelvalues or ('',)
        self.value[labelvalues] = value
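
    # Typical use, for illustration:
    #   m = Metric('gauge', 'osd_weight', 'OSD weight', ('ceph_daemon',))
    #   m.set(1.0, ('osd.0',))  # -> ceph_osd_weight{ceph_daemon="osd.0"} 1.0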

    def str_expfmt(self):

        def promethize(path):
            ''' replace illegal metric name characters '''
            result = re.sub(r'[./\s]|::', '_', path).replace('+', '_plus')

            # Hyphens usually turn into underscores, unless they are
            # trailing
            if result.endswith("-"):
                result = result[0:-1] + "_minus"
            else:
                result = result.replace("-", "_")

            return "ceph_{0}".format(result)

        def floatstr(value):
            ''' represent as Go-compatible float '''
            if value == float('inf'):
                return '+Inf'
            if value == float('-inf'):
                return '-Inf'
            if math.isnan(value):
                return 'NaN'
            return repr(float(value))

        name = promethize(self.name)
        expfmt = '''
# HELP {name} {desc}
# TYPE {name} {mtype}'''.format(
            name=name,
            desc=self.desc,
            mtype=self.mtype,
        )

        for labelvalues, value in self.value.items():
            if self.labelnames:
                labels = zip(self.labelnames, labelvalues)
                labels = ','.join('%s="%s"' % (k, v) for k, v in labels)
            else:
                labels = ''
            if labels:
                fmtstr = '\n{name}{{{labels}}} {value}'
            else:
                fmtstr = '\n{name} {value}'
            expfmt += fmtstr.format(
                name=name,
                labels=labels,
                value=floatstr(value),
            )
        return expfmt
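
    # str_expfmt() renders one metric in the Prometheus exposition format,
    # e.g.:
    #   # HELP ceph_health_status Cluster health status
    #   # TYPE ceph_health_status untyped
    #   ceph_health_status 0.0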


class Module(MgrModule):
    COMMANDS = [
        {
            "cmd": "prometheus file_sd_config",
            "desc": "Return file_sd compatible prometheus config for mgr cluster",
            "perm": "r"
        },
    ]

    MODULE_OPTIONS = [
        {'name': 'server_addr'},
        {'name': 'server_port'},
        {'name': 'scrape_interval'},
        {'name': 'rbd_stats_pools'},
        {'name': 'rbd_stats_pools_refresh_interval', 'type': 'int', 'default': 300},
    ]
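
    # These options are read with get_localized_module_option(), so they can
    # be set globally or per-mgr, e.g.:
    #   ceph config set mgr mgr/prometheus/server_port 9283
    #   ceph config set mgr mgr/prometheus/scrape_interval 15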

    def __init__(self, *args, **kwargs):
        super(Module, self).__init__(*args, **kwargs)
        self.metrics = self._setup_static_metrics()
        self.shutdown_event = threading.Event()
        self.collect_lock = threading.RLock()
        self.collect_time = 0
        self.collect_timeout = 5.0
        self.collect_cache = None
        self.rbd_stats = {
            'pools': {},
            'pools_refresh_time': 0,
            'counters_info': {
                'write_ops': {'type': self.PERFCOUNTER_COUNTER,
                              'desc': 'RBD image writes count'},
                'read_ops': {'type': self.PERFCOUNTER_COUNTER,
                             'desc': 'RBD image reads count'},
                'write_bytes': {'type': self.PERFCOUNTER_COUNTER,
                                'desc': 'RBD image bytes written'},
                'read_bytes': {'type': self.PERFCOUNTER_COUNTER,
                               'desc': 'RBD image bytes read'},
                'write_latency': {'type': self.PERFCOUNTER_LONGRUNAVG,
                                  'desc': 'RBD image writes latency (msec)'},
                'read_latency': {'type': self.PERFCOUNTER_LONGRUNAVG,
                                 'desc': 'RBD image reads latency (msec)'},
            },
        }
        _global_instance['plugin'] = self

    def _setup_static_metrics(self):
        metrics = {}
        metrics['health_status'] = Metric(
            'untyped',
            'health_status',
            'Cluster health status'
        )
        metrics['mon_quorum_status'] = Metric(
            'gauge',
            'mon_quorum_status',
            'Monitors in quorum',
            ('ceph_daemon',)
        )
        metrics['fs_metadata'] = Metric(
            'untyped',
            'fs_metadata',
            'FS Metadata',
            FS_METADATA
        )
        metrics['mds_metadata'] = Metric(
            'untyped',
            'mds_metadata',
            'MDS Metadata',
            MDS_METADATA
        )
        metrics['mon_metadata'] = Metric(
            'untyped',
            'mon_metadata',
            'MON Metadata',
            MON_METADATA
        )
        metrics['mgr_metadata'] = Metric(
            'gauge',
            'mgr_metadata',
            'MGR metadata',
            MGR_METADATA
        )
        metrics['mgr_status'] = Metric(
            'gauge',
            'mgr_status',
            'MGR status (0=standby, 1=active)',
            MGR_STATUS
        )
        metrics['mgr_module_status'] = Metric(
            'gauge',
            'mgr_module_status',
            'MGR module status (0=disabled, 1=enabled, 2=auto-enabled)',
            MGR_MODULE_STATUS
        )
        metrics['mgr_module_can_run'] = Metric(
            'gauge',
            'mgr_module_can_run',
            'MGR module runnable state i.e. can it run (0=no, 1=yes)',
            MGR_MODULE_CAN_RUN
        )
        metrics['osd_metadata'] = Metric(
            'untyped',
            'osd_metadata',
            'OSD Metadata',
            OSD_METADATA
        )

        # The reason for having this separate to OSD_METADATA is
        # so that we can stably use the same tag names that
        # the Prometheus node_exporter does
        metrics['disk_occupation'] = Metric(
            'untyped',
            'disk_occupation',
            'Associate Ceph daemon with disk used',
            DISK_OCCUPATION
        )

        metrics['pool_metadata'] = Metric(
            'untyped',
            'pool_metadata',
            'POOL Metadata',
            POOL_METADATA
        )

        metrics['rgw_metadata'] = Metric(
            'untyped',
            'rgw_metadata',
            'RGW Metadata',
            RGW_METADATA
        )

        metrics['rbd_mirror_metadata'] = Metric(
            'untyped',
            'rbd_mirror_metadata',
            'RBD Mirror Metadata',
            RBD_MIRROR_METADATA
        )

        metrics['pg_total'] = Metric(
            'gauge',
            'pg_total',
            'PG Total Count per Pool',
            ('pool_id',)
        )

        for flag in OSD_FLAGS:
            path = 'osd_flag_{}'.format(flag)
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD Flag {}'.format(flag)
            )
        for state in OSD_STATUS:
            path = 'osd_{}'.format(state)
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD status {}'.format(state),
                ('ceph_daemon',)
            )
        for stat in OSD_STATS:
            path = 'osd_{}'.format(stat)
            metrics[path] = Metric(
                'gauge',
                path,
                'OSD stat {}'.format(stat),
                ('ceph_daemon',)
            )
        for stat in OSD_POOL_STATS:
            path = 'pool_{}'.format(stat)
            metrics[path] = Metric(
                'gauge',
                path,
                "OSD pool stats: {}".format(stat),
                ('pool_id',)
            )
        for state in PG_STATES:
            path = 'pg_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'PG {} per pool'.format(state),
                ('pool_id',)
            )
        for state in DF_CLUSTER:
            path = 'cluster_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'DF {}'.format(state),
            )
        for state in DF_POOL:
            path = 'pool_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'DF pool {}'.format(state),
                ('pool_id',)
            )
        for state in NUM_OBJECTS:
            path = 'num_objects_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'Number of {} objects'.format(state),
            )

        return metrics

    def get_health(self):
        health = json.loads(self.get('health')['json'])
        self.metrics['health_status'].set(
            health_status_to_number(health['status'])
        )

    def get_pool_stats(self):
        # retrieve pool stats to provide per pool recovery metrics
        # (osd_pool_stats moved to mgr in Mimic)
        pstats = self.get('osd_pool_stats')
        for pool in pstats['pool_stats']:
            for stat in OSD_POOL_STATS:
                self.metrics['pool_{}'.format(stat)].set(
                    pool['recovery_rate'].get(stat, 0),
                    (pool['pool_id'],)
                )

    def get_df(self):
        # maybe get the to-be-exported metrics from a config?
        df = self.get('df')
        for stat in DF_CLUSTER:
            self.metrics['cluster_{}'.format(stat)].set(df['stats'][stat])

        for pool in df['pools']:
            for stat in DF_POOL:
                self.metrics['pool_{}'.format(stat)].set(
                    pool['stats'][stat],
                    (pool['id'],)
                )

    def get_fs(self):
        fs_map = self.get('fs_map')
        servers = self.get_service_list()
        self.log.debug('standbys: {}'.format(fs_map['standbys']))
        # export standby mds metadata, default standby fs_id is '-1'
        for standby in fs_map['standbys']:
            id_ = standby['name']
            host_version = servers.get((id_, 'mds'), ('', ''))
            self.metrics['mds_metadata'].set(1, (
                'mds.{}'.format(id_), '-1',
                host_version[0], standby['addr'],
                standby['rank'], host_version[1]
            ))
        for fs in fs_map['filesystems']:
            # collect fs metadata
            data_pools = ",".join([str(pool)
                                   for pool in fs['mdsmap']['data_pools']])
            self.metrics['fs_metadata'].set(1, (
                data_pools,
                fs['id'],
                fs['mdsmap']['metadata_pool'],
                fs['mdsmap']['fs_name']
            ))
            self.log.debug('mdsmap: {}'.format(fs['mdsmap']))
            for gid, daemon in fs['mdsmap']['info'].items():
                id_ = daemon['name']
                host_version = servers.get((id_, 'mds'), ('', ''))
                self.metrics['mds_metadata'].set(1, (
                    'mds.{}'.format(id_), fs['id'],
                    host_version[0], daemon['addr'],
                    daemon['rank'], host_version[1]
                ))

    def get_quorum_status(self):
        mon_status = json.loads(self.get('mon_status')['json'])
        servers = self.get_service_list()
        for mon in mon_status['monmap']['mons']:
            rank = mon['rank']
            id_ = mon['name']
            host_version = servers.get((id_, 'mon'), ('', ''))
            self.metrics['mon_metadata'].set(1, (
                'mon.{}'.format(id_), host_version[0],
                mon['public_addr'].split(':')[0], rank,
                host_version[1]
            ))
            in_quorum = int(rank in mon_status['quorum'])
            self.metrics['mon_quorum_status'].set(in_quorum, (
                'mon.{}'.format(id_),
            ))

    def get_mgr_status(self):
        mgr_map = self.get('mgr_map')
        servers = self.get_service_list()

        active = mgr_map['active_name']
        standbys = [s.get('name') for s in mgr_map['standbys']]

        all_mgrs = list(standbys)
        all_mgrs.append(active)

        all_modules = {module.get('name'): module.get('can_run')
                       for module in mgr_map['available_modules']}

        ceph_release = None
        for mgr in all_mgrs:
            host_version = servers.get((mgr, 'mgr'), ('', ''))
            if mgr == active:
                _state = 1
                ceph_release = host_version[1].split()[-2]  # e.g. nautilus
            else:
                _state = 0

            self.metrics['mgr_metadata'].set(1, (
                'mgr.{}'.format(mgr), host_version[0],
                host_version[1]
            ))
            self.metrics['mgr_status'].set(_state, (
                'mgr.{}'.format(mgr),
            ))
        always_on_modules = mgr_map['always_on_modules'].get(ceph_release, [])
        active_modules = list(always_on_modules)
        active_modules.extend(mgr_map['modules'])

        for mod_name in all_modules.keys():
            if mod_name in always_on_modules:
                _state = 2
            elif mod_name in active_modules:
                _state = 1
            else:
                _state = 0

            _can_run = 1 if all_modules[mod_name] else 0
            self.metrics['mgr_module_status'].set(_state, (mod_name,))
            self.metrics['mgr_module_can_run'].set(_can_run, (mod_name,))

    def get_pg_status(self):

        pg_summary = self.get('pg_summary')

        for pool in pg_summary['by_pool']:
            num_by_state = dict((state, 0) for state in PG_STATES)
            num_by_state['total'] = 0

            for state_name, count in pg_summary['by_pool'][pool].items():
                for state in state_name.split('+'):
                    num_by_state[state] += count
                num_by_state['total'] += count

            for state, num in num_by_state.items():
                try:
                    self.metrics["pg_{}".format(state)].set(num, (pool,))
                except KeyError:
                    self.log.warning("skipping pg in unknown state {}".format(state))

    def get_osd_stats(self):
        osd_stats = self.get('osd_stats')
        for osd in osd_stats['osd_stats']:
            id_ = osd['osd']
            for stat in OSD_STATS:
                val = osd['perf_stat'][stat]
                self.metrics['osd_{}'.format(stat)].set(val, (
                    'osd.{}'.format(id_),
                ))

    def get_service_list(self):
        ret = {}
        for server in self.list_servers():
            version = server.get('ceph_version', '')
            host = server.get('hostname', '')
            for service in server.get('services', []):
                ret.update({(service['id'], service['type']): (host, version)})
        return ret
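
    # For illustration, the returned dict maps (service id, service type) to
    # (hostname, ceph version string), e.g.:
    #   {('0', 'osd'): ('host1', 'ceph version 14.2.x (...) nautilus (stable)')}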

    def get_metadata_and_osd_status(self):
        osd_map = self.get('osd_map')
        osd_flags = osd_map['flags'].split(',')
        for flag in OSD_FLAGS:
            self.metrics['osd_flag_{}'.format(flag)].set(
                int(flag in osd_flags)
            )

        osd_devices = self.get('osd_map_crush')['devices']
        servers = self.get_service_list()
        for osd in osd_map['osds']:
            # id can be used to link osd metrics and metadata
            id_ = osd['osd']
            # collect osd metadata
            p_addr = osd['public_addr'].split(':')[0]
            c_addr = osd['cluster_addr'].split(':')[0]
            if p_addr == "-" or c_addr == "-":
                self.log.info(
                    "Missing address metadata for osd {0}, skipping occupation"
                    " and metadata records for this osd".format(id_)
                )
                continue

            dev_class = None
            for osd_device in osd_devices:
                if osd_device['id'] == id_:
                    dev_class = osd_device.get('class', '')
                    break

            if dev_class is None:
                self.log.info("OSD {0} is missing from CRUSH map, "
                              "skipping output".format(id_))
                continue

            host_version = servers.get((str(id_), 'osd'), ('', ''))

            # collect disk occupation metadata
            osd_metadata = self.get_metadata("osd", str(id_))
            if osd_metadata is None:
                continue

            obj_store = osd_metadata.get('osd_objectstore', '')
            f_iface = osd_metadata.get('front_iface', '')
            b_iface = osd_metadata.get('back_iface', '')

            self.metrics['osd_metadata'].set(1, (
                b_iface,
                'osd.{}'.format(id_),
                c_addr,
                dev_class,
                f_iface,
                host_version[0],
                obj_store,
                p_addr,
                host_version[1]
            ))

            # collect osd status
            for state in OSD_STATUS:
                status = osd[state]
                self.metrics['osd_{}'.format(state)].set(status, (
                    'osd.{}'.format(id_),
                ))

            osd_dev_node = osd_wal_dev_node = osd_db_dev_node = None
            if obj_store == "filestore":
                # collect filestore backend device
                osd_dev_node = osd_metadata.get(
                    'backend_filestore_dev_node', None)
                # collect filestore journal device
                osd_wal_dev_node = osd_metadata.get('osd_journal', '')
                osd_db_dev_node = ''
            elif obj_store == "bluestore":
                # collect bluestore backend device
                osd_dev_node = osd_metadata.get(
                    'bluestore_bdev_dev_node', None)
                # collect bluestore wal backend
                osd_wal_dev_node = osd_metadata.get('bluefs_wal_dev_node', '')
                # collect bluestore db backend
                osd_db_dev_node = osd_metadata.get('bluefs_db_dev_node', '')
            if osd_dev_node and osd_dev_node == "unknown":
                osd_dev_node = None

            osd_hostname = osd_metadata.get('hostname', None)
            if osd_dev_node and osd_hostname:
                self.log.debug("Got dev for osd {0}: {1}/{2}".format(
                    id_, osd_hostname, osd_dev_node))
                self.metrics['disk_occupation'].set(1, (
                    "osd.{0}".format(id_),
                    osd_dev_node,
                    osd_db_dev_node,
                    osd_wal_dev_node,
                    osd_hostname
                ))
            else:
                self.log.info("Missing dev node metadata for osd {0}, skipping "
                              "occupation record for this osd".format(id_))

        for pool in osd_map['pools']:
            self.metrics['pool_metadata'].set(
                1, (pool['pool'], pool['pool_name']))

        # Populate other servers metadata
        for key, value in servers.items():
            service_id, service_type = key
            if service_type == 'rgw':
                hostname, version = value
                self.metrics['rgw_metadata'].set(
                    1,
                    ('{}.{}'.format(service_type, service_id),
                     hostname, version)
                )
            elif service_type == 'rbd-mirror':
                mirror_metadata = self.get_metadata('rbd-mirror', service_id)
                if mirror_metadata is None:
                    continue
                mirror_metadata['ceph_daemon'] = '{}.{}'.format(service_type,
                                                                service_id)
                self.metrics['rbd_mirror_metadata'].set(
                    1, (mirror_metadata.get(k, '')
                        for k in RBD_MIRROR_METADATA)
                )
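
    # disk_occupation deliberately reuses node_exporter's 'device'/'instance'
    # label names so that ceph_disk_occupation can be joined in PromQL against
    # node_exporter's per-device series (e.g. the node_disk_* metrics).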

    def get_num_objects(self):
        pg_sum = self.get('pg_summary')['pg_stats_sum']['stat_sum']
        for obj in NUM_OBJECTS:
            stat = 'num_objects_{}'.format(obj)
            self.metrics[stat].set(pg_sum[stat])

    def get_rbd_stats(self):
        # Per-RBD-image stats are collected by registering a dynamic osd perf
        # stats query that tells OSDs to group stats for requests associated
        # with RBD objects by pool, namespace, and image id, which are
        # extracted from the request object names or other attributes.
        # The RBD object names have the following prefixes:
        #   - rbd_data.{image_id}. (data stored in the same pool as metadata)
        #   - rbd_data.{pool_id}.{image_id}. (data stored in a dedicated data pool)
        #   - journal_data.{pool_id}.{image_id}. (journal if journaling is enabled)
        # The pool_id in the object name is the id of the pool with the image
        # metadata, and should be used in the image spec. If there is no
        # pool_id in the object name, the image pool is the pool where the
        # object is located.

        # Parse rbd_stats_pools option, which is a comma or space separated
        # list of pool[/namespace] entries. If no namespace is specified the
        # stats are collected for every namespace in the pool.
        pools_string = self.get_localized_module_option('rbd_stats_pools', '')
        pools = {}
        for p in [x for x in re.split(r'[\s,]+', pools_string) if x]:
            s = p.split('/')
            pool_name = s[0]
            if len(s) == 1:
                # empty set means collect for all namespaces
                pools[pool_name] = set()
                continue
            if pool_name not in pools:
                pools[pool_name] = set()
            elif not pools[pool_name]:
                continue
            pools[pool_name].add(s[1])

        rbd_stats_pools = {}
        for pool_id in list(self.rbd_stats['pools']):
            name = self.rbd_stats['pools'][pool_id]['name']
            if name not in pools:
                del self.rbd_stats['pools'][pool_id]
            else:
                rbd_stats_pools[name] = \
                    self.rbd_stats['pools'][pool_id]['ns_names']

        pools_refreshed = False
        if pools:
            next_refresh = self.rbd_stats['pools_refresh_time'] + \
                self.get_localized_module_option(
                    'rbd_stats_pools_refresh_interval', 300)
            if rbd_stats_pools != pools or time.time() >= next_refresh:
                self.refresh_rbd_stats_pools(pools)
                pools_refreshed = True

        pool_ids = list(self.rbd_stats['pools'])
        pool_ids.sort()
        pool_id_regex = '^(' + '|'.join([str(x) for x in pool_ids]) + ')$'

        nspace_names = []
        for pool_id, pool in self.rbd_stats['pools'].items():
            if pool['ns_names']:
                nspace_names.extend(pool['ns_names'])
            else:
                nspace_names = []
                break
        if nspace_names:
            namespace_regex = '^(' + \
                              "|".join([re.escape(x)
                                        for x in set(nspace_names)]) + ')$'
        else:
            namespace_regex = '^(.*)$'

        if 'query' in self.rbd_stats and \
           (pool_id_regex != self.rbd_stats['query']['key_descriptor'][0]['regex'] or
                namespace_regex != self.rbd_stats['query']['key_descriptor'][1]['regex']):
            self.remove_osd_perf_query(self.rbd_stats['query_id'])
            del self.rbd_stats['query_id']
            del self.rbd_stats['query']

        if not self.rbd_stats['pools']:
            return

        counters_info = self.rbd_stats['counters_info']

        if 'query_id' not in self.rbd_stats:
            query = {
                'key_descriptor': [
                    {'type': 'pool_id', 'regex': pool_id_regex},
                    {'type': 'namespace', 'regex': namespace_regex},
                    {'type': 'object_name',
                     'regex': r'^(?:rbd|journal)_data\.(?:([0-9]+)\.)?([^.]+)\.'},
                ],
                'performance_counter_descriptors': list(counters_info),
            }
            query_id = self.add_osd_perf_query(query)
            if query_id is None:
                self.log.error('failed to add query %s' % query)
                return
            self.rbd_stats['query'] = query
            self.rbd_stats['query_id'] = query_id

        res = self.get_osd_perf_counters(self.rbd_stats['query_id'])
        for c in res['counters']:
            # if the pool id is not found in the object name use id of the
            # pool where the object is located
            if c['k'][2][0]:
                pool_id = int(c['k'][2][0])
            else:
                pool_id = int(c['k'][0][0])
            if pool_id not in self.rbd_stats['pools'] and not pools_refreshed:
                self.refresh_rbd_stats_pools(pools)
                pools_refreshed = True
            if pool_id not in self.rbd_stats['pools']:
                continue
            pool = self.rbd_stats['pools'][pool_id]
            nspace_name = c['k'][1][0]
            if nspace_name not in pool['images']:
                continue
            image_id = c['k'][2][1]
            if image_id not in pool['images'][nspace_name] and \
               not pools_refreshed:
                self.refresh_rbd_stats_pools(pools)
                pool = self.rbd_stats['pools'][pool_id]
                pools_refreshed = True
            if image_id not in pool['images'][nspace_name]:
                continue
            counters = pool['images'][nspace_name][image_id]['c']
            for i in range(len(c['c'])):
                counters[i][0] += c['c'][i][0]
                counters[i][1] += c['c'][i][1]

        label_names = ("pool", "namespace", "image")
        for pool_id, pool in self.rbd_stats['pools'].items():
            pool_name = pool['name']
            for nspace_name, images in pool['images'].items():
                for image_id in images:
                    image_name = images[image_id]['n']
                    counters = images[image_id]['c']
                    i = 0
                    for key in counters_info:
                        counter_info = counters_info[key]
                        stattype = self._stattype_to_str(counter_info['type'])
                        labels = (pool_name, nspace_name, image_name)
                        if counter_info['type'] == self.PERFCOUNTER_COUNTER:
                            path = 'rbd_' + key
                            if path not in self.metrics:
                                self.metrics[path] = Metric(
                                    stattype,
                                    path,
                                    counter_info['desc'],
                                    label_names,
                                )
                            self.metrics[path].set(counters[i][0], labels)
                        elif counter_info['type'] == self.PERFCOUNTER_LONGRUNAVG:
                            path = 'rbd_' + key + '_sum'
                            if path not in self.metrics:
                                self.metrics[path] = Metric(
                                    stattype,
                                    path,
                                    counter_info['desc'] + ' Total',
                                    label_names,
                                )
                            self.metrics[path].set(counters[i][0], labels)
                            path = 'rbd_' + key + '_count'
                            if path not in self.metrics:
                                self.metrics[path] = Metric(
                                    'counter',
                                    path,
                                    counter_info['desc'] + ' Count',
                                    label_names,
                                )
                            self.metrics[path].set(counters[i][1], labels)
                        i += 1
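
    # rbd_stats_pools takes comma- or space-separated pool[/namespace]
    # entries; for example:
    #   ceph config set mgr mgr/prometheus/rbd_stats_pools "rbd,volumes/ns1"
    # enables per-image stats for every namespace of 'rbd' and only for
    # namespace 'ns1' of 'volumes'.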

    def refresh_rbd_stats_pools(self, pools):
        self.log.debug('refreshing rbd pools %s' % (pools))

        rbd = RBD()
        counters_info = self.rbd_stats['counters_info']
        for pool_name, cfg_ns_names in pools.items():
            try:
                pool_id = self.rados.pool_lookup(pool_name)
                with self.rados.open_ioctx(pool_name) as ioctx:
                    if pool_id not in self.rbd_stats['pools']:
                        self.rbd_stats['pools'][pool_id] = {'images': {}}
                    pool = self.rbd_stats['pools'][pool_id]
                    pool['name'] = pool_name
                    pool['ns_names'] = cfg_ns_names
                    if cfg_ns_names:
                        nspace_names = list(cfg_ns_names)
                    else:
                        nspace_names = [''] + rbd.namespace_list(ioctx)
                    for nspace_name in list(pool['images']):
                        if nspace_name not in nspace_names:
                            del pool['images'][nspace_name]
                    for nspace_name in nspace_names:
                        if (nspace_name and
                                not rbd.namespace_exists(ioctx, nspace_name)):
                            self.log.debug('unknown namespace %s for pool %s' %
                                           (nspace_name, pool_name))
                            continue
                        ioctx.set_namespace(nspace_name)
                        if nspace_name not in pool['images']:
                            pool['images'][nspace_name] = {}
                        namespace = pool['images'][nspace_name]
                        images = {}
                        for image_meta in RBD().list2(ioctx):
                            image = {'n': image_meta['name']}
                            image_id = image_meta['id']
                            if image_id in namespace:
                                image['c'] = namespace[image_id]['c']
                            else:
                                image['c'] = [[0, 0] for x in counters_info]
                            images[image_id] = image
                        pool['images'][nspace_name] = images
            except Exception as e:
                self.log.error('failed listing pool %s: %s' % (pool_name, e))
        self.rbd_stats['pools_refresh_time'] = time.time()

    def shutdown_rbd_stats(self):
        if 'query_id' in self.rbd_stats:
            self.remove_osd_perf_query(self.rbd_stats['query_id'])
            del self.rbd_stats['query_id']
            del self.rbd_stats['query']
        self.rbd_stats['pools'].clear()

    def add_fixed_name_metrics(self):
        """
        Add fixed name metrics from existing ones that have details in their
        names that should be in labels (not in the name).
        For backward compatibility, a new fixed name metric is created
        (instead of replacing the old one) and the details are put in new
        labels.
        Intended for RGW sync perf. counters but extendable as required.
        See: https://tracker.ceph.com/issues/45311
        """
        new_metrics = {}
        for metric_path in self.metrics.keys():
            # Address RGW sync perf. counters.
            match = re.search(r'^data-sync-from-(.*)\.', metric_path)
            if match:
                new_path = re.sub('from-([^.]*)', 'from-zone', metric_path)
                if new_path not in new_metrics:
                    new_metrics[new_path] = Metric(
                        self.metrics[metric_path].mtype,
                        new_path,
                        self.metrics[metric_path].desc,
                        self.metrics[metric_path].labelnames + ('source_zone',)
                    )
                for label_values, value in self.metrics[metric_path].value.items():
                    new_metrics[new_path].set(value, label_values + (match.group(1),))

        self.metrics.update(new_metrics)

    def collect(self):
        # Clear the metrics before scraping
        for k in self.metrics.keys():
            self.metrics[k].clear()

        self.get_health()
        self.get_df()
        self.get_pool_stats()
        self.get_fs()
        self.get_osd_stats()
        self.get_quorum_status()
        self.get_mgr_status()
        self.get_metadata_and_osd_status()
        self.get_pg_status()
        self.get_num_objects()

        for daemon, counters in self.get_all_perf_counters().items():
            for path, counter_info in counters.items():
                # Skip histograms, they are represented by long running avgs
                stattype = self._stattype_to_str(counter_info['type'])
                if not stattype or stattype == 'histogram':
                    self.log.debug('ignoring %s, type %s' % (path, stattype))
                    continue

                path, label_names, labels = self._perfpath_to_path_labels(
                    daemon, path)

                # Get the value of the counter
                value = self._perfvalue_to_value(
                    counter_info['type'], counter_info['value'])

                # Represent the long running avgs as sum/count pairs
                if counter_info['type'] & self.PERFCOUNTER_LONGRUNAVG:
                    _path = path + '_sum'
                    if _path not in self.metrics:
                        self.metrics[_path] = Metric(
                            stattype,
                            _path,
                            counter_info['description'] + ' Total',
                            label_names,
                        )
                    self.metrics[_path].set(value, labels)

                    _path = path + '_count'
                    if _path not in self.metrics:
                        self.metrics[_path] = Metric(
                            'counter',
                            _path,
                            counter_info['description'] + ' Count',
                            label_names,
                        )
                    self.metrics[_path].set(counter_info['count'], labels)
                else:
                    if path not in self.metrics:
                        self.metrics[path] = Metric(
                            stattype,
                            path,
                            counter_info['description'],
                            label_names,
                        )
                    self.metrics[path].set(value, labels)

        self.add_fixed_name_metrics()
        self.get_rbd_stats()

        # Return formatted metrics and clear no longer used data
        _metrics = [m.str_expfmt() for m in self.metrics.values()]
        for k in self.metrics.keys():
            self.metrics[k].clear()

        return ''.join(_metrics) + '\n'

    def get_file_sd_config(self):
        servers = self.list_servers()
        targets = []
        for server in servers:
            hostname = server.get('hostname', '')
            for service in server.get('services', []):
                if service['type'] != 'mgr':
                    continue
                id_ = service['id']

                # get port for prometheus module at mgr with id_
                # TODO use get_config_prefix or get_config here once
                # https://github.com/ceph/ceph/pull/20458 is merged
                result = CommandResult("")
                global_instance().send_command(
                    result, "mon", '',
                    json.dumps({
                        "prefix": "config-key get",
                        'key': "config/mgr/mgr/prometheus/{}/server_port".format(id_),
                    }),
                    "")
                r, outb, outs = result.wait()
                if r != 0:
                    global_instance().log.error("Failed to retrieve port for mgr {}: {}".format(id_, outs))
                    targets.append('{}:{}'.format(hostname, DEFAULT_PORT))
                else:
                    port = json.loads(outb)
                    targets.append('{}:{}'.format(hostname, port))

        ret = [
            {
                "targets": targets,
                "labels": {}
            }
        ]
        return 0, json.dumps(ret), ""

    def self_test(self):
        self.collect()
        self.get_file_sd_config()

    def handle_command(self, inbuf, cmd):
        if cmd['prefix'] == 'prometheus file_sd_config':
            return self.get_file_sd_config()
        else:
            return (-errno.EINVAL, '',
                    "Command not found '{0}'".format(cmd['prefix']))

    def serve(self):

        class Root(object):

            # collapse everything to '/'
            def _cp_dispatch(self, vpath):
                cherrypy.request.path = ''
                return self

            @cherrypy.expose
            def index(self):
                return '''<!DOCTYPE html>
<html>
    <head><title>Ceph Exporter</title></head>
    <body>
        <h1>Ceph Exporter</h1>
        <p><a href='/metrics'>Metrics</a></p>
    </body>
</html>'''

            @cherrypy.expose
            def metrics(self):
                instance = global_instance()
                # Lock the function execution
                try:
                    instance.collect_lock.acquire()
                    return self._metrics(instance)
                finally:
                    instance.collect_lock.release()

            @staticmethod
            def _metrics(instance):
                # Return cached data if available and collected before the
                # cache times out
                if instance.collect_cache and time.time() - instance.collect_time < instance.collect_timeout:
                    cherrypy.response.headers['Content-Type'] = 'text/plain'
                    return instance.collect_cache

                if instance.have_mon_connection():
                    instance.collect_cache = None
                    instance.collect_time = time.time()
                    instance.collect_cache = instance.collect()
                    cherrypy.response.headers['Content-Type'] = 'text/plain'
                    return instance.collect_cache
                else:
                    raise cherrypy.HTTPError(503, 'No MON connection')

        # Make the cache timeout for collecting configurable
        self.collect_timeout = float(self.get_localized_module_option(
            'scrape_interval', 5.0))

        server_addr = self.get_localized_module_option(
            'server_addr', get_default_addr())
        server_port = self.get_localized_module_option(
            'server_port', DEFAULT_PORT)
        self.log.info(
            "server_addr: %s server_port: %s" %
            (server_addr, server_port)
        )

        # Publish the URI that others may use to access the service we're
        # about to start serving
        self.set_uri('http://{0}:{1}/'.format(
            socket.getfqdn() if server_addr in ['::', '0.0.0.0'] else server_addr,
            server_port
        ))

        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })
        cherrypy.tree.mount(Root(), "/")
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info('Engine started.')
        # wait for the shutdown event
        self.shutdown_event.wait()
        self.shutdown_event.clear()
        cherrypy.engine.stop()
        self.log.info('Engine stopped.')
        self.shutdown_rbd_stats()

    def shutdown(self):
        self.log.info('Stopping engine...')
        self.shutdown_event.set()
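

# The active mgr caches each collection in collect_cache for collect_timeout
# (the scrape_interval option) seconds and serialises scrapes behind
# collect_lock, so rapid or concurrent scrapes reuse the cached payload
# rather than triggering a fresh collection each time.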


class StandbyModule(MgrStandbyModule):
    def __init__(self, *args, **kwargs):
        super(StandbyModule, self).__init__(*args, **kwargs)
        self.shutdown_event = threading.Event()

    def serve(self):
        server_addr = self.get_localized_module_option(
            'server_addr', get_default_addr())
        server_port = self.get_localized_module_option(
            'server_port', DEFAULT_PORT)
        self.log.info("server_addr: %s server_port: %s" %
                      (server_addr, server_port))
        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })

        module = self

        class Root(object):
            @cherrypy.expose
            def index(self):
                active_uri = module.get_active_uri()
                return '''<!DOCTYPE html>
<html>
    <head><title>Ceph Exporter</title></head>
    <body>
        <h1>Ceph Exporter</h1>
        <p><a href='{}metrics'>Metrics</a></p>
    </body>
</html>'''.format(active_uri)

            @cherrypy.expose
            def metrics(self):
                cherrypy.response.headers['Content-Type'] = 'text/plain'
                return ''

        cherrypy.tree.mount(Root(), '/', {})
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info('Engine started.')
        # Wait for shutdown event
        self.shutdown_event.wait()
        self.shutdown_event.clear()
        cherrypy.engine.stop()
        self.log.info('Engine stopped.')

    def shutdown(self):
        self.log.info("Stopping engine...")
        self.shutdown_event.set()
        self.log.info("Stopped engine")