from distutils.version import StrictVersion

import cherrypy
import errno
import json
import math
import os
import re
import socket
import threading
import time

from mgr_module import MgrModule, MgrStandbyModule, CommandResult, PG_STATES
from mgr_util import get_default_addr
from rbd import RBD

# Defaults for the Prometheus HTTP server.  Can also set in config-key
# see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
# for Prometheus exporter port registry

DEFAULT_PORT = 9283

# When the CherryPy server in 3.2.2 (and later) starts it attempts to verify
# that the ports it's listening on are in fact bound. When using the any address
# "::" it tries both ipv4 and ipv6, and in some environments (e.g. kubernetes)
# ipv6 isn't yet configured / supported and CherryPy throws an uncaught
# exception.
if cherrypy is not None:
    v = StrictVersion(cherrypy.__version__)
    # the issue was fixed in 3.2.3. it's present in 3.2.2 (current version on
    # centos:7) and back to at least 3.0.0.
    if StrictVersion("3.1.2") <= v < StrictVersion("3.2.3"):
        # https://github.com/cherrypy/cherrypy/issues/1100
        from cherrypy.process import servers
        servers.wait_for_occupied_port = lambda host, port: None


# cherrypy likes to sys.exit on error.  don't let it take us down too!
def os_exit_noop(*args, **kwargs):
    pass


os._exit = os_exit_noop

# to access things in class Module from subclass Root. Because
# it's a dict, the writer doesn't need to declare 'global' for access
_global_instance = {'plugin': None}


def global_instance():
    assert _global_instance['plugin'] is not None
    return _global_instance['plugin']


def health_status_to_number(status):
    if status == 'HEALTH_OK':
        return 0
    elif status == 'HEALTH_WARN':
        return 1
    elif status == 'HEALTH_ERR':
        return 2
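

# Illustrative only (editor's example, not part of the module): with the
# mapping above, alerting on any non-OK cluster state can be expressed
# against the ceph_health_status metric this module exports, e.g.
#
#   - alert: CephHealthNotOK
#     expr: ceph_health_status > 0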


DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_used_raw_bytes']

DF_POOL = ['max_avail', 'stored', 'stored_raw', 'objects', 'dirty',
           'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes']

OSD_POOL_STATS = ('recovering_objects_per_sec', 'recovering_bytes_per_sec',
                  'recovering_keys_per_sec', 'num_objects_recovered',
                  'num_bytes_recovered', 'num_keys_recovered')

OSD_FLAGS = ('noup', 'nodown', 'noout', 'noin', 'nobackfill', 'norebalance',
             'norecover', 'noscrub', 'nodeep-scrub')

FS_METADATA = ('data_pools', 'fs_id', 'metadata_pool', 'name')

MDS_METADATA = ('ceph_daemon', 'fs_id', 'hostname', 'public_addr', 'rank',
                'ceph_version')

MON_METADATA = ('ceph_daemon', 'hostname',
                'public_addr', 'rank', 'ceph_version')

MGR_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

MGR_STATUS = ('ceph_daemon',)

MGR_MODULE_STATUS = ('name',)

MGR_MODULE_CAN_RUN = ('name',)

OSD_METADATA = ('back_iface', 'ceph_daemon', 'cluster_addr', 'device_class',
                'front_iface', 'hostname', 'objectstore', 'public_addr',
                'ceph_version')

OSD_STATUS = ['weight', 'up', 'in']

OSD_STATS = ['apply_latency_ms', 'commit_latency_ms']

POOL_METADATA = ('pool_id', 'name')

RGW_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

RBD_MIRROR_METADATA = ('ceph_daemon', 'id', 'instance_id', 'hostname',
                       'ceph_version')

DISK_OCCUPATION = ('ceph_daemon', 'device', 'db_device',
                   'wal_device', 'instance')

NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']


class Metric(object):
    def __init__(self, mtype, name, desc, labels=None):
        self.mtype = mtype
        self.name = name
        self.desc = desc
        self.labelnames = labels  # tuple if present
        self.value = {}           # indexed by label values

    def clear(self):
        self.value = {}

    def set(self, value, labelvalues=None):
        # labelvalues must be a tuple
        labelvalues = labelvalues or ('',)
        self.value[labelvalues] = value

    def str_expfmt(self):

        def promethize(path):
            ''' replace illegal metric name characters '''
            result = re.sub(r'[./\s]|::', '_', path).replace('+', '_plus')

            # Hyphens usually turn into underscores, unless they are
            # trailing
            if result.endswith("-"):
                result = result[0:-1] + "_minus"

            result = result.replace("-", "_")

            return "ceph_{0}".format(result)

        def floatstr(value):
            ''' represent as Go-compatible float '''
            if value == float('inf'):
                return '+Inf'
            if value == float('-inf'):
                return '-Inf'
            if math.isnan(value):
                return 'NaN'
            return repr(float(value))

        name = promethize(self.name)
        expfmt = '''
# HELP {name} {desc}
# TYPE {name} {mtype}'''.format(
            name=name,
            desc=self.desc,
            mtype=self.mtype,
        )

        for labelvalues, value in self.value.items():
            if self.labelnames:
                labels = zip(self.labelnames, labelvalues)
                labels = ','.join('%s="%s"' % (k, v) for k, v in labels)
            else:
                labels = ''
            if labels:
                fmtstr = '\n{name}{{{labels}}} {value}'
            else:
                fmtstr = '\n{name} {value}'
            expfmt += fmtstr.format(
                name=name,
                labels=labels,
                value=floatstr(value),
            )
        return expfmt
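

# Illustrative sketch (editor's example, not part of the module): how a
# Metric renders via str_expfmt(). Assuming the 'mon_quorum_status' gauge
# defined below with a single 'ceph_daemon' label:
#
#   m = Metric('gauge', 'mon_quorum_status', 'Monitors in quorum',
#              ('ceph_daemon',))
#   m.set(1, ('mon.a',))
#   print(m.str_expfmt())
#
# would print, in the Prometheus exposition format:
#
#   # HELP ceph_mon_quorum_status Monitors in quorum
#   # TYPE ceph_mon_quorum_status gauge
#   ceph_mon_quorum_status{ceph_daemon="mon.a"} 1.0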


class Module(MgrModule):
    COMMANDS = [
        {
            "cmd": "prometheus file_sd_config",
            "desc": "Return file_sd compatible prometheus config for mgr cluster",
            "perm": "r"
        },
    ]

    MODULE_OPTIONS = [
        {'name': 'server_addr'},
        {'name': 'server_port'},
        {'name': 'scrape_interval'},
        {'name': 'rbd_stats_pools'},
        {'name': 'rbd_stats_pools_refresh_interval'},
    ]

    def __init__(self, *args, **kwargs):
        super(Module, self).__init__(*args, **kwargs)
        self.metrics = self._setup_static_metrics()
        self.shutdown_event = threading.Event()
        self.collect_lock = threading.RLock()
        self.collect_time = 0
        self.collect_timeout = 5.0
        self.collect_cache = None
        self.rbd_stats = {
            'pools': {},
            'pools_refresh_time': 0,
            'counters_info': {
                'write_ops': {'type': self.PERFCOUNTER_COUNTER,
                              'desc': 'RBD image writes count'},
                'read_ops': {'type': self.PERFCOUNTER_COUNTER,
                             'desc': 'RBD image reads count'},
                'write_bytes': {'type': self.PERFCOUNTER_COUNTER,
                                'desc': 'RBD image bytes written'},
                'read_bytes': {'type': self.PERFCOUNTER_COUNTER,
                               'desc': 'RBD image bytes read'},
                'write_latency': {'type': self.PERFCOUNTER_LONGRUNAVG,
                                  'desc': 'RBD image writes latency (msec)'},
                'read_latency': {'type': self.PERFCOUNTER_LONGRUNAVG,
                                 'desc': 'RBD image reads latency (msec)'},
            },
        }

        _global_instance['plugin'] = self

    def _setup_static_metrics(self):
        metrics = {}
        metrics['health_status'] = Metric(
            'untyped',
            'health_status',
            'Cluster health status'
        )
        metrics['mon_quorum_status'] = Metric(
            'gauge',
            'mon_quorum_status',
            'Monitors in quorum',
            ('ceph_daemon',)
        )
        metrics['fs_metadata'] = Metric(
            'untyped',
            'fs_metadata',
            'FS Metadata',
            FS_METADATA
        )
        metrics['mds_metadata'] = Metric(
            'untyped',
            'mds_metadata',
            'MDS Metadata',
            MDS_METADATA
        )
        metrics['mon_metadata'] = Metric(
            'untyped',
            'mon_metadata',
            'MON Metadata',
            MON_METADATA
        )
        metrics['mgr_metadata'] = Metric(
            'gauge',
            'mgr_metadata',
            'MGR metadata',
            MGR_METADATA
        )
        metrics['mgr_status'] = Metric(
            'gauge',
            'mgr_status',
            'MGR status (0=standby, 1=active)',
            MGR_STATUS
        )
        metrics['mgr_module_status'] = Metric(
            'gauge',
            'mgr_module_status',
            'MGR module status (0=disabled, 1=enabled, 2=auto-enabled)',
            MGR_MODULE_STATUS
        )
        metrics['mgr_module_can_run'] = Metric(
            'gauge',
            'mgr_module_can_run',
            'MGR module runnable state i.e. can it run (0=no, 1=yes)',
            MGR_MODULE_CAN_RUN
        )
        metrics['osd_metadata'] = Metric(
            'untyped',
            'osd_metadata',
            'OSD Metadata',
            OSD_METADATA
        )

        # The reason for having this separate to OSD_METADATA is
        # so that we can stably use the same tag names that
        # the Prometheus node_exporter does
        metrics['disk_occupation'] = Metric(
            'untyped',
            'disk_occupation',
            'Associate Ceph daemon with disk used',
            DISK_OCCUPATION
        )

        metrics['pool_metadata'] = Metric(
            'untyped',
            'pool_metadata',
            'POOL Metadata',
            POOL_METADATA
        )

        metrics['rgw_metadata'] = Metric(
            'untyped',
            'rgw_metadata',
            'RGW Metadata',
            RGW_METADATA
        )

        metrics['rbd_mirror_metadata'] = Metric(
            'untyped',
            'rbd_mirror_metadata',
            'RBD Mirror Metadata',
            RBD_MIRROR_METADATA
        )

        metrics['pg_total'] = Metric(
            'gauge',
            'pg_total',
            'PG Total Count'
        )

        metrics['scrape_duration_seconds'] = Metric(
            'gauge',
            'scrape_duration_secs',
            'Time taken to gather metrics from Ceph (secs)'
        )

        for flag in OSD_FLAGS:
            path = 'osd_flag_{}'.format(flag)
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD Flag {}'.format(flag)
            )
        for state in OSD_STATUS:
            path = 'osd_{}'.format(state)
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD status {}'.format(state),
                ('ceph_daemon',)
            )
        for stat in OSD_STATS:
            path = 'osd_{}'.format(stat)
            metrics[path] = Metric(
                'gauge',
                path,
                'OSD stat {}'.format(stat),
                ('ceph_daemon',)
            )
        for stat in OSD_POOL_STATS:
            path = 'pool_{}'.format(stat)
            metrics[path] = Metric(
                'gauge',
                path,
                "OSD POOL STATS: {}".format(stat),
                ('pool_id',)
            )
        for state in PG_STATES:
            path = 'pg_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'PG {}'.format(state),
            )
        for state in DF_CLUSTER:
            path = 'cluster_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'DF {}'.format(state),
            )
        for state in DF_POOL:
            path = 'pool_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'DF pool {}'.format(state),
                ('pool_id',)
            )
        for state in NUM_OBJECTS:
            path = 'num_objects_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'Number of {} objects'.format(state),
            )

        return metrics
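
    # Illustrative only (editor's note): after promethize() prefixes and
    # sanitizes names, the loops above yield families such as
    # ceph_osd_flag_noout, ceph_osd_up, ceph_pg_active and
    # ceph_cluster_total_bytes; the hyphen in 'nodeep-scrub' becomes
    # ceph_osd_flag_nodeep_scrub.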

    def get_health(self):
        health = json.loads(self.get('health')['json'])
        self.metrics['health_status'].set(
            health_status_to_number(health['status'])
        )

    def get_pool_stats(self):
        # retrieve pool stats to provide per pool recovery metrics
        # (osd_pool_stats moved to mgr in Mimic)
        pstats = self.get('osd_pool_stats')
        for pool in pstats['pool_stats']:
            for stat in OSD_POOL_STATS:
                self.metrics['pool_{}'.format(stat)].set(
                    pool['recovery_rate'].get(stat, 0),
                    (pool['pool_id'],)
                )

    def get_df(self):
        # maybe get the to-be-exported metrics from a config?
        df = self.get('df')
        for stat in DF_CLUSTER:
            self.metrics['cluster_{}'.format(stat)].set(df['stats'][stat])

        for pool in df['pools']:
            for stat in DF_POOL:
                self.metrics['pool_{}'.format(stat)].set(
                    pool['stats'][stat],
                    (pool['id'],)
                )

    def get_fs(self):
        fs_map = self.get('fs_map')
        servers = self.get_service_list()
        for fs in fs_map['filesystems']:
            # collect fs metadata
            data_pools = ",".join([str(pool)
                                   for pool in fs['mdsmap']['data_pools']])
            self.metrics['fs_metadata'].set(1, (
                data_pools,
                fs['id'],
                fs['mdsmap']['metadata_pool'],
                fs['mdsmap']['fs_name']
            ))
            self.log.debug('mdsmap: {}'.format(fs['mdsmap']))
            for gid, daemon in fs['mdsmap']['info'].items():
                id_ = daemon['name']
                host_version = servers.get((id_, 'mds'), ('', ''))
                self.metrics['mds_metadata'].set(1, (
                    'mds.{}'.format(id_), fs['id'],
                    host_version[0], daemon['addr'],
                    daemon['rank'], host_version[1]
                ))

    def get_quorum_status(self):
        mon_status = json.loads(self.get('mon_status')['json'])
        servers = self.get_service_list()
        for mon in mon_status['monmap']['mons']:
            rank = mon['rank']
            id_ = mon['name']
            host_version = servers.get((id_, 'mon'), ('', ''))
            self.metrics['mon_metadata'].set(1, (
                'mon.{}'.format(id_), host_version[0],
                mon['public_addr'].split(':')[0], rank,
                host_version[1]
            ))
            in_quorum = int(rank in mon_status['quorum'])
            self.metrics['mon_quorum_status'].set(in_quorum, (
                'mon.{}'.format(id_),
            ))

    def get_mgr_status(self):
        mgr_map = self.get('mgr_map')
        servers = self.get_service_list()

        active = mgr_map['active_name']
        standbys = [s.get('name') for s in mgr_map['standbys']]

        all_mgrs = list(standbys)
        all_mgrs.append(active)

        all_modules = {module.get('name'): module.get('can_run')
                       for module in mgr_map['available_modules']}

        for mgr in all_mgrs:
            host_version = servers.get((mgr, 'mgr'), ('', ''))
            if mgr == active:
                _state = 1
                ceph_release = host_version[1].split()[-2]  # e.g. nautilus
            else:
                _state = 0

            self.metrics['mgr_metadata'].set(1, (
                'mgr.{}'.format(mgr), host_version[0],
                host_version[1]
            ))
            self.metrics['mgr_status'].set(_state, (
                'mgr.{}'.format(mgr),
            ))
        always_on_modules = mgr_map['always_on_modules'][ceph_release]
        active_modules = list(always_on_modules)
        active_modules.extend(mgr_map['modules'])

        for mod_name in all_modules.keys():
            if mod_name in always_on_modules:
                _state = 2
            elif mod_name in active_modules:
                _state = 1
            else:
                _state = 0

            _can_run = 1 if all_modules[mod_name] else 0
            self.metrics['mgr_module_status'].set(_state, (mod_name,))
            self.metrics['mgr_module_can_run'].set(_can_run, (mod_name,))

    def get_pg_status(self):
        # TODO add per pool status?
        pg_status = self.get('pg_status')

        # Set total count of PGs, first
        self.metrics['pg_total'].set(pg_status['num_pgs'])

        reported_states = {}
        for pg in pg_status['pgs_by_state']:
            for state in pg['state_name'].split('+'):
                reported_states[state] = reported_states.get(
                    state, 0) + pg['count']

        for state in reported_states:
            path = 'pg_{}'.format(state)
            try:
                self.metrics[path].set(reported_states[state])
            except KeyError:
                self.log.warn("skipping pg in unknown state {}".format(state))

        for state in PG_STATES:
            if state not in reported_states:
                try:
                    self.metrics['pg_{}'.format(state)].set(0)
                except KeyError:
                    self.log.warn(
                        "skipping pg in unknown state {}".format(state))
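
    # Illustrative only (editor's note): a PG reported as
    # "active+clean+scrubbing" adds its count to each of pg_active,
    # pg_clean and pg_scrubbing, so the per-state series can sum to more
    # than pg_total.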

    def get_osd_stats(self):
        osd_stats = self.get('osd_stats')
        for osd in osd_stats['osd_stats']:
            id_ = osd['osd']
            for stat in OSD_STATS:
                val = osd['perf_stat'][stat]
                self.metrics['osd_{}'.format(stat)].set(val, (
                    'osd.{}'.format(id_),
                ))

    def get_service_list(self):
        ret = {}
        for server in self.list_servers():
            version = server.get('ceph_version', '')
            host = server.get('hostname', '')
            for service in server.get('services', []):
                ret.update({(service['id'], service['type']): (host, version)})
        return ret

    def get_metadata_and_osd_status(self):
        osd_map = self.get('osd_map')
        osd_flags = osd_map['flags'].split(',')
        for flag in OSD_FLAGS:
            self.metrics['osd_flag_{}'.format(flag)].set(
                int(flag in osd_flags)
            )

        osd_devices = self.get('osd_map_crush')['devices']
        servers = self.get_service_list()
        for osd in osd_map['osds']:
            # id can be used to link osd metrics and metadata
            id_ = osd['osd']
            # collect osd metadata
            p_addr = osd['public_addr'].split(':')[0]
            c_addr = osd['cluster_addr'].split(':')[0]
            if p_addr == "-" or c_addr == "-":
                self.log.info(
                    "Missing address metadata for osd {0}, skipping occupation"
                    " and metadata records for this osd".format(id_)
                )
                continue

            dev_class = None
            for osd_device in osd_devices:
                if osd_device['id'] == id_:
                    dev_class = osd_device.get('class', '')
                    break

            if dev_class is None:
                self.log.info(
                    "OSD {0} is missing from CRUSH map, skipping output".format(
                        id_))
                continue

            host_version = servers.get((str(id_), 'osd'), ('', ''))

            # collect disk occupation metadata
            osd_metadata = self.get_metadata("osd", str(id_))
            if osd_metadata is None:
                continue

            obj_store = osd_metadata.get('osd_objectstore', '')
            f_iface = osd_metadata.get('front_iface', '')
            b_iface = osd_metadata.get('back_iface', '')

            self.metrics['osd_metadata'].set(1, (
                b_iface,
                'osd.{}'.format(id_),
                c_addr,
                dev_class,
                f_iface,
                host_version[0],
                obj_store,
                p_addr,
                host_version[1]
            ))

            # collect osd status
            for state in OSD_STATUS:
                status = osd[state]
                self.metrics['osd_{}'.format(state)].set(status, (
                    'osd.{}'.format(id_),
                ))

            osd_dev_node = None
            osd_wal_dev_node = ''
            osd_db_dev_node = ''
            if obj_store == "filestore":
                # collect filestore backend device
                osd_dev_node = osd_metadata.get(
                    'backend_filestore_dev_node', None)
                # collect filestore journal device
                osd_wal_dev_node = osd_metadata.get('osd_journal', '')
            elif obj_store == "bluestore":
                # collect bluestore backend device
                osd_dev_node = osd_metadata.get(
                    'bluestore_bdev_dev_node', None)
                # collect bluestore wal backend
                osd_wal_dev_node = osd_metadata.get('bluefs_wal_dev_node', '')
                # collect bluestore db backend
                osd_db_dev_node = osd_metadata.get('bluefs_db_dev_node', '')
            if osd_dev_node and osd_dev_node == "unknown":
                osd_dev_node = None

            osd_hostname = osd_metadata.get('hostname', None)
            if osd_dev_node and osd_hostname:
                self.log.debug("Got dev for osd {0}: {1}/{2}".format(
                    id_, osd_hostname, osd_dev_node))
                self.metrics['disk_occupation'].set(1, (
                    "osd.{0}".format(id_),
                    osd_dev_node,
                    osd_db_dev_node,
                    osd_wal_dev_node,
                    osd_hostname
                ))
            else:
                self.log.info("Missing dev node metadata for osd {0}, skipping "
                              "occupation record for this osd".format(id_))

        for pool in osd_map['pools']:
            self.metrics['pool_metadata'].set(
                1, (pool['pool'], pool['pool_name']))

        # Populate other servers metadata
        for key, value in servers.items():
            service_id, service_type = key
            if service_type == 'rgw':
                hostname, version = value
                self.metrics['rgw_metadata'].set(
                    1,
                    ('{}.{}'.format(service_type, service_id), hostname, version)
                )
            elif service_type == 'rbd-mirror':
                mirror_metadata = self.get_metadata('rbd-mirror', service_id)
                if mirror_metadata is None:
                    continue
                mirror_metadata['ceph_daemon'] = '{}.{}'.format(service_type,
                                                                service_id)
                self.metrics['rbd_mirror_metadata'].set(
                    1, tuple(mirror_metadata.get(k, '')
                             for k in RBD_MIRROR_METADATA)
                )
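
    # Illustrative PromQL (editor's example; the node_exporter metric name
    # is an assumption): disk_occupation shares the 'device' and 'instance'
    # label names with node_exporter so the two can be joined, e.g.
    #
    #   rate(node_disk_written_bytes_total[5m])
    #     * on (device, instance) group_left (ceph_daemon)
    #     ceph_disk_occupation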

    def get_num_objects(self):
        pg_sum = self.get('pg_summary')['pg_stats_sum']['stat_sum']
        for obj in NUM_OBJECTS:
            stat = 'num_objects_{}'.format(obj)
            self.metrics[stat].set(pg_sum[stat])

    def get_rbd_stats(self):
        # Per RBD image stats is collected by registering a dynamic osd perf
        # stats query that tells OSDs to group stats for requests associated
        # with RBD objects by pool, namespace, and image id, which are
        # extracted from the request object names or other attributes.
        # The RBD object names have the following prefixes:
        #   - rbd_data.{image_id}. (data stored in the same pool as metadata)
        #   - rbd_data.{pool_id}.{image_id}. (data stored in a dedicated data pool)
        #   - journal_data.{pool_id}.{image_id}. (journal if journaling is enabled)
        # The pool_id in the object name is the id of the pool with the image
        # metadata, and should be used in the image spec. If there is no pool_id
        # in the object name, the image pool is the pool where the object is
        # located.
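
        # Illustrative example (editor's note, based on the regex registered
        # below): an object named "rbd_data.5.1234abcd.0000000000000001"
        # yields metadata pool id 5 and image id "1234abcd" from its name,
        # while "rbd_data.1234abcd.0000000000000001" carries no pool id, so
        # the id of the pool holding the object is used instead.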

        # Parse rbd_stats_pools option, which is a comma or space separated
        # list of pool[/namespace] entries. If no namespace is specified the
        # stats are collected for every namespace in the pool.
        pools_string = self.get_localized_module_option('rbd_stats_pools', '')
        pools = {}
        for p in [x for x in re.split(r'[\s,]+', pools_string) if x]:
            s = p.split('/', 2)
            pool_name = s[0]
            if len(s) == 1:
                # empty set means collect for all namespaces
                pools[pool_name] = set()
                continue
            if pool_name not in pools:
                pools[pool_name] = set()
            elif not pools[pool_name]:
                continue
            pools[pool_name].add(s[1])
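
        # Illustrative option values (editor's example): "rbd" collects for
        # every namespace in pool rbd; "rbd/ns1 rbd/ns2" restricts
        # collection to two namespaces; "pool1,pool2" covers two whole pools.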

        rbd_stats_pools = {}
        for pool_id in list(self.rbd_stats['pools']):
            name = self.rbd_stats['pools'][pool_id]['name']
            if name not in pools:
                del self.rbd_stats['pools'][pool_id]
            else:
                rbd_stats_pools[name] = \
                    self.rbd_stats['pools'][pool_id]['ns_names']

        pools_refreshed = False
        if pools:
            next_refresh = self.rbd_stats['pools_refresh_time'] + \
                self.get_localized_module_option(
                    'rbd_stats_pools_refresh_interval', 300)
            if rbd_stats_pools != pools or time.time() >= next_refresh:
                self.refresh_rbd_stats_pools(pools)
                pools_refreshed = True

        pool_ids = list(self.rbd_stats['pools'])
        pool_ids.sort()
        pool_id_regex = '^(' + '|'.join([str(x) for x in pool_ids]) + ')$'

        nspace_names = []
        for pool_id, pool in self.rbd_stats['pools'].items():
            if pool['ns_names']:
                nspace_names.extend(pool['ns_names'])
            else:
                nspace_names = []
                break
        if nspace_names:
            namespace_regex = '^(' + \
                              "|".join([re.escape(x)
                                        for x in set(nspace_names)]) + ')$'
        else:
            namespace_regex = '^(.*)$'

        if 'query' in self.rbd_stats and \
           (pool_id_regex != self.rbd_stats['query']['key_descriptor'][0]['regex'] or
                namespace_regex != self.rbd_stats['query']['key_descriptor'][1]['regex']):
            self.remove_osd_perf_query(self.rbd_stats['query_id'])
            del self.rbd_stats['query_id']
            del self.rbd_stats['query']

        if not self.rbd_stats['pools']:
            return

        counters_info = self.rbd_stats['counters_info']

        if 'query_id' not in self.rbd_stats:
            query = {
                'key_descriptor': [
                    {'type': 'pool_id', 'regex': pool_id_regex},
                    {'type': 'namespace', 'regex': namespace_regex},
                    {'type': 'object_name',
                     'regex': r'^(?:rbd|journal)_data\.(?:([0-9]+)\.)?([^.]+)\.'},
                ],
                'performance_counter_descriptors': list(counters_info),
            }
            query_id = self.add_osd_perf_query(query)
            if query_id is None:
                self.log.error('failed to add query %s' % query)
                return
            self.rbd_stats['query'] = query
            self.rbd_stats['query_id'] = query_id

        res = self.get_osd_perf_counters(self.rbd_stats['query_id'])
        for c in res['counters']:
            # if the pool id is not found in the object name use id of the
            # pool where the object is located
            if c['k'][2][0]:
                pool_id = int(c['k'][2][0])
            else:
                pool_id = int(c['k'][0][0])
            if pool_id not in self.rbd_stats['pools'] and not pools_refreshed:
                self.refresh_rbd_stats_pools(pools)
                pools_refreshed = True
            if pool_id not in self.rbd_stats['pools']:
                continue
            pool = self.rbd_stats['pools'][pool_id]
            nspace_name = c['k'][1][0]
            if nspace_name not in pool['images']:
                continue
            image_id = c['k'][2][1]
            if image_id not in pool['images'][nspace_name] and \
               not pools_refreshed:
                self.refresh_rbd_stats_pools(pools)
                pool = self.rbd_stats['pools'][pool_id]
                pools_refreshed = True
            if image_id not in pool['images'][nspace_name]:
                continue
            counters = pool['images'][nspace_name][image_id]['c']
            for i in range(len(c['c'])):
                counters[i][0] += c['c'][i][0]
                counters[i][1] += c['c'][i][1]

        label_names = ("pool", "namespace", "image")
        for pool_id, pool in self.rbd_stats['pools'].items():
            pool_name = pool['name']
            for nspace_name, images in pool['images'].items():
                for image_id in images:
                    image_name = images[image_id]['n']
                    counters = images[image_id]['c']
                    i = 0
                    for key in counters_info:
                        counter_info = counters_info[key]
                        stattype = self._stattype_to_str(counter_info['type'])
                        labels = (pool_name, nspace_name, image_name)
                        if counter_info['type'] == self.PERFCOUNTER_COUNTER:
                            path = 'rbd_' + key
                            if path not in self.metrics:
                                self.metrics[path] = Metric(
                                    stattype,
                                    path,
                                    counter_info['desc'],
                                    label_names,
                                )
                            self.metrics[path].set(counters[i][0], labels)
                        elif counter_info['type'] == self.PERFCOUNTER_LONGRUNAVG:
                            path = 'rbd_' + key + '_sum'
                            if path not in self.metrics:
                                self.metrics[path] = Metric(
                                    stattype,
                                    path,
                                    counter_info['desc'] + ' Total',
                                    label_names,
                                )
                            self.metrics[path].set(counters[i][0], labels)
                            path = 'rbd_' + key + '_count'
                            if path not in self.metrics:
                                self.metrics[path] = Metric(
                                    'counter',
                                    path,
                                    counter_info['desc'] + ' Count',
                                    label_names,
                                )
                            self.metrics[path].set(counters[i][1], labels)
                        i += 1

    def refresh_rbd_stats_pools(self, pools):
        self.log.debug('refreshing rbd pools %s' % (pools))

        rbd = RBD()
        counters_info = self.rbd_stats['counters_info']
        for pool_name, cfg_ns_names in pools.items():
            try:
                pool_id = self.rados.pool_lookup(pool_name)
                with self.rados.open_ioctx(pool_name) as ioctx:
                    if pool_id not in self.rbd_stats['pools']:
                        self.rbd_stats['pools'][pool_id] = {'images': {}}
                    pool = self.rbd_stats['pools'][pool_id]
                    pool['name'] = pool_name
                    pool['ns_names'] = cfg_ns_names
                    if cfg_ns_names:
                        nspace_names = list(cfg_ns_names)
                    else:
                        nspace_names = [''] + rbd.namespace_list(ioctx)
                    # iterate over a copy so deletion is safe
                    for nspace_name in list(pool['images']):
                        if nspace_name not in nspace_names:
                            del pool['images'][nspace_name]
                    for nspace_name in nspace_names:
                        if (nspace_name and
                                not rbd.namespace_exists(ioctx, nspace_name)):
                            self.log.debug('unknown namespace %s for pool %s' %
                                           (nspace_name, pool_name))
                            continue
                        ioctx.set_namespace(nspace_name)
                        if nspace_name not in pool['images']:
                            pool['images'][nspace_name] = {}
                        namespace = pool['images'][nspace_name]
                        images = {}
                        for image_meta in RBD().list2(ioctx):
                            image = {'n': image_meta['name']}
                            image_id = image_meta['id']
                            if image_id in namespace:
                                image['c'] = namespace[image_id]['c']
                            else:
                                image['c'] = [[0, 0] for x in counters_info]
                            images[image_id] = image
                        pool['images'][nspace_name] = images
            except Exception as e:
                self.log.error('failed listing pool %s: %s' % (pool_name, e))
        self.rbd_stats['pools_refresh_time'] = time.time()

    def shutdown_rbd_stats(self):
        if 'query_id' in self.rbd_stats:
            self.remove_osd_perf_query(self.rbd_stats['query_id'])
            del self.rbd_stats['query_id']
            del self.rbd_stats['query']
        self.rbd_stats['pools'].clear()

    def collect(self):
        # Clear the metrics before scraping
        for k in self.metrics.keys():
            self.metrics[k].clear()

        _start_time = time.time()

        self.get_health()
        self.get_df()
        self.get_pool_stats()
        self.get_fs()
        self.get_osd_stats()
        self.get_quorum_status()
        self.get_mgr_status()
        self.get_metadata_and_osd_status()
        self.get_pg_status()
        self.get_num_objects()
        self.get_rbd_stats()

        for daemon, counters in self.get_all_perf_counters().items():
            for path, counter_info in counters.items():
                # Skip histograms, they are represented by long running avgs
                stattype = self._stattype_to_str(counter_info['type'])
                if not stattype or stattype == 'histogram':
                    self.log.debug('ignoring %s, type %s' % (path, stattype))
                    continue

                path, label_names, labels = self._perfpath_to_path_labels(
                    daemon, path)

                # Get the value of the counter
                value = self._perfvalue_to_value(
                    counter_info['type'], counter_info['value'])

                # Represent the long running avgs as sum/count pairs
                if counter_info['type'] & self.PERFCOUNTER_LONGRUNAVG:
                    _path = path + '_sum'
                    if _path not in self.metrics:
                        self.metrics[_path] = Metric(
                            stattype,
                            _path,
                            counter_info['description'] + ' Total',
                            label_names,
                        )
                    self.metrics[_path].set(value, labels)

                    _path = path + '_count'
                    if _path not in self.metrics:
                        self.metrics[_path] = Metric(
                            'counter',
                            _path,
                            counter_info['description'] + ' Count',
                            label_names,
                        )
                    self.metrics[_path].set(counter_info['count'], labels)
                else:
                    if path not in self.metrics:
                        self.metrics[path] = Metric(
                            stattype,
                            path,
                            counter_info['description'],
                            label_names,
                        )
                    self.metrics[path].set(value, labels)

        _end_time = time.time()
        self.metrics['scrape_duration_seconds'].set(_end_time - _start_time)

        # Return formatted metrics and clear no longer used data
        _metrics = [m.str_expfmt() for m in self.metrics.values()]
        for k in self.metrics.keys():
            self.metrics[k].clear()

        return ''.join(_metrics) + '\n'

    def get_file_sd_config(self):
        servers = self.list_servers()
        targets = []
        for server in servers:
            hostname = server.get('hostname', '')
            for service in server.get('services', []):
                if service['type'] != 'mgr':
                    continue
                id_ = service['id']
                # get port for prometheus module at mgr with id_
                # TODO use get_config_prefix or get_config here once
                # https://github.com/ceph/ceph/pull/20458 is merged
                result = CommandResult("")
                global_instance().send_command(
                    result, "mon", '',
                    json.dumps({
                        "prefix": "config-key get",
                        'key': "config/mgr/mgr/prometheus/{}/server_port".format(id_),
                    }),
                    "")
                r, outb, outs = result.wait()
                if r != 0:
                    global_instance().log.error("Failed to retrieve port for mgr {}: {}".format(id_, outs))
                    targets.append('{}:{}'.format(hostname, DEFAULT_PORT))
                else:
                    port = json.loads(outb)
                    targets.append('{}:{}'.format(hostname, port))

        ret = [
            {
                "targets": targets,
                "labels": {}
            }
        ]
        return 0, json.dumps(ret), ""

    def self_test(self):
        self.collect()
        self.get_file_sd_config()

    def handle_command(self, inbuf, cmd):
        if cmd['prefix'] == 'prometheus file_sd_config':
            return self.get_file_sd_config()
        else:
            return (-errno.EINVAL, '',
                    "Command not found '{0}'".format(cmd['prefix']))

    def serve(self):

        class Root(object):

            # collapse everything to '/'
            def _cp_dispatch(self, vpath):
                cherrypy.request.path = ''
                return self

            @cherrypy.expose
            def index(self):
                return '''<!DOCTYPE html>
<html>
    <head><title>Ceph Exporter</title></head>
    <body>
        <h1>Ceph Exporter</h1>
        <p><a href='/metrics'>Metrics</a></p>
    </body>
</html>'''

            @cherrypy.expose
            def metrics(self):
                instance = global_instance()
                # Lock the function execution
                try:
                    instance.collect_lock.acquire()
                    return self._metrics(instance)
                finally:
                    instance.collect_lock.release()

            @staticmethod
            def _metrics(instance):
                # Return cached data if available and collected before the
                # cache times out
                if instance.collect_cache and \
                   time.time() - instance.collect_time < instance.collect_timeout:
                    cherrypy.response.headers['Content-Type'] = 'text/plain'
                    return instance.collect_cache

                if instance.have_mon_connection():
                    instance.collect_cache = None
                    instance.collect_time = time.time()
                    instance.collect_cache = instance.collect()
                    cherrypy.response.headers['Content-Type'] = 'text/plain'
                    return instance.collect_cache
                else:
                    raise cherrypy.HTTPError(503, 'No MON connection')
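
        # Illustrative only (editor's note): with the default
        # collect_timeout of 5 seconds, two scrapes arriving within that
        # window are served the same cached payload; the 'scrape_interval'
        # option below aligns the cache window with Prometheus' own
        # scrape interval.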

        # Make the cache timeout for collecting configurable
        self.collect_timeout = self.get_localized_module_option(
            'scrape_interval', 5.0)

        server_addr = self.get_localized_module_option(
            'server_addr', get_default_addr())
        server_port = self.get_localized_module_option(
            'server_port', DEFAULT_PORT)
        self.log.info(
            "server_addr: %s server_port: %s" %
            (server_addr, server_port)
        )

        # Publish the URI that others may use to access the service we're
        # about to start serving
        self.set_uri('http://{0}:{1}/'.format(
            socket.getfqdn() if server_addr == '::' else server_addr,
            server_port
        ))

        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })
        cherrypy.tree.mount(Root(), "/")
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info('Engine started.')
        # wait for the shutdown event
        self.shutdown_event.wait()
        self.shutdown_event.clear()
        cherrypy.engine.stop()
        self.log.info('Engine stopped.')
        self.shutdown_rbd_stats()

    def shutdown(self):
        self.log.info('Stopping engine...')
        self.shutdown_event.set()


class StandbyModule(MgrStandbyModule):
    def __init__(self, *args, **kwargs):
        super(StandbyModule, self).__init__(*args, **kwargs)
        self.shutdown_event = threading.Event()

    def serve(self):
        server_addr = self.get_localized_module_option(
            'server_addr', get_default_addr())
        server_port = self.get_localized_module_option(
            'server_port', DEFAULT_PORT)
        self.log.info("server_addr: %s server_port: %s" %
                      (server_addr, server_port))
        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })

        module = self

        class Root(object):
            @cherrypy.expose
            def index(self):
                active_uri = module.get_active_uri()
                return '''<!DOCTYPE html>
<html>
    <head><title>Ceph Exporter</title></head>
    <body>
        <h1>Ceph Exporter</h1>
        <p><a href='{}metrics'>Metrics</a></p>
    </body>
</html>'''.format(active_uri)

            @cherrypy.expose
            def metrics(self):
                cherrypy.response.headers['Content-Type'] = 'text/plain'
                return ''

        cherrypy.tree.mount(Root(), '/', {})
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info('Engine started.')
        # Wait for shutdown event
        self.shutdown_event.wait()
        self.shutdown_event.clear()
        cherrypy.engine.stop()
        self.log.info('Engine stopped.')

    def shutdown(self):
        self.log.info("Stopping engine...")
        self.shutdown_event.set()
        self.log.info("Stopped engine")