]>
Commit | Line | Data |
---|---|---|
c07f9fc5 | 1 | import cherrypy |
a8e16298 | 2 | from distutils.version import StrictVersion |
3efd9988 FG |
3 | import json |
4 | import errno | |
c07f9fc5 FG |
5 | import math |
6 | import os | |
11fdf7f2 | 7 | import re |
94b18763 | 8 | import socket |
91327a77 AA |
9 | import threading |
10 | import time | |
11fdf7f2 | 11 | from mgr_module import MgrModule, MgrStandbyModule, CommandResult, PG_STATES |
494da23a | 12 | from mgr_util import get_default_addr |
11fdf7f2 | 13 | from rbd import RBD |
c07f9fc5 FG |
14 | |
# Defaults for the Prometheus HTTP server. Can also set in config-key
# see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
# for Prometheus exporter port registry

DEFAULT_PORT = 9283

# When the CherryPy server in 3.2.2 (and later) starts it attempts to verify
# that the ports its listening on are in fact bound. When using the any address
# "::" it tries both ipv4 and ipv6, and in some environments (e.g. kubernetes)
# ipv6 isn't yet configured / supported and CherryPy throws an uncaught
# exception.
if cherrypy is not None:
    v = StrictVersion(cherrypy.__version__)
    # the issue was fixed in 3.2.3. it's present in 3.2.2 (current version on
    # centos:7) and back to at least 3.0.0.
    if StrictVersion("3.1.2") <= v < StrictVersion("3.2.3"):
        # https://github.com/cherrypy/cherrypy/issues/1100
        # Neutralise cherrypy's own port-bound verification; the server
        # still binds normally, we only skip the post-start check.
        from cherrypy.process import servers
        servers.wait_for_occupied_port = lambda host, port: None
c07f9fc5 FG |
34 | |
# cherrypy likes to sys.exit on error. don't let it take us down too!
def os_exit_noop(*args, **kwargs):
    """Do nothing; installed over os._exit so cherrypy errors cannot
    terminate the whole mgr process."""
    return None


os._exit = os_exit_noop
41 | ||
c07f9fc5 FG |
42 | # to access things in class Module from subclass Root. Because |
43 | # it's a dict, the writer doesn't need to declare 'global' for access | |
44 | ||
45 | _global_instance = {'plugin': None} | |
46 | ||
47 | ||
48 | def global_instance(): | |
49 | assert _global_instance['plugin'] is not None | |
50 | return _global_instance['plugin'] | |
51 | ||
52 | ||
def health_status_to_number(status):
    """Map a Ceph health status string to its numeric metric value.

    HEALTH_OK -> 0, HEALTH_WARN -> 1, HEALTH_ERR -> 2; any other
    string yields None (matching the original fall-through behavior).
    """
    return {
        'HEALTH_OK': 0,
        'HEALTH_WARN': 1,
        'HEALTH_ERR': 2,
    }.get(status)
c07f9fc5 | 60 | |
11fdf7f2 TL |
61 | |
# Cluster-wide df fields exported as ceph_cluster_* gauges.
DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_used_raw_bytes']

# Per-pool df fields exported as ceph_pool_* gauges.
DF_POOL = ['max_avail', 'stored', 'stored_raw', 'objects', 'dirty',
           'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes']

# Per-pool recovery counters read from 'osd_pool_stats'.
# Bug fix: the tuple previously listed 'num_bytes_recovered' twice and
# omitted 'num_keys_recovered'.
OSD_POOL_STATS = ('recovering_objects_per_sec', 'recovering_bytes_per_sec',
                  'recovering_keys_per_sec', 'num_objects_recovered',
                  'num_bytes_recovered', 'num_keys_recovered')

# Cluster OSD flags, each exported as ceph_osd_flag_<flag>.
OSD_FLAGS = ('noup', 'nodown', 'noout', 'noin', 'nobackfill', 'norebalance',
             'norecover', 'noscrub', 'nodeep-scrub')

# Label names for the *_metadata / status metric families below.
FS_METADATA = ('data_pools', 'fs_id', 'metadata_pool', 'name')

MDS_METADATA = ('ceph_daemon', 'fs_id', 'hostname', 'public_addr', 'rank',
                'ceph_version')

MON_METADATA = ('ceph_daemon', 'hostname',
                'public_addr', 'rank', 'ceph_version')

MGR_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

MGR_STATUS = ('ceph_daemon',)

MGR_MODULE_STATUS = ('name',)

MGR_MODULE_CAN_RUN = ('name',)

OSD_METADATA = ('back_iface', 'ceph_daemon', 'cluster_addr', 'device_class',
                'front_iface', 'hostname', 'objectstore', 'public_addr',
                'ceph_version')

OSD_STATUS = ['weight', 'up', 'in']

OSD_STATS = ['apply_latency_ms', 'commit_latency_ms']

POOL_METADATA = ('pool_id', 'name')

RGW_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

RBD_MIRROR_METADATA = ('ceph_daemon', 'id', 'instance_id', 'hostname',
                       'ceph_version')

# Label names chosen to line up with the Prometheus node_exporter.
DISK_OCCUPATION = ('ceph_daemon', 'device', 'db_device',
                   'wal_device', 'instance')

NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']
c07f9fc5 | 109 | |
c07f9fc5 | 110 | |
91327a77 AA |
class Metric(object):
    """One Prometheus metric family: a type, a sanitized name, a help
    string, optional label names, and the current set of samples keyed
    by their label-value tuples."""

    def __init__(self, mtype, name, desc, labels=None):
        self.mtype = mtype
        self.name = name
        self.desc = desc
        self.labelnames = labels  # tuple if present
        self.value = {}  # label-value tuple -> sample value

    def clear(self):
        """Forget all recorded samples."""
        self.value = {}

    def set(self, value, labelvalues=None):
        """Record one sample; labelvalues must be a tuple."""
        self.value[labelvalues or ('',)] = value

    def str_expfmt(self):
        """Render this metric in the Prometheus text exposition format
        (leading newline, then # HELP, # TYPE, and one line per sample)."""

        def sanitize(path):
            # replace characters that are illegal in Prometheus metric names
            out = re.sub(r'[./\s]|::', '_', path).replace('+', '_plus')
            # hyphens become underscores, except a trailing one -> "_minus"
            if out.endswith("-"):
                out = out[:-1] + "_minus"
            else:
                out = out.replace("-", "_")
            return "ceph_{0}".format(out)

        def go_float(value):
            # Go-compatible float rendering for the exposition format
            value = float(value)
            if value == float('inf'):
                return '+Inf'
            if value == float('-inf'):
                return '-Inf'
            if math.isnan(value):
                return 'NaN'
            return repr(value)

        name = sanitize(self.name)
        out = [
            '\n# HELP {name} {desc}'.format(name=name, desc=self.desc),
            '\n# TYPE {name} {mtype}'.format(name=name, mtype=self.mtype),
        ]
        for labelvalues, value in self.value.items():
            labelstr = ''
            if self.labelnames:
                labelstr = ','.join(
                    '%s="%s"' % kv
                    for kv in zip(self.labelnames, labelvalues))
            if labelstr:
                out.append('\n{0}{{{1}}} {2}'.format(
                    name, labelstr, go_float(value)))
            else:
                out.append('\n{0} {1}'.format(name, go_float(value)))
        return ''.join(out)
177 | ||
178 | ||
179 | class Module(MgrModule): | |
    # Commands this module registers with the mgr; 'prometheus
    # file_sd_config' returns a file_sd-compatible scrape-target list.
    COMMANDS = [
        {
            "cmd": "prometheus file_sd_config",
            "desc": "Return file_sd compatible prometheus config for mgr cluster",
            "perm": "r"
        },
    ]

    # Module options; the rbd_stats_* options are read below via
    # get_localized_module_option().  server_addr/server_port presumably
    # configure the HTTP listener -- the serving code is outside this view.
    MODULE_OPTIONS = [
        {'name': 'server_addr'},
        {'name': 'server_port'},
        {'name': 'scrape_interval'},
        {'name': 'rbd_stats_pools'},
        {'name': 'rbd_stats_pools_refresh_interval'},
    ]
195 | ||
196 | def __init__(self, *args, **kwargs): | |
197 | super(Module, self).__init__(*args, **kwargs) | |
198 | self.metrics = self._setup_static_metrics() | |
199 | self.shutdown_event = threading.Event() | |
200 | self.collect_lock = threading.RLock() | |
201 | self.collect_time = 0 | |
202 | self.collect_timeout = 5.0 | |
203 | self.collect_cache = None | |
11fdf7f2 TL |
204 | self.rbd_stats = { |
205 | 'pools': {}, | |
206 | 'pools_refresh_time': 0, | |
207 | 'counters_info': { | |
208 | 'write_ops': {'type': self.PERFCOUNTER_COUNTER, | |
209 | 'desc': 'RBD image writes count'}, | |
210 | 'read_ops': {'type': self.PERFCOUNTER_COUNTER, | |
211 | 'desc': 'RBD image reads count'}, | |
212 | 'write_bytes': {'type': self.PERFCOUNTER_COUNTER, | |
213 | 'desc': 'RBD image bytes written'}, | |
214 | 'read_bytes': {'type': self.PERFCOUNTER_COUNTER, | |
215 | 'desc': 'RBD image bytes read'}, | |
216 | 'write_latency': {'type': self.PERFCOUNTER_LONGRUNAVG, | |
217 | 'desc': 'RBD image writes latency (msec)'}, | |
218 | 'read_latency': {'type': self.PERFCOUNTER_LONGRUNAVG, | |
219 | 'desc': 'RBD image reads latency (msec)'}, | |
220 | }, | |
221 | } | |
91327a77 | 222 | _global_instance['plugin'] = self |
3efd9988 FG |
223 | |
224 | def _setup_static_metrics(self): | |
225 | metrics = {} | |
226 | metrics['health_status'] = Metric( | |
227 | 'untyped', | |
228 | 'health_status', | |
229 | 'Cluster health status' | |
230 | ) | |
94b18763 | 231 | metrics['mon_quorum_status'] = Metric( |
3efd9988 | 232 | 'gauge', |
94b18763 FG |
233 | 'mon_quorum_status', |
234 | 'Monitors in quorum', | |
235 | ('ceph_daemon',) | |
236 | ) | |
237 | metrics['fs_metadata'] = Metric( | |
238 | 'untyped', | |
239 | 'fs_metadata', | |
240 | 'FS Metadata', | |
241 | FS_METADATA | |
242 | ) | |
243 | metrics['mds_metadata'] = Metric( | |
244 | 'untyped', | |
245 | 'mds_metadata', | |
246 | 'MDS Metadata', | |
247 | MDS_METADATA | |
248 | ) | |
249 | metrics['mon_metadata'] = Metric( | |
250 | 'untyped', | |
251 | 'mon_metadata', | |
252 | 'MON Metadata', | |
253 | MON_METADATA | |
3efd9988 | 254 | ) |
494da23a TL |
255 | metrics['mgr_metadata'] = Metric( |
256 | 'gauge', | |
257 | 'mgr_metadata', | |
258 | 'MGR metadata', | |
259 | MGR_METADATA | |
260 | ) | |
261 | metrics['mgr_status'] = Metric( | |
262 | 'gauge', | |
263 | 'mgr_status', | |
264 | 'MGR status (0=standby, 1=active)', | |
265 | MGR_STATUS | |
266 | ) | |
267 | metrics['mgr_module_status'] = Metric( | |
268 | 'gauge', | |
269 | 'mgr_module_status', | |
270 | 'MGR module status (0=disabled, 1=enabled, 2=auto-enabled)', | |
271 | MGR_MODULE_STATUS | |
272 | ) | |
273 | metrics['mgr_module_can_run'] = Metric( | |
274 | 'gauge', | |
275 | 'mgr_module_can_run', | |
276 | 'MGR module runnable state i.e. can it run (0=no, 1=yes)', | |
277 | MGR_MODULE_CAN_RUN | |
278 | ) | |
3efd9988 FG |
279 | metrics['osd_metadata'] = Metric( |
280 | 'untyped', | |
281 | 'osd_metadata', | |
282 | 'OSD Metadata', | |
283 | OSD_METADATA | |
284 | ) | |
c07f9fc5 | 285 | |
3efd9988 FG |
286 | # The reason for having this separate to OSD_METADATA is |
287 | # so that we can stably use the same tag names that | |
288 | # the Prometheus node_exporter does | |
289 | metrics['disk_occupation'] = Metric( | |
b32b8144 | 290 | 'untyped', |
3efd9988 FG |
291 | 'disk_occupation', |
292 | 'Associate Ceph daemon with disk used', | |
293 | DISK_OCCUPATION | |
294 | ) | |
c07f9fc5 | 295 | |
3efd9988 FG |
296 | metrics['pool_metadata'] = Metric( |
297 | 'untyped', | |
298 | 'pool_metadata', | |
299 | 'POOL Metadata', | |
300 | POOL_METADATA | |
301 | ) | |
94b18763 FG |
302 | |
303 | metrics['rgw_metadata'] = Metric( | |
304 | 'untyped', | |
305 | 'rgw_metadata', | |
306 | 'RGW Metadata', | |
307 | RGW_METADATA | |
308 | ) | |
309 | ||
11fdf7f2 TL |
310 | metrics['rbd_mirror_metadata'] = Metric( |
311 | 'untyped', | |
312 | 'rbd_mirror_metadata', | |
313 | 'RBD Mirror Metadata', | |
314 | RBD_MIRROR_METADATA | |
315 | ) | |
316 | ||
94b18763 FG |
317 | metrics['pg_total'] = Metric( |
318 | 'gauge', | |
319 | 'pg_total', | |
92f5a8d4 TL |
320 | 'PG Total Count per Pool', |
321 | ('pool_id',) | |
94b18763 FG |
322 | ) |
323 | ||
11fdf7f2 TL |
324 | metrics['scrape_duration_seconds'] = Metric( |
325 | 'gauge', | |
326 | 'scrape_duration_secs', | |
327 | 'Time taken to gather metrics from Ceph (secs)' | |
328 | ) | |
329 | ||
94b18763 FG |
330 | for flag in OSD_FLAGS: |
331 | path = 'osd_flag_{}'.format(flag) | |
332 | metrics[path] = Metric( | |
333 | 'untyped', | |
334 | path, | |
335 | 'OSD Flag {}'.format(flag) | |
336 | ) | |
3efd9988 FG |
337 | for state in OSD_STATUS: |
338 | path = 'osd_{}'.format(state) | |
3efd9988 FG |
339 | metrics[path] = Metric( |
340 | 'untyped', | |
c07f9fc5 | 341 | path, |
3efd9988 FG |
342 | 'OSD status {}'.format(state), |
343 | ('ceph_daemon',) | |
c07f9fc5 | 344 | ) |
b32b8144 FG |
345 | for stat in OSD_STATS: |
346 | path = 'osd_{}'.format(stat) | |
b32b8144 FG |
347 | metrics[path] = Metric( |
348 | 'gauge', | |
349 | path, | |
350 | 'OSD stat {}'.format(stat), | |
351 | ('ceph_daemon',) | |
352 | ) | |
11fdf7f2 TL |
353 | for stat in OSD_POOL_STATS: |
354 | path = 'pool_{}'.format(stat) | |
355 | metrics[path] = Metric( | |
356 | 'gauge', | |
357 | path, | |
358 | "OSD POOL STATS: {}".format(stat), | |
359 | ('pool_id',) | |
360 | ) | |
3efd9988 FG |
361 | for state in PG_STATES: |
362 | path = 'pg_{}'.format(state) | |
3efd9988 FG |
363 | metrics[path] = Metric( |
364 | 'gauge', | |
365 | path, | |
92f5a8d4 TL |
366 | 'PG {} per pool'.format(state), |
367 | ('pool_id',) | |
3efd9988 FG |
368 | ) |
369 | for state in DF_CLUSTER: | |
370 | path = 'cluster_{}'.format(state) | |
3efd9988 FG |
371 | metrics[path] = Metric( |
372 | 'gauge', | |
373 | path, | |
374 | 'DF {}'.format(state), | |
375 | ) | |
376 | for state in DF_POOL: | |
377 | path = 'pool_{}'.format(state) | |
3efd9988 FG |
378 | metrics[path] = Metric( |
379 | 'gauge', | |
380 | path, | |
381 | 'DF pool {}'.format(state), | |
382 | ('pool_id',) | |
383 | ) | |
28e407b8 AA |
384 | for state in NUM_OBJECTS: |
385 | path = 'num_objects_{}'.format(state) | |
386 | metrics[path] = Metric( | |
387 | 'gauge', | |
388 | path, | |
389 | 'Number of {} objects'.format(state), | |
390 | ) | |
3efd9988 FG |
391 | |
392 | return metrics | |
c07f9fc5 | 393 | |
3efd9988 FG |
394 | def get_health(self): |
395 | health = json.loads(self.get('health')['json']) | |
91327a77 AA |
396 | self.metrics['health_status'].set( |
397 | health_status_to_number(health['status']) | |
c07f9fc5 FG |
398 | ) |
399 | ||
11fdf7f2 TL |
400 | def get_pool_stats(self): |
401 | # retrieve pool stats to provide per pool recovery metrics | |
402 | # (osd_pool_stats moved to mgr in Mimic) | |
403 | pstats = self.get('osd_pool_stats') | |
404 | for pool in pstats['pool_stats']: | |
405 | for stat in OSD_POOL_STATS: | |
406 | self.metrics['pool_{}'.format(stat)].set( | |
407 | pool['recovery_rate'].get(stat, 0), | |
408 | (pool['pool_id'],) | |
409 | ) | |
410 | ||
3efd9988 FG |
411 | def get_df(self): |
412 | # maybe get the to-be-exported metrics from a config? | |
413 | df = self.get('df') | |
414 | for stat in DF_CLUSTER: | |
91327a77 | 415 | self.metrics['cluster_{}'.format(stat)].set(df['stats'][stat]) |
3efd9988 FG |
416 | |
417 | for pool in df['pools']: | |
418 | for stat in DF_POOL: | |
91327a77 AA |
419 | self.metrics['pool_{}'.format(stat)].set( |
420 | pool['stats'][stat], | |
421 | (pool['id'],) | |
422 | ) | |
94b18763 FG |
423 | |
424 | def get_fs(self): | |
425 | fs_map = self.get('fs_map') | |
426 | servers = self.get_service_list() | |
427 | active_daemons = [] | |
428 | for fs in fs_map['filesystems']: | |
429 | # collect fs metadata | |
11fdf7f2 TL |
430 | data_pools = ",".join([str(pool) |
431 | for pool in fs['mdsmap']['data_pools']]) | |
91327a77 AA |
432 | self.metrics['fs_metadata'].set(1, ( |
433 | data_pools, | |
434 | fs['id'], | |
435 | fs['mdsmap']['metadata_pool'], | |
436 | fs['mdsmap']['fs_name'] | |
437 | )) | |
28e407b8 | 438 | self.log.debug('mdsmap: {}'.format(fs['mdsmap'])) |
94b18763 FG |
439 | for gid, daemon in fs['mdsmap']['info'].items(): |
440 | id_ = daemon['name'] | |
11fdf7f2 | 441 | host_version = servers.get((id_, 'mds'), ('', '')) |
91327a77 AA |
442 | self.metrics['mds_metadata'].set(1, ( |
443 | 'mds.{}'.format(id_), fs['id'], | |
444 | host_version[0], daemon['addr'], | |
445 | daemon['rank'], host_version[1] | |
446 | )) | |
3efd9988 FG |
447 | |
448 | def get_quorum_status(self): | |
449 | mon_status = json.loads(self.get('mon_status')['json']) | |
94b18763 FG |
450 | servers = self.get_service_list() |
451 | for mon in mon_status['monmap']['mons']: | |
452 | rank = mon['rank'] | |
453 | id_ = mon['name'] | |
11fdf7f2 | 454 | host_version = servers.get((id_, 'mon'), ('', '')) |
91327a77 AA |
455 | self.metrics['mon_metadata'].set(1, ( |
456 | 'mon.{}'.format(id_), host_version[0], | |
457 | mon['public_addr'].split(':')[0], rank, | |
458 | host_version[1] | |
459 | )) | |
94b18763 | 460 | in_quorum = int(rank in mon_status['quorum']) |
91327a77 AA |
461 | self.metrics['mon_quorum_status'].set(in_quorum, ( |
462 | 'mon.{}'.format(id_), | |
463 | )) | |
3efd9988 | 464 | |
494da23a TL |
465 | def get_mgr_status(self): |
466 | mgr_map = self.get('mgr_map') | |
467 | servers = self.get_service_list() | |
468 | ||
469 | active = mgr_map['active_name'] | |
470 | standbys = [s.get('name') for s in mgr_map['standbys']] | |
471 | ||
472 | all_mgrs = list(standbys) | |
473 | all_mgrs.append(active) | |
474 | ||
475 | all_modules = {module.get('name'):module.get('can_run') for module in mgr_map['available_modules']} | |
476 | ||
eafe8130 | 477 | ceph_release = None |
494da23a TL |
478 | for mgr in all_mgrs: |
479 | host_version = servers.get((mgr, 'mgr'), ('', '')) | |
480 | if mgr == active: | |
481 | _state = 1 | |
482 | ceph_release = host_version[1].split()[-2] # e.g. nautilus | |
483 | else: | |
484 | _state = 0 | |
485 | ||
486 | self.metrics['mgr_metadata'].set(1, ( | |
487 | 'mgr.{}'.format(mgr), host_version[0], | |
488 | host_version[1] | |
489 | )) | |
490 | self.metrics['mgr_status'].set(_state, ( | |
491 | 'mgr.{}'.format(mgr), | |
492 | )) | |
eafe8130 | 493 | always_on_modules = mgr_map['always_on_modules'].get(ceph_release, []) |
494da23a TL |
494 | active_modules = list(always_on_modules) |
495 | active_modules.extend(mgr_map['modules']) | |
496 | ||
497 | for mod_name in all_modules.keys(): | |
498 | ||
499 | if mod_name in always_on_modules: | |
500 | _state = 2 | |
501 | elif mod_name in active_modules: | |
502 | _state = 1 | |
503 | else: | |
504 | _state = 0 | |
505 | ||
506 | _can_run = 1 if all_modules[mod_name] else 0 | |
507 | self.metrics['mgr_module_status'].set(_state, (mod_name,)) | |
508 | self.metrics['mgr_module_can_run'].set(_can_run, (mod_name,)) | |
509 | ||
3efd9988 | 510 | def get_pg_status(self): |
94b18763 | 511 | |
92f5a8d4 TL |
512 | pg_summary = self.get('pg_summary') |
513 | ||
514 | for pool in pg_summary['by_pool']: | |
515 | total = 0 | |
516 | for state_name, count in pg_summary['by_pool'][pool].items(): | |
517 | reported_states = {} | |
518 | ||
519 | for state in state_name.split('+'): | |
520 | reported_states[state] = reported_states.get( | |
521 | state, 0) + count | |
522 | ||
523 | for state in reported_states: | |
524 | path = 'pg_{}'.format(state) | |
525 | try: | |
526 | self.metrics[path].set(reported_states[state],(pool,)) | |
527 | except KeyError: | |
528 | self.log.warn("skipping pg in unknown state {}".format(state)) | |
529 | ||
530 | for state in PG_STATES: | |
531 | if state not in reported_states: | |
532 | try: | |
533 | self.metrics['pg_{}'.format(state)].set(0,(pool,)) | |
534 | except KeyError: | |
535 | self.log.warn( | |
536 | "skipping pg in unknown state {}".format(state)) | |
537 | total = total + count | |
538 | self.metrics['pg_total'].set(total,(pool,)) | |
b32b8144 FG |
539 | |
540 | def get_osd_stats(self): | |
541 | osd_stats = self.get('osd_stats') | |
542 | for osd in osd_stats['osd_stats']: | |
543 | id_ = osd['osd'] | |
544 | for stat in OSD_STATS: | |
94b18763 | 545 | val = osd['perf_stat'][stat] |
91327a77 AA |
546 | self.metrics['osd_{}'.format(stat)].set(val, ( |
547 | 'osd.{}'.format(id_), | |
548 | )) | |
94b18763 FG |
549 | |
550 | def get_service_list(self): | |
551 | ret = {} | |
552 | for server in self.list_servers(): | |
553 | version = server.get('ceph_version', '') | |
554 | host = server.get('hostname', '') | |
555 | for service in server.get('services', []): | |
556 | ret.update({(service['id'], service['type']): (host, version)}) | |
557 | return ret | |
3efd9988 FG |
558 | |
559 | def get_metadata_and_osd_status(self): | |
560 | osd_map = self.get('osd_map') | |
94b18763 FG |
561 | osd_flags = osd_map['flags'].split(',') |
562 | for flag in OSD_FLAGS: | |
91327a77 AA |
563 | self.metrics['osd_flag_{}'.format(flag)].set( |
564 | int(flag in osd_flags) | |
565 | ) | |
94b18763 | 566 | |
3efd9988 | 567 | osd_devices = self.get('osd_map_crush')['devices'] |
94b18763 | 568 | servers = self.get_service_list() |
3efd9988 | 569 | for osd in osd_map['osds']: |
94b18763 | 570 | # id can be used to link osd metrics and metadata |
3efd9988 | 571 | id_ = osd['osd'] |
94b18763 | 572 | # collect osd metadata |
3efd9988 FG |
573 | p_addr = osd['public_addr'].split(':')[0] |
574 | c_addr = osd['cluster_addr'].split(':')[0] | |
94b18763 FG |
575 | if p_addr == "-" or c_addr == "-": |
576 | self.log.info( | |
577 | "Missing address metadata for osd {0}, skipping occupation" | |
578 | " and metadata records for this osd".format(id_) | |
579 | ) | |
580 | continue | |
581 | ||
582 | dev_class = None | |
583 | for osd_device in osd_devices: | |
584 | if osd_device['id'] == id_: | |
585 | dev_class = osd_device.get('class', '') | |
586 | break | |
587 | ||
588 | if dev_class is None: | |
589 | self.log.info( | |
590 | "OSD {0} is missing from CRUSH map, skipping output".format( | |
591 | id_)) | |
592 | continue | |
593 | ||
11fdf7f2 | 594 | host_version = servers.get((str(id_), 'osd'), ('', '')) |
94b18763 | 595 | |
a8e16298 TL |
596 | # collect disk occupation metadata |
597 | osd_metadata = self.get_metadata("osd", str(id_)) | |
598 | if osd_metadata is None: | |
599 | continue | |
600 | ||
601 | obj_store = osd_metadata.get('osd_objectstore', '') | |
602 | f_iface = osd_metadata.get('front_iface', '') | |
603 | b_iface = osd_metadata.get('back_iface', '') | |
604 | ||
91327a77 | 605 | self.metrics['osd_metadata'].set(1, ( |
a8e16298 | 606 | b_iface, |
28e407b8 | 607 | 'osd.{}'.format(id_), |
3efd9988 | 608 | c_addr, |
94b18763 | 609 | dev_class, |
a8e16298 | 610 | f_iface, |
28e407b8 | 611 | host_version[0], |
a8e16298 TL |
612 | obj_store, |
613 | p_addr, | |
614 | host_version[1] | |
3efd9988 | 615 | )) |
94b18763 FG |
616 | |
617 | # collect osd status | |
3efd9988 FG |
618 | for state in OSD_STATUS: |
619 | status = osd[state] | |
91327a77 AA |
620 | self.metrics['osd_{}'.format(state)].set(status, ( |
621 | 'osd.{}'.format(id_), | |
622 | )) | |
3efd9988 | 623 | |
92f5a8d4 | 624 | osd_dev_node = None |
a8e16298 | 625 | if obj_store == "filestore": |
11fdf7f2 TL |
626 | # collect filestore backend device |
627 | osd_dev_node = osd_metadata.get( | |
628 | 'backend_filestore_dev_node', None) | |
629 | # collect filestore journal device | |
f64942e4 AA |
630 | osd_wal_dev_node = osd_metadata.get('osd_journal', '') |
631 | osd_db_dev_node = '' | |
a8e16298 | 632 | elif obj_store == "bluestore": |
11fdf7f2 TL |
633 | # collect bluestore backend device |
634 | osd_dev_node = osd_metadata.get( | |
635 | 'bluestore_bdev_dev_node', None) | |
636 | # collect bluestore wal backend | |
f64942e4 | 637 | osd_wal_dev_node = osd_metadata.get('bluefs_wal_dev_node', '') |
11fdf7f2 | 638 | # collect bluestore db backend |
f64942e4 AA |
639 | osd_db_dev_node = osd_metadata.get('bluefs_db_dev_node', '') |
640 | if osd_dev_node and osd_dev_node == "unknown": | |
641 | osd_dev_node = None | |
642 | ||
3efd9988 FG |
643 | osd_hostname = osd_metadata.get('hostname', None) |
644 | if osd_dev_node and osd_hostname: | |
645 | self.log.debug("Got dev for osd {0}: {1}/{2}".format( | |
646 | id_, osd_hostname, osd_dev_node)) | |
91327a77 | 647 | self.metrics['disk_occupation'].set(1, ( |
28e407b8 | 648 | "osd.{0}".format(id_), |
3efd9988 | 649 | osd_dev_node, |
f64942e4 AA |
650 | osd_db_dev_node, |
651 | osd_wal_dev_node, | |
28e407b8 | 652 | osd_hostname |
3efd9988 FG |
653 | )) |
654 | else: | |
655 | self.log.info("Missing dev node metadata for osd {0}, skipping " | |
11fdf7f2 | 656 | "occupation record for this osd".format(id_)) |
3efd9988 | 657 | |
94b18763 | 658 | pool_meta = [] |
3efd9988 | 659 | for pool in osd_map['pools']: |
11fdf7f2 TL |
660 | self.metrics['pool_metadata'].set( |
661 | 1, (pool['pool'], pool['pool_name'])) | |
94b18763 | 662 | |
11fdf7f2 | 663 | # Populate other servers metadata |
94b18763 FG |
664 | for key, value in servers.items(): |
665 | service_id, service_type = key | |
11fdf7f2 TL |
666 | if service_type == 'rgw': |
667 | hostname, version = value | |
668 | self.metrics['rgw_metadata'].set( | |
669 | 1, | |
670 | ('{}.{}'.format(service_type, service_id), hostname, version) | |
671 | ) | |
672 | elif service_type == 'rbd-mirror': | |
673 | mirror_metadata = self.get_metadata('rbd-mirror', service_id) | |
674 | if mirror_metadata is None: | |
675 | continue | |
676 | mirror_metadata['ceph_daemon'] = '{}.{}'.format(service_type, | |
677 | service_id) | |
678 | self.metrics['rbd_mirror_metadata'].set( | |
679 | 1, (mirror_metadata.get(k, '') | |
680 | for k in RBD_MIRROR_METADATA) | |
681 | ) | |
3efd9988 | 682 | |
28e407b8 AA |
683 | def get_num_objects(self): |
684 | pg_sum = self.get('pg_summary')['pg_stats_sum']['stat_sum'] | |
685 | for obj in NUM_OBJECTS: | |
686 | stat = 'num_objects_{}'.format(obj) | |
91327a77 | 687 | self.metrics[stat].set(pg_sum[stat]) |
28e407b8 | 688 | |
11fdf7f2 TL |
    def get_rbd_stats(self):
        """Collect and export per-RBD-image perf counters via a dynamic
        OSD perf query, (re)registering the query when the configured
        pool/namespace set changes."""
        # Per RBD image stats is collected by registering a dynamic osd perf
        # stats query that tells OSDs to group stats for requests associated
        # with RBD objects by pool, namespace, and image id, which are
        # extracted from the request object names or other attributes.
        # The RBD object names have the following prefixes:
        # - rbd_data.{image_id}. (data stored in the same pool as metadata)
        # - rbd_data.{pool_id}.{image_id}. (data stored in a dedicated data pool)
        # - journal_data.{pool_id}.{image_id}. (journal if journaling is enabled)
        # The pool_id in the object name is the id of the pool with the image
        # metdata, and should be used in the image spec. If there is no pool_id
        # in the object name, the image pool is the pool where the object is
        # located.

        # Parse rbd_stats_pools option, which is a comma or space separated
        # list of pool[/namespace] entries. If no namespace is specifed the
        # stats are collected for every namespace in the pool.
        pools_string = self.get_localized_module_option('rbd_stats_pools', '')
        pools = {}
        for p in [x for x in re.split('[\s,]+', pools_string) if x]:
            s = p.split('/', 2)
            pool_name = s[0]
            if len(s) == 1:
                # empty set means collect for all namespaces
                pools[pool_name] = set()
                continue
            if pool_name not in pools:
                pools[pool_name] = set()
            elif not pools[pool_name]:
                # already configured for "all namespaces"; keep it that way
                continue
            pools[pool_name].add(s[1])

        # Drop cached pools no longer configured; remember the namespace
        # sets of those that remain so we can detect config changes.
        rbd_stats_pools = {}
        for pool_id in list(self.rbd_stats['pools']):
            name = self.rbd_stats['pools'][pool_id]['name']
            if name not in pools:
                del self.rbd_stats['pools'][pool_id]
            else:
                rbd_stats_pools[name] = \
                    self.rbd_stats['pools'][pool_id]['ns_names']

        pools_refreshed = False
        if pools:
            next_refresh = self.rbd_stats['pools_refresh_time'] + \
                self.get_localized_module_option(
                    'rbd_stats_pools_refresh_interval', 300)
            if rbd_stats_pools != pools or time.time() >= next_refresh:
                self.refresh_rbd_stats_pools(pools)
                pools_refreshed = True

        pool_ids = list(self.rbd_stats['pools'])
        pool_ids.sort()
        pool_id_regex = '^(' + '|'.join([str(x) for x in pool_ids]) + ')$'

        # Build the namespace regex; if ANY configured pool wants all
        # namespaces, fall back to matching everything.
        nspace_names = []
        for pool_id, pool in self.rbd_stats['pools'].items():
            if pool['ns_names']:
                nspace_names.extend(pool['ns_names'])
            else:
                nspace_names = []
                break
        if nspace_names:
            namespace_regex = '^(' + \
                              "|".join([re.escape(x)
                                        for x in set(nspace_names)]) + ')$'
        else:
            namespace_regex = '^(.*)$'

        # If the regexes changed, the registered query is stale: remove it
        # so a fresh one is added below.
        if 'query' in self.rbd_stats and \
           (pool_id_regex != self.rbd_stats['query']['key_descriptor'][0]['regex'] or
                namespace_regex != self.rbd_stats['query']['key_descriptor'][1]['regex']):
            self.remove_osd_perf_query(self.rbd_stats['query_id'])
            del self.rbd_stats['query_id']
            del self.rbd_stats['query']

        if not self.rbd_stats['pools']:
            return

        counters_info = self.rbd_stats['counters_info']

        if 'query_id' not in self.rbd_stats:
            query = {
                'key_descriptor': [
                    {'type': 'pool_id', 'regex': pool_id_regex},
                    {'type': 'namespace', 'regex': namespace_regex},
                    {'type': 'object_name',
                     'regex': '^(?:rbd|journal)_data\.(?:([0-9]+)\.)?([^.]+)\.'},
                ],
                'performance_counter_descriptors': list(counters_info),
            }
            query_id = self.add_osd_perf_query(query)
            if query_id is None:
                self.log.error('failed to add query %s' % query)
                return
            self.rbd_stats['query'] = query
            self.rbd_stats['query_id'] = query_id

        # Accumulate returned counters into the cached per-image entries.
        res = self.get_osd_perf_counters(self.rbd_stats['query_id'])
        for c in res['counters']:
            # if the pool id is not found in the object name use id of the
            # pool where the object is located
            if c['k'][2][0]:
                pool_id = int(c['k'][2][0])
            else:
                pool_id = int(c['k'][0][0])
            if pool_id not in self.rbd_stats['pools'] and not pools_refreshed:
                # unknown pool: refresh the cache once and retry
                self.refresh_rbd_stats_pools(pools)
                pools_refreshed = True
            if pool_id not in self.rbd_stats['pools']:
                continue
            pool = self.rbd_stats['pools'][pool_id]
            nspace_name = c['k'][1][0]
            if nspace_name not in pool['images']:
                continue
            image_id = c['k'][2][1]
            if image_id not in pool['images'][nspace_name] and \
               not pools_refreshed:
                # unknown image: refresh the cache once and retry
                self.refresh_rbd_stats_pools(pools)
                pool = self.rbd_stats['pools'][pool_id]
                pools_refreshed = True
            if image_id not in pool['images'][nspace_name]:
                continue
            counters = pool['images'][nspace_name][image_id]['c']
            # each counter is a [sum, count] pair, ordered per counters_info
            for i in range(len(c['c'])):
                counters[i][0] += c['c'][i][0]
                counters[i][1] += c['c'][i][1]

        # Export the accumulated values, creating the dynamic per-image
        # Metric objects on first use.
        label_names = ("pool", "namespace", "image")
        for pool_id, pool in self.rbd_stats['pools'].items():
            pool_name = pool['name']
            for nspace_name, images in pool['images'].items():
                for image_id in images:
                    image_name = images[image_id]['n']
                    counters = images[image_id]['c']
                    i = 0
                    for key in counters_info:
                        counter_info = counters_info[key]
                        stattype = self._stattype_to_str(counter_info['type'])
                        labels = (pool_name, nspace_name, image_name)
                        if counter_info['type'] == self.PERFCOUNTER_COUNTER:
                            path = 'rbd_' + key
                            if path not in self.metrics:
                                self.metrics[path] = Metric(
                                    stattype,
                                    path,
                                    counter_info['desc'],
                                    label_names,
                                )
                            self.metrics[path].set(counters[i][0], labels)
                        elif counter_info['type'] == self.PERFCOUNTER_LONGRUNAVG:
                            # long-running averages export as _sum + _count
                            path = 'rbd_' + key + '_sum'
                            if path not in self.metrics:
                                self.metrics[path] = Metric(
                                    stattype,
                                    path,
                                    counter_info['desc'] + ' Total',
                                    label_names,
                                )
                            self.metrics[path].set(counters[i][0], labels)
                            path = 'rbd_' + key + '_count'
                            if path not in self.metrics:
                                self.metrics[path] = Metric(
                                    'counter',
                                    path,
                                    counter_info['desc'] + ' Count',
                                    label_names,
                                )
                            self.metrics[path].set(counters[i][1], labels)
                        i += 1
858 | ||
859 | def refresh_rbd_stats_pools(self, pools): | |
860 | self.log.debug('refreshing rbd pools %s' % (pools)) | |
861 | ||
862 | rbd = RBD() | |
863 | counters_info = self.rbd_stats['counters_info'] | |
864 | for pool_name, cfg_ns_names in pools.items(): | |
865 | try: | |
866 | pool_id = self.rados.pool_lookup(pool_name) | |
867 | with self.rados.open_ioctx(pool_name) as ioctx: | |
868 | if pool_id not in self.rbd_stats['pools']: | |
869 | self.rbd_stats['pools'][pool_id] = {'images': {}} | |
870 | pool = self.rbd_stats['pools'][pool_id] | |
871 | pool['name'] = pool_name | |
872 | pool['ns_names'] = cfg_ns_names | |
873 | if cfg_ns_names: | |
874 | nspace_names = list(cfg_ns_names) | |
875 | else: | |
876 | nspace_names = [''] + rbd.namespace_list(ioctx) | |
877 | for nspace_name in pool['images']: | |
878 | if nspace_name not in nspace_names: | |
879 | del pool['images'][nspace_name] | |
880 | for nspace_name in nspace_names: | |
881 | if (nspace_name and | |
882 | not rbd.namespace_exists(ioctx, nspace_name)): | |
883 | self.log.debug('unknown namespace %s for pool %s' % | |
884 | (nspace_name, pool_name)) | |
885 | continue | |
886 | ioctx.set_namespace(nspace_name) | |
887 | if nspace_name not in pool['images']: | |
888 | pool['images'][nspace_name] = {} | |
889 | namespace = pool['images'][nspace_name] | |
890 | images = {} | |
891 | for image_meta in RBD().list2(ioctx): | |
892 | image = {'n': image_meta['name']} | |
893 | image_id = image_meta['id'] | |
894 | if image_id in namespace: | |
895 | image['c'] = namespace[image_id]['c'] | |
896 | else: | |
897 | image['c'] = [[0, 0] for x in counters_info] | |
898 | images[image_id] = image | |
899 | pool['images'][nspace_name] = images | |
900 | except Exception as e: | |
901 | self.log.error('failed listing pool %s: %s' % (pool_name, e)) | |
902 | self.rbd_stats['pools_refresh_time'] = time.time() | |
903 | ||
904 | def shutdown_rbd_stats(self): | |
905 | if 'query_id' in self.rbd_stats: | |
906 | self.remove_osd_perf_query(self.rbd_stats['query_id']) | |
907 | del self.rbd_stats['query_id'] | |
908 | del self.rbd_stats['query'] | |
909 | self.rbd_stats['pools'].clear() | |
910 | ||
c07f9fc5 | 911 | def collect(self): |
91327a77 AA |
912 | # Clear the metrics before scraping |
913 | for k in self.metrics.keys(): | |
914 | self.metrics[k].clear() | |
915 | ||
11fdf7f2 TL |
916 | _start_time = time.time() |
917 | ||
3efd9988 FG |
918 | self.get_health() |
919 | self.get_df() | |
11fdf7f2 | 920 | self.get_pool_stats() |
94b18763 | 921 | self.get_fs() |
b32b8144 | 922 | self.get_osd_stats() |
3efd9988 | 923 | self.get_quorum_status() |
494da23a | 924 | self.get_mgr_status() |
3efd9988 FG |
925 | self.get_metadata_and_osd_status() |
926 | self.get_pg_status() | |
28e407b8 | 927 | self.get_num_objects() |
3efd9988 | 928 | |
94b18763 | 929 | for daemon, counters in self.get_all_perf_counters().items(): |
3efd9988 | 930 | for path, counter_info in counters.items(): |
28e407b8 | 931 | # Skip histograms, they are represented by long running avgs |
3efd9988 | 932 | stattype = self._stattype_to_str(counter_info['type']) |
3efd9988 FG |
933 | if not stattype or stattype == 'histogram': |
934 | self.log.debug('ignoring %s, type %s' % (path, stattype)) | |
935 | continue | |
936 | ||
81eedcae TL |
937 | path, label_names, labels = self._perfpath_to_path_labels( |
938 | daemon, path) | |
939 | ||
28e407b8 | 940 | # Get the value of the counter |
11fdf7f2 TL |
941 | value = self._perfvalue_to_value( |
942 | counter_info['type'], counter_info['value']) | |
28e407b8 AA |
943 | |
944 | # Represent the long running avgs as sum/count pairs | |
945 | if counter_info['type'] & self.PERFCOUNTER_LONGRUNAVG: | |
946 | _path = path + '_sum' | |
91327a77 AA |
947 | if _path not in self.metrics: |
948 | self.metrics[_path] = Metric( | |
949 | stattype, | |
950 | _path, | |
951 | counter_info['description'] + ' Total', | |
81eedcae | 952 | label_names, |
91327a77 | 953 | ) |
81eedcae | 954 | self.metrics[_path].set(value, labels) |
28e407b8 AA |
955 | |
956 | _path = path + '_count' | |
91327a77 AA |
957 | if _path not in self.metrics: |
958 | self.metrics[_path] = Metric( | |
959 | 'counter', | |
960 | _path, | |
961 | counter_info['description'] + ' Count', | |
81eedcae | 962 | label_names, |
91327a77 | 963 | ) |
81eedcae | 964 | self.metrics[_path].set(counter_info['count'], labels,) |
28e407b8 | 965 | else: |
91327a77 AA |
966 | if path not in self.metrics: |
967 | self.metrics[path] = Metric( | |
968 | stattype, | |
969 | path, | |
970 | counter_info['description'], | |
81eedcae | 971 | label_names, |
91327a77 | 972 | ) |
81eedcae | 973 | self.metrics[path].set(value, labels) |
91327a77 | 974 | |
11fdf7f2 TL |
975 | self.get_rbd_stats() |
976 | ||
977 | _end_time = time.time() | |
978 | self.metrics['scrape_duration_seconds'].set(_end_time - _start_time) | |
979 | ||
91327a77 AA |
980 | # Return formatted metrics and clear no longer used data |
981 | _metrics = [m.str_expfmt() for m in self.metrics.values()] | |
982 | for k in self.metrics.keys(): | |
983 | self.metrics[k].clear() | |
984 | ||
985 | return ''.join(_metrics) + '\n' | |
c07f9fc5 | 986 | |
11fdf7f2 TL |
987 | def get_file_sd_config(self): |
988 | servers = self.list_servers() | |
989 | targets = [] | |
990 | for server in servers: | |
991 | hostname = server.get('hostname', '') | |
992 | for service in server.get('services', []): | |
993 | if service['type'] != 'mgr': | |
994 | continue | |
995 | id_ = service['id'] | |
996 | # get port for prometheus module at mgr with id_ | |
997 | # TODO use get_config_prefix or get_config here once | |
998 | # https://github.com/ceph/ceph/pull/20458 is merged | |
999 | result = CommandResult("") | |
1000 | global_instance().send_command( | |
1001 | result, "mon", '', | |
1002 | json.dumps({ | |
1003 | "prefix": "config-key get", | |
1004 | 'key': "config/mgr/mgr/prometheus/{}/server_port".format(id_), | |
1005 | }), | |
1006 | "") | |
1007 | r, outb, outs = result.wait() | |
1008 | if r != 0: | |
1009 | global_instance().log.error("Failed to retrieve port for mgr {}: {}".format(id_, outs)) | |
1010 | targets.append('{}:{}'.format(hostname, DEFAULT_PORT)) | |
1011 | else: | |
1012 | port = json.loads(outb) | |
1013 | targets.append('{}:{}'.format(hostname, port)) | |
1014 | ||
1015 | ret = [ | |
1016 | { | |
1017 | "targets": targets, | |
1018 | "labels": {} | |
1019 | } | |
1020 | ] | |
1021 | return 0, json.dumps(ret), "" | |
1022 | ||
    def self_test(self):
        """Smoke-test the module by running one full metrics collection
        and one file_sd_config generation; any exception raised by
        either call fails the self-test."""
        self.collect()
        self.get_file_sd_config()
1026 | ||
1027 | def handle_command(self, inbuf, cmd): | |
1028 | if cmd['prefix'] == 'prometheus file_sd_config': | |
1029 | return self.get_file_sd_config() | |
3efd9988 FG |
1030 | else: |
1031 | return (-errno.EINVAL, '', | |
1032 | "Command not found '{0}'".format(cmd['prefix'])) | |
c07f9fc5 FG |
1033 | |
    def serve(self):
        """Start the CherryPy HTTP server and block until shutdown().

        Serves '/' (a landing page linking to /metrics) and '/metrics'
        (the Prometheus exposition output of collect(), cached for
        ``scrape_interval`` seconds and guarded by ``collect_lock``).
        """

        class Root(object):

            # collapse everything to '/'
            def _cp_dispatch(self, vpath):
                cherrypy.request.path = ''
                return self

            @cherrypy.expose
            def index(self):
                return '''<!DOCTYPE html>
<html>
<head><title>Ceph Exporter</title></head>
<body>
<h1>Ceph Exporter</h1>
<p><a href='/metrics'>Metrics</a></p>
</body>
</html>'''

            @cherrypy.expose
            def metrics(self):
                instance = global_instance()
                # Lock the function execution so concurrent scrapes
                # serialize on a single collect()/cache access.
                try:
                    instance.collect_lock.acquire()
                    return self._metrics(instance)
                finally:
                    instance.collect_lock.release()

            @staticmethod
            def _metrics(instance):
                # Return cached data if available and collected before the cache times out
                if instance.collect_cache and time.time() - instance.collect_time < instance.collect_timeout:
                    cherrypy.response.headers['Content-Type'] = 'text/plain'
                    return instance.collect_cache

                if instance.have_mon_connection():
                    # Invalidate before re-collecting so a failure mid-scrape
                    # doesn't leave a stale cache marked fresh.
                    instance.collect_cache = None
                    instance.collect_time = time.time()
                    instance.collect_cache = instance.collect()
                    cherrypy.response.headers['Content-Type'] = 'text/plain'
                    return instance.collect_cache
                else:
                    raise cherrypy.HTTPError(503, 'No MON connection')

        # Make the cache timeout for collecting configurable
        self.collect_timeout = float(self.get_localized_module_option(
            'scrape_interval', 5.0))

        server_addr = self.get_localized_module_option(
            'server_addr', get_default_addr())
        server_port = self.get_localized_module_option(
            'server_port', DEFAULT_PORT)
        self.log.info(
            "server_addr: %s server_port: %s" %
            (server_addr, server_port)
        )

        # Publish the URI that others may use to access the service we're
        # about to start serving
        self.set_uri('http://{0}:{1}/'.format(
            socket.getfqdn() if server_addr in ['::', '0.0.0.0'] else server_addr,
            server_port
        ))

        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })
        cherrypy.tree.mount(Root(), "/")
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info('Engine started.')
        # wait for the shutdown event
        self.shutdown_event.wait()
        self.shutdown_event.clear()
        cherrypy.engine.stop()
        self.log.info('Engine stopped.')
        self.shutdown_rbd_stats()
94b18763 FG |
1115 | |
    def shutdown(self):
        """Signal serve() to stop the CherryPy engine and return."""
        self.log.info('Stopping engine...')
        # serve() blocks on this event; setting it unblocks the stop path.
        self.shutdown_event.set()
94b18763 FG |
1119 | |
1120 | ||
class StandbyModule(MgrStandbyModule):
    """HTTP endpoint served while this mgr is a standby.

    '/' links visitors to the active mgr's exporter URI and '/metrics'
    returns an empty body so scrapers hitting a standby don't error out.
    """

    def __init__(self, *args, **kwargs):
        super(StandbyModule, self).__init__(*args, **kwargs)
        # Set by shutdown() to make serve() return.
        self.shutdown_event = threading.Event()

    def serve(self):
        """Start a minimal CherryPy server and block until shutdown()."""
        server_addr = self.get_localized_module_option(
            'server_addr', get_default_addr())
        server_port = self.get_localized_module_option(
            'server_port', DEFAULT_PORT)
        self.log.info("server_addr: %s server_port: %s" %
                      (server_addr, server_port))
        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })

        # Capture self for use inside the handler class below.
        module = self

        class Root(object):
            @cherrypy.expose
            def index(self):
                # Point visitors at the active mgr's exporter.
                active_uri = module.get_active_uri()
                return '''<!DOCTYPE html>
<html>
<head><title>Ceph Exporter</title></head>
<body>
<h1>Ceph Exporter</h1>
<p><a href='{}metrics'>Metrics</a></p>
</body>
</html>'''.format(active_uri)

            @cherrypy.expose
            def metrics(self):
                # Standby mgrs expose no metrics; serve an empty body.
                cherrypy.response.headers['Content-Type'] = 'text/plain'
                return ''

        cherrypy.tree.mount(Root(), '/', {})
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info('Engine started.')
        # Wait for shutdown event
        self.shutdown_event.wait()
        self.shutdown_event.clear()
        cherrypy.engine.stop()
        self.log.info('Engine stopped.')

    def shutdown(self):
        """Unblock serve() so it can stop the engine."""
        self.log.info("Stopping engine...")
        self.shutdown_event.set()
        self.log.info("Stopped engine")