]>
Commit | Line | Data |
---|---|---|
c07f9fc5 | 1 | import cherrypy |
a8e16298 | 2 | from distutils.version import StrictVersion |
3efd9988 FG |
3 | import json |
4 | import errno | |
c07f9fc5 FG |
5 | import math |
6 | import os | |
11fdf7f2 | 7 | import re |
94b18763 | 8 | import socket |
91327a77 AA |
9 | import threading |
10 | import time | |
11fdf7f2 | 11 | from mgr_module import MgrModule, MgrStandbyModule, CommandResult, PG_STATES |
494da23a | 12 | from mgr_util import get_default_addr |
11fdf7f2 | 13 | from rbd import RBD |
c07f9fc5 FG |
14 | |
# Defaults for the Prometheus HTTP server. Can also set in config-key
# see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
# for Prometheus exporter port registry

DEFAULT_PORT = 9283

# When the CherryPy server in 3.2.2 (and later) starts it attempts to verify
# that the ports its listening on are in fact bound. When using the any address
# "::" it tries both ipv4 and ipv6, and in some environments (e.g. kubernetes)
# ipv6 isn't yet configured / supported and CherryPy throws an uncaught
# exception.
if cherrypy is not None:
    v = StrictVersion(cherrypy.__version__)
    # the issue was fixed in 3.2.3. it's present in 3.2.2 (current version on
    # centos:7) and back to at least 3.0.0.
    if StrictVersion("3.1.2") <= v < StrictVersion("3.2.3"):
        # https://github.com/cherrypy/cherrypy/issues/1100
        from cherrypy.process import servers
        # Disable the port-liveness verification entirely so that startup
        # cannot die on dual-stack hosts where ipv6 is unavailable.
        servers.wait_for_occupied_port = lambda host, port: None
c07f9fc5 | 34 | |
9f95a23c | 35 | |
# cherrypy likes to sys.exit on error. don't let it take us down too!
def os_exit_noop(*args, **kwargs):
    """Accept and ignore any arguments; never terminate the process.

    Installed in place of os._exit below so that CherryPy's error
    handling cannot hard-kill the ceph-mgr daemon hosting this module.
    """


os._exit = os_exit_noop
42 | ||
c07f9fc5 FG |
43 | # to access things in class Module from subclass Root. Because |
44 | # it's a dict, the writer doesn't need to declare 'global' for access | |
45 | ||
46 | _global_instance = {'plugin': None} | |
47 | ||
48 | ||
49 | def global_instance(): | |
50 | assert _global_instance['plugin'] is not None | |
51 | return _global_instance['plugin'] | |
52 | ||
53 | ||
def health_status_to_number(status):
    """Map a Ceph health status string to its numeric metric value.

    HEALTH_OK -> 0, HEALTH_WARN -> 1, HEALTH_ERR -> 2; any other
    string yields None (same as the previous if/elif chain).
    """
    return {
        'HEALTH_OK': 0,
        'HEALTH_WARN': 1,
        'HEALTH_ERR': 2,
    }.get(status)
c07f9fc5 | 61 | |
11fdf7f2 TL |
62 | |
# Cluster-wide df stats exported as ceph_cluster_<name>.
DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_used_raw_bytes']

# Per-pool df stats exported as ceph_pool_<name>.
DF_POOL = ['max_avail', 'stored', 'stored_raw', 'objects', 'dirty',
           'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes']

# Per-pool recovery counters exported as ceph_pool_<name>.
# BUG FIX: the last entry used to be a duplicate 'num_bytes_recovered',
# so the keys-recovered counter was never created or exported.
OSD_POOL_STATS = ('recovering_objects_per_sec', 'recovering_bytes_per_sec',
                  'recovering_keys_per_sec', 'num_objects_recovered',
                  'num_bytes_recovered', 'num_keys_recovered')

# OSD map flags exported as ceph_osd_flag_<name>.
OSD_FLAGS = ('noup', 'nodown', 'noout', 'noin', 'nobackfill', 'norebalance',
             'norecover', 'noscrub', 'nodeep-scrub')

# Label names for the *_metadata / status metric families below.
FS_METADATA = ('data_pools', 'fs_id', 'metadata_pool', 'name')

MDS_METADATA = ('ceph_daemon', 'fs_id', 'hostname', 'public_addr', 'rank',
                'ceph_version')

MON_METADATA = ('ceph_daemon', 'hostname',
                'public_addr', 'rank', 'ceph_version')

MGR_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

MGR_STATUS = ('ceph_daemon',)

MGR_MODULE_STATUS = ('name',)

MGR_MODULE_CAN_RUN = ('name',)

OSD_METADATA = ('back_iface', 'ceph_daemon', 'cluster_addr', 'device_class',
                'front_iface', 'hostname', 'objectstore', 'public_addr',
                'ceph_version')

OSD_STATUS = ['weight', 'up', 'in']

OSD_STATS = ['apply_latency_ms', 'commit_latency_ms']

POOL_METADATA = ('pool_id', 'name')

RGW_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')

RBD_MIRROR_METADATA = ('ceph_daemon', 'id', 'instance_id', 'hostname',
                       'ceph_version')

# Kept separate from OSD_METADATA so the label names line up with the
# ones used by the Prometheus node_exporter.
DISK_OCCUPATION = ('ceph_daemon', 'device', 'db_device',
                   'wal_device', 'instance')

# Object health counts exported as ceph_num_objects_<name>.
NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']
c07f9fc5 | 110 | |
c07f9fc5 | 111 | |
91327a77 AA |
112 | class Metric(object): |
113 | def __init__(self, mtype, name, desc, labels=None): | |
114 | self.mtype = mtype | |
115 | self.name = name | |
116 | self.desc = desc | |
117 | self.labelnames = labels # tuple if present | |
118 | self.value = {} # indexed by label values | |
119 | ||
120 | def clear(self): | |
121 | self.value = {} | |
122 | ||
123 | def set(self, value, labelvalues=None): | |
124 | # labelvalues must be a tuple | |
125 | labelvalues = labelvalues or ('',) | |
126 | self.value[labelvalues] = value | |
3efd9988 | 127 | |
91327a77 AA |
128 | def str_expfmt(self): |
129 | ||
130 | def promethize(path): | |
131 | ''' replace illegal metric name characters ''' | |
81eedcae | 132 | result = re.sub(r'[./\s]|::', '_', path).replace('+', '_plus') |
91327a77 AA |
133 | |
134 | # Hyphens usually turn into underscores, unless they are | |
135 | # trailing | |
136 | if result.endswith("-"): | |
137 | result = result[0:-1] + "_minus" | |
138 | else: | |
139 | result = result.replace("-", "_") | |
140 | ||
141 | return "ceph_{0}".format(result) | |
142 | ||
143 | def floatstr(value): | |
144 | ''' represent as Go-compatible float ''' | |
145 | if value == float('inf'): | |
146 | return '+Inf' | |
147 | if value == float('-inf'): | |
148 | return '-Inf' | |
149 | if math.isnan(value): | |
150 | return 'NaN' | |
151 | return repr(float(value)) | |
152 | ||
153 | name = promethize(self.name) | |
154 | expfmt = ''' | |
155 | # HELP {name} {desc} | |
156 | # TYPE {name} {mtype}'''.format( | |
157 | name=name, | |
158 | desc=self.desc, | |
159 | mtype=self.mtype, | |
160 | ) | |
161 | ||
162 | for labelvalues, value in self.value.items(): | |
163 | if self.labelnames: | |
164 | labels = zip(self.labelnames, labelvalues) | |
165 | labels = ','.join('%s="%s"' % (k, v) for k, v in labels) | |
166 | else: | |
167 | labels = '' | |
168 | if labels: | |
169 | fmtstr = '\n{name}{{{labels}}} {value}' | |
170 | else: | |
171 | fmtstr = '\n{name} {value}' | |
172 | expfmt += fmtstr.format( | |
173 | name=name, | |
174 | labels=labels, | |
175 | value=floatstr(value), | |
176 | ) | |
177 | return expfmt | |
178 | ||
179 | ||
180 | class Module(MgrModule): | |
    # CLI commands this module registers; each entry declares the command
    # string, its help text and the required permission ('r' = read-only).
    COMMANDS = [
        {
            "cmd": "prometheus file_sd_config",
            "desc": "Return file_sd compatible prometheus config for mgr cluster",
            "perm": "r"
        },
    ]

    # Configuration options recognised by this module
    # (set via `ceph config set mgr mgr/prometheus/<name> <value>`).
    MODULE_OPTIONS = [
        {'name': 'server_addr'},
        {'name': 'server_port'},
        {'name': 'scrape_interval'},
        {'name': 'rbd_stats_pools'},
        {'name': 'rbd_stats_pools_refresh_interval'},
    ]
196 | ||
    def __init__(self, *args, **kwargs):
        """Initialize metric storage, collection state and RBD stats config,
        and register this instance for global_instance() lookup."""
        super(Module, self).__init__(*args, **kwargs)
        # All statically-named Metric objects, keyed by metric path.
        self.metrics = self._setup_static_metrics()
        # Signalled on shutdown to wake the serve loop.
        self.shutdown_event = threading.Event()
        # Serializes metric collection across concurrent scrapes; collect
        # results are cached for collect_timeout seconds.
        self.collect_lock = threading.RLock()
        self.collect_time = 0
        self.collect_timeout = 5.0
        self.collect_cache = None
        # State for per-RBD-image perf counters (see get_rbd_stats):
        # 'pools' maps pool_id -> pool/image info, 'counters_info' declares
        # which perf counters are exported and how.
        self.rbd_stats = {
            'pools': {},
            'pools_refresh_time': 0,
            'counters_info': {
                'write_ops': {'type': self.PERFCOUNTER_COUNTER,
                              'desc': 'RBD image writes count'},
                'read_ops': {'type': self.PERFCOUNTER_COUNTER,
                             'desc': 'RBD image reads count'},
                'write_bytes': {'type': self.PERFCOUNTER_COUNTER,
                                'desc': 'RBD image bytes written'},
                'read_bytes': {'type': self.PERFCOUNTER_COUNTER,
                               'desc': 'RBD image bytes read'},
                'write_latency': {'type': self.PERFCOUNTER_LONGRUNAVG,
                                  'desc': 'RBD image writes latency (msec)'},
                'read_latency': {'type': self.PERFCOUNTER_LONGRUNAVG,
                                 'desc': 'RBD image reads latency (msec)'},
            },
        }
        # Make this instance reachable from module-level code (see
        # global_instance() at the top of the file).
        _global_instance['plugin'] = self
3efd9988 FG |
224 | |
    def _setup_static_metrics(self):
        """Build the dict of Metric objects whose names are known up front.

        Returns a dict keyed by metric path (e.g. 'osd_up', 'pool_rd_bytes');
        per-RBD-image metrics are created lazily in get_rbd_stats() instead.
        """
        metrics = {}
        metrics['health_status'] = Metric(
            'untyped',
            'health_status',
            'Cluster health status'
        )
        metrics['mon_quorum_status'] = Metric(
            'gauge',
            'mon_quorum_status',
            'Monitors in quorum',
            ('ceph_daemon',)
        )
        metrics['fs_metadata'] = Metric(
            'untyped',
            'fs_metadata',
            'FS Metadata',
            FS_METADATA
        )
        metrics['mds_metadata'] = Metric(
            'untyped',
            'mds_metadata',
            'MDS Metadata',
            MDS_METADATA
        )
        metrics['mon_metadata'] = Metric(
            'untyped',
            'mon_metadata',
            'MON Metadata',
            MON_METADATA
        )
        metrics['mgr_metadata'] = Metric(
            'gauge',
            'mgr_metadata',
            'MGR metadata',
            MGR_METADATA
        )
        metrics['mgr_status'] = Metric(
            'gauge',
            'mgr_status',
            'MGR status (0=standby, 1=active)',
            MGR_STATUS
        )
        metrics['mgr_module_status'] = Metric(
            'gauge',
            'mgr_module_status',
            'MGR module status (0=disabled, 1=enabled, 2=auto-enabled)',
            MGR_MODULE_STATUS
        )
        metrics['mgr_module_can_run'] = Metric(
            'gauge',
            'mgr_module_can_run',
            'MGR module runnable state i.e. can it run (0=no, 1=yes)',
            MGR_MODULE_CAN_RUN
        )
        metrics['osd_metadata'] = Metric(
            'untyped',
            'osd_metadata',
            'OSD Metadata',
            OSD_METADATA
        )

        # The reason for having this separate to OSD_METADATA is
        # so that we can stably use the same tag names that
        # the Prometheus node_exporter does
        metrics['disk_occupation'] = Metric(
            'untyped',
            'disk_occupation',
            'Associate Ceph daemon with disk used',
            DISK_OCCUPATION
        )

        metrics['pool_metadata'] = Metric(
            'untyped',
            'pool_metadata',
            'POOL Metadata',
            POOL_METADATA
        )

        metrics['rgw_metadata'] = Metric(
            'untyped',
            'rgw_metadata',
            'RGW Metadata',
            RGW_METADATA
        )

        metrics['rbd_mirror_metadata'] = Metric(
            'untyped',
            'rbd_mirror_metadata',
            'RBD Mirror Metadata',
            RBD_MIRROR_METADATA
        )

        metrics['pg_total'] = Metric(
            'gauge',
            'pg_total',
            'PG Total Count per Pool',
            ('pool_id',)
        )

        # One untyped metric per OSD map flag (value is 0/1).
        for flag in OSD_FLAGS:
            path = 'osd_flag_{}'.format(flag)
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD Flag {}'.format(flag)
            )
        # Per-OSD status metrics (weight/up/in), labelled by daemon.
        for state in OSD_STATUS:
            path = 'osd_{}'.format(state)
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD status {}'.format(state),
                ('ceph_daemon',)
            )
        # Per-OSD perf stats (apply/commit latency).
        for stat in OSD_STATS:
            path = 'osd_{}'.format(stat)
            metrics[path] = Metric(
                'gauge',
                path,
                'OSD stat {}'.format(stat),
                ('ceph_daemon',)
            )
        # Per-pool recovery rates.
        for stat in OSD_POOL_STATS:
            path = 'pool_{}'.format(stat)
            metrics[path] = Metric(
                'gauge',
                path,
                "OSD pool stats: {}".format(stat),
                ('pool_id',)
            )
        # Per-pool PG state counts for every known PG state.
        for state in PG_STATES:
            path = 'pg_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'PG {} per pool'.format(state),
                ('pool_id',)
            )
        # Cluster-wide df stats (unlabelled).
        for state in DF_CLUSTER:
            path = 'cluster_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'DF {}'.format(state),
            )
        # Per-pool df stats.
        for state in DF_POOL:
            path = 'pool_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'DF pool {}'.format(state),
                ('pool_id',)
            )
        # Object health counts (degraded/misplaced/unfound).
        for state in NUM_OBJECTS:
            path = 'num_objects_{}'.format(state)
            metrics[path] = Metric(
                'gauge',
                path,
                'Number of {} objects'.format(state),
            )

        return metrics
c07f9fc5 | 388 | |
3efd9988 FG |
389 | def get_health(self): |
390 | health = json.loads(self.get('health')['json']) | |
91327a77 AA |
391 | self.metrics['health_status'].set( |
392 | health_status_to_number(health['status']) | |
c07f9fc5 FG |
393 | ) |
394 | ||
11fdf7f2 TL |
395 | def get_pool_stats(self): |
396 | # retrieve pool stats to provide per pool recovery metrics | |
397 | # (osd_pool_stats moved to mgr in Mimic) | |
398 | pstats = self.get('osd_pool_stats') | |
399 | for pool in pstats['pool_stats']: | |
400 | for stat in OSD_POOL_STATS: | |
401 | self.metrics['pool_{}'.format(stat)].set( | |
402 | pool['recovery_rate'].get(stat, 0), | |
403 | (pool['pool_id'],) | |
404 | ) | |
405 | ||
3efd9988 FG |
406 | def get_df(self): |
407 | # maybe get the to-be-exported metrics from a config? | |
408 | df = self.get('df') | |
409 | for stat in DF_CLUSTER: | |
91327a77 | 410 | self.metrics['cluster_{}'.format(stat)].set(df['stats'][stat]) |
3efd9988 FG |
411 | |
412 | for pool in df['pools']: | |
413 | for stat in DF_POOL: | |
91327a77 AA |
414 | self.metrics['pool_{}'.format(stat)].set( |
415 | pool['stats'][stat], | |
416 | (pool['id'],) | |
417 | ) | |
94b18763 FG |
418 | |
419 | def get_fs(self): | |
420 | fs_map = self.get('fs_map') | |
421 | servers = self.get_service_list() | |
9f95a23c TL |
422 | self.log.debug('standbys: {}'.format(fs_map['standbys'])) |
423 | # export standby mds metadata, default standby fs_id is '-1' | |
424 | for standby in fs_map['standbys']: | |
425 | id_ = standby['name'] | |
426 | host_version = servers.get((id_, 'mds'), ('', '')) | |
427 | self.metrics['mds_metadata'].set(1, ( | |
428 | 'mds.{}'.format(id_), '-1', | |
429 | host_version[0], standby['addr'], | |
430 | standby['rank'], host_version[1] | |
431 | )) | |
94b18763 FG |
432 | for fs in fs_map['filesystems']: |
433 | # collect fs metadata | |
11fdf7f2 TL |
434 | data_pools = ",".join([str(pool) |
435 | for pool in fs['mdsmap']['data_pools']]) | |
91327a77 AA |
436 | self.metrics['fs_metadata'].set(1, ( |
437 | data_pools, | |
438 | fs['id'], | |
439 | fs['mdsmap']['metadata_pool'], | |
440 | fs['mdsmap']['fs_name'] | |
441 | )) | |
28e407b8 | 442 | self.log.debug('mdsmap: {}'.format(fs['mdsmap'])) |
94b18763 FG |
443 | for gid, daemon in fs['mdsmap']['info'].items(): |
444 | id_ = daemon['name'] | |
11fdf7f2 | 445 | host_version = servers.get((id_, 'mds'), ('', '')) |
91327a77 AA |
446 | self.metrics['mds_metadata'].set(1, ( |
447 | 'mds.{}'.format(id_), fs['id'], | |
448 | host_version[0], daemon['addr'], | |
449 | daemon['rank'], host_version[1] | |
450 | )) | |
3efd9988 FG |
451 | |
452 | def get_quorum_status(self): | |
453 | mon_status = json.loads(self.get('mon_status')['json']) | |
94b18763 FG |
454 | servers = self.get_service_list() |
455 | for mon in mon_status['monmap']['mons']: | |
456 | rank = mon['rank'] | |
457 | id_ = mon['name'] | |
11fdf7f2 | 458 | host_version = servers.get((id_, 'mon'), ('', '')) |
91327a77 AA |
459 | self.metrics['mon_metadata'].set(1, ( |
460 | 'mon.{}'.format(id_), host_version[0], | |
461 | mon['public_addr'].split(':')[0], rank, | |
462 | host_version[1] | |
463 | )) | |
94b18763 | 464 | in_quorum = int(rank in mon_status['quorum']) |
91327a77 AA |
465 | self.metrics['mon_quorum_status'].set(in_quorum, ( |
466 | 'mon.{}'.format(id_), | |
467 | )) | |
3efd9988 | 468 | |
494da23a TL |
469 | def get_mgr_status(self): |
470 | mgr_map = self.get('mgr_map') | |
471 | servers = self.get_service_list() | |
472 | ||
473 | active = mgr_map['active_name'] | |
474 | standbys = [s.get('name') for s in mgr_map['standbys']] | |
475 | ||
476 | all_mgrs = list(standbys) | |
477 | all_mgrs.append(active) | |
478 | ||
479 | all_modules = {module.get('name'):module.get('can_run') for module in mgr_map['available_modules']} | |
480 | ||
eafe8130 | 481 | ceph_release = None |
494da23a TL |
482 | for mgr in all_mgrs: |
483 | host_version = servers.get((mgr, 'mgr'), ('', '')) | |
484 | if mgr == active: | |
485 | _state = 1 | |
486 | ceph_release = host_version[1].split()[-2] # e.g. nautilus | |
487 | else: | |
488 | _state = 0 | |
489 | ||
490 | self.metrics['mgr_metadata'].set(1, ( | |
491 | 'mgr.{}'.format(mgr), host_version[0], | |
492 | host_version[1] | |
493 | )) | |
494 | self.metrics['mgr_status'].set(_state, ( | |
495 | 'mgr.{}'.format(mgr), | |
496 | )) | |
eafe8130 | 497 | always_on_modules = mgr_map['always_on_modules'].get(ceph_release, []) |
494da23a TL |
498 | active_modules = list(always_on_modules) |
499 | active_modules.extend(mgr_map['modules']) | |
500 | ||
501 | for mod_name in all_modules.keys(): | |
502 | ||
503 | if mod_name in always_on_modules: | |
504 | _state = 2 | |
505 | elif mod_name in active_modules: | |
506 | _state = 1 | |
507 | else: | |
508 | _state = 0 | |
509 | ||
510 | _can_run = 1 if all_modules[mod_name] else 0 | |
511 | self.metrics['mgr_module_status'].set(_state, (mod_name,)) | |
512 | self.metrics['mgr_module_can_run'].set(_can_run, (mod_name,)) | |
513 | ||
3efd9988 | 514 | def get_pg_status(self): |
94b18763 | 515 | |
92f5a8d4 TL |
516 | pg_summary = self.get('pg_summary') |
517 | ||
518 | for pool in pg_summary['by_pool']: | |
519 | total = 0 | |
520 | for state_name, count in pg_summary['by_pool'][pool].items(): | |
521 | reported_states = {} | |
522 | ||
523 | for state in state_name.split('+'): | |
524 | reported_states[state] = reported_states.get( | |
525 | state, 0) + count | |
526 | ||
527 | for state in reported_states: | |
528 | path = 'pg_{}'.format(state) | |
529 | try: | |
530 | self.metrics[path].set(reported_states[state],(pool,)) | |
531 | except KeyError: | |
532 | self.log.warn("skipping pg in unknown state {}".format(state)) | |
533 | ||
534 | for state in PG_STATES: | |
535 | if state not in reported_states: | |
536 | try: | |
537 | self.metrics['pg_{}'.format(state)].set(0,(pool,)) | |
538 | except KeyError: | |
539 | self.log.warn( | |
540 | "skipping pg in unknown state {}".format(state)) | |
541 | total = total + count | |
542 | self.metrics['pg_total'].set(total,(pool,)) | |
b32b8144 FG |
543 | |
544 | def get_osd_stats(self): | |
545 | osd_stats = self.get('osd_stats') | |
546 | for osd in osd_stats['osd_stats']: | |
547 | id_ = osd['osd'] | |
548 | for stat in OSD_STATS: | |
94b18763 | 549 | val = osd['perf_stat'][stat] |
91327a77 AA |
550 | self.metrics['osd_{}'.format(stat)].set(val, ( |
551 | 'osd.{}'.format(id_), | |
552 | )) | |
94b18763 FG |
553 | |
554 | def get_service_list(self): | |
555 | ret = {} | |
556 | for server in self.list_servers(): | |
557 | version = server.get('ceph_version', '') | |
558 | host = server.get('hostname', '') | |
559 | for service in server.get('services', []): | |
560 | ret.update({(service['id'], service['type']): (host, version)}) | |
561 | return ret | |
3efd9988 FG |
562 | |
563 | def get_metadata_and_osd_status(self): | |
564 | osd_map = self.get('osd_map') | |
94b18763 FG |
565 | osd_flags = osd_map['flags'].split(',') |
566 | for flag in OSD_FLAGS: | |
91327a77 AA |
567 | self.metrics['osd_flag_{}'.format(flag)].set( |
568 | int(flag in osd_flags) | |
569 | ) | |
94b18763 | 570 | |
3efd9988 | 571 | osd_devices = self.get('osd_map_crush')['devices'] |
94b18763 | 572 | servers = self.get_service_list() |
3efd9988 | 573 | for osd in osd_map['osds']: |
94b18763 | 574 | # id can be used to link osd metrics and metadata |
3efd9988 | 575 | id_ = osd['osd'] |
94b18763 | 576 | # collect osd metadata |
3efd9988 FG |
577 | p_addr = osd['public_addr'].split(':')[0] |
578 | c_addr = osd['cluster_addr'].split(':')[0] | |
94b18763 FG |
579 | if p_addr == "-" or c_addr == "-": |
580 | self.log.info( | |
581 | "Missing address metadata for osd {0}, skipping occupation" | |
582 | " and metadata records for this osd".format(id_) | |
583 | ) | |
584 | continue | |
585 | ||
586 | dev_class = None | |
587 | for osd_device in osd_devices: | |
588 | if osd_device['id'] == id_: | |
589 | dev_class = osd_device.get('class', '') | |
590 | break | |
591 | ||
592 | if dev_class is None: | |
9f95a23c TL |
593 | self.log.info("OSD {0} is missing from CRUSH map, " |
594 | "skipping output".format(id_)) | |
94b18763 FG |
595 | continue |
596 | ||
11fdf7f2 | 597 | host_version = servers.get((str(id_), 'osd'), ('', '')) |
94b18763 | 598 | |
a8e16298 TL |
599 | # collect disk occupation metadata |
600 | osd_metadata = self.get_metadata("osd", str(id_)) | |
601 | if osd_metadata is None: | |
602 | continue | |
603 | ||
604 | obj_store = osd_metadata.get('osd_objectstore', '') | |
605 | f_iface = osd_metadata.get('front_iface', '') | |
606 | b_iface = osd_metadata.get('back_iface', '') | |
607 | ||
91327a77 | 608 | self.metrics['osd_metadata'].set(1, ( |
a8e16298 | 609 | b_iface, |
28e407b8 | 610 | 'osd.{}'.format(id_), |
3efd9988 | 611 | c_addr, |
94b18763 | 612 | dev_class, |
a8e16298 | 613 | f_iface, |
28e407b8 | 614 | host_version[0], |
a8e16298 TL |
615 | obj_store, |
616 | p_addr, | |
617 | host_version[1] | |
3efd9988 | 618 | )) |
94b18763 FG |
619 | |
620 | # collect osd status | |
3efd9988 FG |
621 | for state in OSD_STATUS: |
622 | status = osd[state] | |
91327a77 AA |
623 | self.metrics['osd_{}'.format(state)].set(status, ( |
624 | 'osd.{}'.format(id_), | |
625 | )) | |
3efd9988 | 626 | |
92f5a8d4 | 627 | osd_dev_node = None |
a8e16298 | 628 | if obj_store == "filestore": |
11fdf7f2 TL |
629 | # collect filestore backend device |
630 | osd_dev_node = osd_metadata.get( | |
631 | 'backend_filestore_dev_node', None) | |
632 | # collect filestore journal device | |
f64942e4 AA |
633 | osd_wal_dev_node = osd_metadata.get('osd_journal', '') |
634 | osd_db_dev_node = '' | |
a8e16298 | 635 | elif obj_store == "bluestore": |
11fdf7f2 TL |
636 | # collect bluestore backend device |
637 | osd_dev_node = osd_metadata.get( | |
638 | 'bluestore_bdev_dev_node', None) | |
639 | # collect bluestore wal backend | |
f64942e4 | 640 | osd_wal_dev_node = osd_metadata.get('bluefs_wal_dev_node', '') |
11fdf7f2 | 641 | # collect bluestore db backend |
f64942e4 AA |
642 | osd_db_dev_node = osd_metadata.get('bluefs_db_dev_node', '') |
643 | if osd_dev_node and osd_dev_node == "unknown": | |
644 | osd_dev_node = None | |
645 | ||
3efd9988 FG |
646 | osd_hostname = osd_metadata.get('hostname', None) |
647 | if osd_dev_node and osd_hostname: | |
648 | self.log.debug("Got dev for osd {0}: {1}/{2}".format( | |
649 | id_, osd_hostname, osd_dev_node)) | |
91327a77 | 650 | self.metrics['disk_occupation'].set(1, ( |
28e407b8 | 651 | "osd.{0}".format(id_), |
3efd9988 | 652 | osd_dev_node, |
f64942e4 AA |
653 | osd_db_dev_node, |
654 | osd_wal_dev_node, | |
28e407b8 | 655 | osd_hostname |
3efd9988 FG |
656 | )) |
657 | else: | |
658 | self.log.info("Missing dev node metadata for osd {0}, skipping " | |
11fdf7f2 | 659 | "occupation record for this osd".format(id_)) |
3efd9988 FG |
660 | |
661 | for pool in osd_map['pools']: | |
11fdf7f2 TL |
662 | self.metrics['pool_metadata'].set( |
663 | 1, (pool['pool'], pool['pool_name'])) | |
94b18763 | 664 | |
11fdf7f2 | 665 | # Populate other servers metadata |
94b18763 FG |
666 | for key, value in servers.items(): |
667 | service_id, service_type = key | |
11fdf7f2 TL |
668 | if service_type == 'rgw': |
669 | hostname, version = value | |
670 | self.metrics['rgw_metadata'].set( | |
671 | 1, | |
9f95a23c TL |
672 | ('{}.{}'.format(service_type, service_id), |
673 | hostname, version) | |
11fdf7f2 TL |
674 | ) |
675 | elif service_type == 'rbd-mirror': | |
676 | mirror_metadata = self.get_metadata('rbd-mirror', service_id) | |
677 | if mirror_metadata is None: | |
678 | continue | |
679 | mirror_metadata['ceph_daemon'] = '{}.{}'.format(service_type, | |
680 | service_id) | |
681 | self.metrics['rbd_mirror_metadata'].set( | |
682 | 1, (mirror_metadata.get(k, '') | |
683 | for k in RBD_MIRROR_METADATA) | |
684 | ) | |
3efd9988 | 685 | |
28e407b8 AA |
686 | def get_num_objects(self): |
687 | pg_sum = self.get('pg_summary')['pg_stats_sum']['stat_sum'] | |
688 | for obj in NUM_OBJECTS: | |
689 | stat = 'num_objects_{}'.format(obj) | |
91327a77 | 690 | self.metrics[stat].set(pg_sum[stat]) |
28e407b8 | 691 | |
11fdf7f2 TL |
692 | def get_rbd_stats(self): |
693 | # Per RBD image stats is collected by registering a dynamic osd perf | |
694 | # stats query that tells OSDs to group stats for requests associated | |
695 | # with RBD objects by pool, namespace, and image id, which are | |
696 | # extracted from the request object names or other attributes. | |
697 | # The RBD object names have the following prefixes: | |
698 | # - rbd_data.{image_id}. (data stored in the same pool as metadata) | |
699 | # - rbd_data.{pool_id}.{image_id}. (data stored in a dedicated data pool) | |
700 | # - journal_data.{pool_id}.{image_id}. (journal if journaling is enabled) | |
701 | # The pool_id in the object name is the id of the pool with the image | |
702 | # metdata, and should be used in the image spec. If there is no pool_id | |
703 | # in the object name, the image pool is the pool where the object is | |
704 | # located. | |
705 | ||
706 | # Parse rbd_stats_pools option, which is a comma or space separated | |
707 | # list of pool[/namespace] entries. If no namespace is specifed the | |
708 | # stats are collected for every namespace in the pool. | |
709 | pools_string = self.get_localized_module_option('rbd_stats_pools', '') | |
710 | pools = {} | |
711 | for p in [x for x in re.split('[\s,]+', pools_string) if x]: | |
712 | s = p.split('/', 2) | |
713 | pool_name = s[0] | |
714 | if len(s) == 1: | |
715 | # empty set means collect for all namespaces | |
716 | pools[pool_name] = set() | |
717 | continue | |
718 | if pool_name not in pools: | |
719 | pools[pool_name] = set() | |
720 | elif not pools[pool_name]: | |
721 | continue | |
722 | pools[pool_name].add(s[1]) | |
723 | ||
724 | rbd_stats_pools = {} | |
725 | for pool_id in list(self.rbd_stats['pools']): | |
726 | name = self.rbd_stats['pools'][pool_id]['name'] | |
727 | if name not in pools: | |
728 | del self.rbd_stats['pools'][pool_id] | |
729 | else: | |
730 | rbd_stats_pools[name] = \ | |
731 | self.rbd_stats['pools'][pool_id]['ns_names'] | |
732 | ||
733 | pools_refreshed = False | |
734 | if pools: | |
735 | next_refresh = self.rbd_stats['pools_refresh_time'] + \ | |
736 | self.get_localized_module_option( | |
737 | 'rbd_stats_pools_refresh_interval', 300) | |
738 | if rbd_stats_pools != pools or time.time() >= next_refresh: | |
739 | self.refresh_rbd_stats_pools(pools) | |
740 | pools_refreshed = True | |
741 | ||
742 | pool_ids = list(self.rbd_stats['pools']) | |
743 | pool_ids.sort() | |
744 | pool_id_regex = '^(' + '|'.join([str(x) for x in pool_ids]) + ')$' | |
745 | ||
746 | nspace_names = [] | |
747 | for pool_id, pool in self.rbd_stats['pools'].items(): | |
748 | if pool['ns_names']: | |
749 | nspace_names.extend(pool['ns_names']) | |
750 | else: | |
751 | nspace_names = [] | |
752 | break | |
753 | if nspace_names: | |
754 | namespace_regex = '^(' + \ | |
755 | "|".join([re.escape(x) | |
756 | for x in set(nspace_names)]) + ')$' | |
757 | else: | |
758 | namespace_regex = '^(.*)$' | |
759 | ||
760 | if 'query' in self.rbd_stats and \ | |
761 | (pool_id_regex != self.rbd_stats['query']['key_descriptor'][0]['regex'] or | |
762 | namespace_regex != self.rbd_stats['query']['key_descriptor'][1]['regex']): | |
763 | self.remove_osd_perf_query(self.rbd_stats['query_id']) | |
764 | del self.rbd_stats['query_id'] | |
765 | del self.rbd_stats['query'] | |
766 | ||
767 | if not self.rbd_stats['pools']: | |
768 | return | |
769 | ||
770 | counters_info = self.rbd_stats['counters_info'] | |
771 | ||
772 | if 'query_id' not in self.rbd_stats: | |
773 | query = { | |
774 | 'key_descriptor': [ | |
775 | {'type': 'pool_id', 'regex': pool_id_regex}, | |
776 | {'type': 'namespace', 'regex': namespace_regex}, | |
777 | {'type': 'object_name', | |
778 | 'regex': '^(?:rbd|journal)_data\.(?:([0-9]+)\.)?([^.]+)\.'}, | |
779 | ], | |
780 | 'performance_counter_descriptors': list(counters_info), | |
781 | } | |
782 | query_id = self.add_osd_perf_query(query) | |
783 | if query_id is None: | |
784 | self.log.error('failed to add query %s' % query) | |
785 | return | |
786 | self.rbd_stats['query'] = query | |
787 | self.rbd_stats['query_id'] = query_id | |
788 | ||
789 | res = self.get_osd_perf_counters(self.rbd_stats['query_id']) | |
790 | for c in res['counters']: | |
791 | # if the pool id is not found in the object name use id of the | |
792 | # pool where the object is located | |
793 | if c['k'][2][0]: | |
794 | pool_id = int(c['k'][2][0]) | |
795 | else: | |
796 | pool_id = int(c['k'][0][0]) | |
797 | if pool_id not in self.rbd_stats['pools'] and not pools_refreshed: | |
798 | self.refresh_rbd_stats_pools(pools) | |
799 | pools_refreshed = True | |
800 | if pool_id not in self.rbd_stats['pools']: | |
801 | continue | |
802 | pool = self.rbd_stats['pools'][pool_id] | |
803 | nspace_name = c['k'][1][0] | |
804 | if nspace_name not in pool['images']: | |
805 | continue | |
806 | image_id = c['k'][2][1] | |
807 | if image_id not in pool['images'][nspace_name] and \ | |
808 | not pools_refreshed: | |
809 | self.refresh_rbd_stats_pools(pools) | |
810 | pool = self.rbd_stats['pools'][pool_id] | |
811 | pools_refreshed = True | |
812 | if image_id not in pool['images'][nspace_name]: | |
813 | continue | |
814 | counters = pool['images'][nspace_name][image_id]['c'] | |
815 | for i in range(len(c['c'])): | |
816 | counters[i][0] += c['c'][i][0] | |
817 | counters[i][1] += c['c'][i][1] | |
818 | ||
819 | label_names = ("pool", "namespace", "image") | |
820 | for pool_id, pool in self.rbd_stats['pools'].items(): | |
821 | pool_name = pool['name'] | |
822 | for nspace_name, images in pool['images'].items(): | |
823 | for image_id in images: | |
824 | image_name = images[image_id]['n'] | |
825 | counters = images[image_id]['c'] | |
826 | i = 0 | |
827 | for key in counters_info: | |
828 | counter_info = counters_info[key] | |
829 | stattype = self._stattype_to_str(counter_info['type']) | |
830 | labels = (pool_name, nspace_name, image_name) | |
831 | if counter_info['type'] == self.PERFCOUNTER_COUNTER: | |
832 | path = 'rbd_' + key | |
833 | if path not in self.metrics: | |
834 | self.metrics[path] = Metric( | |
835 | stattype, | |
836 | path, | |
837 | counter_info['desc'], | |
838 | label_names, | |
839 | ) | |
840 | self.metrics[path].set(counters[i][0], labels) | |
841 | elif counter_info['type'] == self.PERFCOUNTER_LONGRUNAVG: | |
842 | path = 'rbd_' + key + '_sum' | |
843 | if path not in self.metrics: | |
844 | self.metrics[path] = Metric( | |
845 | stattype, | |
846 | path, | |
847 | counter_info['desc'] + ' Total', | |
848 | label_names, | |
849 | ) | |
850 | self.metrics[path].set(counters[i][0], labels) | |
851 | path = 'rbd_' + key + '_count' | |
852 | if path not in self.metrics: | |
853 | self.metrics[path] = Metric( | |
854 | 'counter', | |
855 | path, | |
856 | counter_info['desc'] + ' Count', | |
857 | label_names, | |
858 | ) | |
859 | self.metrics[path].set(counters[i][1], labels) | |
860 | i += 1 | |
861 | ||
862 | def refresh_rbd_stats_pools(self, pools): | |
863 | self.log.debug('refreshing rbd pools %s' % (pools)) | |
864 | ||
865 | rbd = RBD() | |
866 | counters_info = self.rbd_stats['counters_info'] | |
867 | for pool_name, cfg_ns_names in pools.items(): | |
868 | try: | |
869 | pool_id = self.rados.pool_lookup(pool_name) | |
870 | with self.rados.open_ioctx(pool_name) as ioctx: | |
871 | if pool_id not in self.rbd_stats['pools']: | |
872 | self.rbd_stats['pools'][pool_id] = {'images': {}} | |
873 | pool = self.rbd_stats['pools'][pool_id] | |
874 | pool['name'] = pool_name | |
875 | pool['ns_names'] = cfg_ns_names | |
876 | if cfg_ns_names: | |
877 | nspace_names = list(cfg_ns_names) | |
878 | else: | |
879 | nspace_names = [''] + rbd.namespace_list(ioctx) | |
880 | for nspace_name in pool['images']: | |
881 | if nspace_name not in nspace_names: | |
882 | del pool['images'][nspace_name] | |
883 | for nspace_name in nspace_names: | |
884 | if (nspace_name and | |
885 | not rbd.namespace_exists(ioctx, nspace_name)): | |
886 | self.log.debug('unknown namespace %s for pool %s' % | |
887 | (nspace_name, pool_name)) | |
888 | continue | |
889 | ioctx.set_namespace(nspace_name) | |
890 | if nspace_name not in pool['images']: | |
891 | pool['images'][nspace_name] = {} | |
892 | namespace = pool['images'][nspace_name] | |
893 | images = {} | |
894 | for image_meta in RBD().list2(ioctx): | |
895 | image = {'n': image_meta['name']} | |
896 | image_id = image_meta['id'] | |
897 | if image_id in namespace: | |
898 | image['c'] = namespace[image_id]['c'] | |
899 | else: | |
900 | image['c'] = [[0, 0] for x in counters_info] | |
901 | images[image_id] = image | |
902 | pool['images'][nspace_name] = images | |
903 | except Exception as e: | |
904 | self.log.error('failed listing pool %s: %s' % (pool_name, e)) | |
905 | self.rbd_stats['pools_refresh_time'] = time.time() | |
906 | ||
907 | def shutdown_rbd_stats(self): | |
908 | if 'query_id' in self.rbd_stats: | |
909 | self.remove_osd_perf_query(self.rbd_stats['query_id']) | |
910 | del self.rbd_stats['query_id'] | |
911 | del self.rbd_stats['query'] | |
912 | self.rbd_stats['pools'].clear() | |
913 | ||
c07f9fc5 | 914 | def collect(self): |
91327a77 AA |
915 | # Clear the metrics before scraping |
916 | for k in self.metrics.keys(): | |
917 | self.metrics[k].clear() | |
918 | ||
3efd9988 FG |
919 | self.get_health() |
920 | self.get_df() | |
11fdf7f2 | 921 | self.get_pool_stats() |
94b18763 | 922 | self.get_fs() |
b32b8144 | 923 | self.get_osd_stats() |
3efd9988 | 924 | self.get_quorum_status() |
494da23a | 925 | self.get_mgr_status() |
3efd9988 FG |
926 | self.get_metadata_and_osd_status() |
927 | self.get_pg_status() | |
28e407b8 | 928 | self.get_num_objects() |
3efd9988 | 929 | |
94b18763 | 930 | for daemon, counters in self.get_all_perf_counters().items(): |
3efd9988 | 931 | for path, counter_info in counters.items(): |
28e407b8 | 932 | # Skip histograms, they are represented by long running avgs |
3efd9988 | 933 | stattype = self._stattype_to_str(counter_info['type']) |
3efd9988 FG |
934 | if not stattype or stattype == 'histogram': |
935 | self.log.debug('ignoring %s, type %s' % (path, stattype)) | |
936 | continue | |
937 | ||
81eedcae TL |
938 | path, label_names, labels = self._perfpath_to_path_labels( |
939 | daemon, path) | |
940 | ||
28e407b8 | 941 | # Get the value of the counter |
11fdf7f2 TL |
942 | value = self._perfvalue_to_value( |
943 | counter_info['type'], counter_info['value']) | |
28e407b8 AA |
944 | |
945 | # Represent the long running avgs as sum/count pairs | |
946 | if counter_info['type'] & self.PERFCOUNTER_LONGRUNAVG: | |
947 | _path = path + '_sum' | |
91327a77 AA |
948 | if _path not in self.metrics: |
949 | self.metrics[_path] = Metric( | |
950 | stattype, | |
951 | _path, | |
952 | counter_info['description'] + ' Total', | |
81eedcae | 953 | label_names, |
91327a77 | 954 | ) |
81eedcae | 955 | self.metrics[_path].set(value, labels) |
28e407b8 AA |
956 | |
957 | _path = path + '_count' | |
91327a77 AA |
958 | if _path not in self.metrics: |
959 | self.metrics[_path] = Metric( | |
960 | 'counter', | |
961 | _path, | |
962 | counter_info['description'] + ' Count', | |
81eedcae | 963 | label_names, |
91327a77 | 964 | ) |
81eedcae | 965 | self.metrics[_path].set(counter_info['count'], labels,) |
28e407b8 | 966 | else: |
91327a77 AA |
967 | if path not in self.metrics: |
968 | self.metrics[path] = Metric( | |
969 | stattype, | |
970 | path, | |
971 | counter_info['description'], | |
81eedcae | 972 | label_names, |
91327a77 | 973 | ) |
81eedcae | 974 | self.metrics[path].set(value, labels) |
91327a77 | 975 | |
11fdf7f2 TL |
976 | self.get_rbd_stats() |
977 | ||
91327a77 AA |
978 | # Return formatted metrics and clear no longer used data |
979 | _metrics = [m.str_expfmt() for m in self.metrics.values()] | |
980 | for k in self.metrics.keys(): | |
981 | self.metrics[k].clear() | |
982 | ||
983 | return ''.join(_metrics) + '\n' | |
c07f9fc5 | 984 | |
11fdf7f2 TL |
985 | def get_file_sd_config(self): |
986 | servers = self.list_servers() | |
987 | targets = [] | |
988 | for server in servers: | |
989 | hostname = server.get('hostname', '') | |
990 | for service in server.get('services', []): | |
991 | if service['type'] != 'mgr': | |
992 | continue | |
993 | id_ = service['id'] | |
994 | # get port for prometheus module at mgr with id_ | |
995 | # TODO use get_config_prefix or get_config here once | |
996 | # https://github.com/ceph/ceph/pull/20458 is merged | |
997 | result = CommandResult("") | |
998 | global_instance().send_command( | |
999 | result, "mon", '', | |
1000 | json.dumps({ | |
1001 | "prefix": "config-key get", | |
1002 | 'key': "config/mgr/mgr/prometheus/{}/server_port".format(id_), | |
1003 | }), | |
1004 | "") | |
1005 | r, outb, outs = result.wait() | |
1006 | if r != 0: | |
1007 | global_instance().log.error("Failed to retrieve port for mgr {}: {}".format(id_, outs)) | |
1008 | targets.append('{}:{}'.format(hostname, DEFAULT_PORT)) | |
1009 | else: | |
1010 | port = json.loads(outb) | |
1011 | targets.append('{}:{}'.format(hostname, port)) | |
1012 | ||
1013 | ret = [ | |
1014 | { | |
1015 | "targets": targets, | |
1016 | "labels": {} | |
1017 | } | |
1018 | ] | |
1019 | return 0, json.dumps(ret), "" | |
1020 | ||
1021 | def self_test(self): | |
1022 | self.collect() | |
1023 | self.get_file_sd_config() | |
1024 | ||
1025 | def handle_command(self, inbuf, cmd): | |
1026 | if cmd['prefix'] == 'prometheus file_sd_config': | |
1027 | return self.get_file_sd_config() | |
3efd9988 FG |
1028 | else: |
1029 | return (-errno.EINVAL, '', | |
1030 | "Command not found '{0}'".format(cmd['prefix'])) | |
c07f9fc5 FG |
1031 | |
1032 | def serve(self): | |
1033 | ||
1034 | class Root(object): | |
1035 | ||
1036 | # collapse everything to '/' | |
1037 | def _cp_dispatch(self, vpath): | |
1038 | cherrypy.request.path = '' | |
1039 | return self | |
1040 | ||
c07f9fc5 FG |
1041 | @cherrypy.expose |
1042 | def index(self): | |
3efd9988 FG |
1043 | return '''<!DOCTYPE html> |
1044 | <html> | |
9f95a23c TL |
1045 | <head><title>Ceph Exporter</title></head> |
1046 | <body> | |
1047 | <h1>Ceph Exporter</h1> | |
1048 | <p><a href='/metrics'>Metrics</a></p> | |
1049 | </body> | |
3efd9988 FG |
1050 | </html>''' |
1051 | ||
1052 | @cherrypy.expose | |
1053 | def metrics(self): | |
91327a77 AA |
1054 | instance = global_instance() |
1055 | # Lock the function execution | |
1056 | try: | |
1057 | instance.collect_lock.acquire() | |
1058 | return self._metrics(instance) | |
1059 | finally: | |
1060 | instance.collect_lock.release() | |
1061 | ||
11fdf7f2 TL |
1062 | @staticmethod |
1063 | def _metrics(instance): | |
9f95a23c TL |
1064 | # Return cached data if available and collected before the |
1065 | # cache times out | |
11fdf7f2 | 1066 | if instance.collect_cache and time.time() - instance.collect_time < instance.collect_timeout: |
94b18763 | 1067 | cherrypy.response.headers['Content-Type'] = 'text/plain' |
91327a77 AA |
1068 | return instance.collect_cache |
1069 | ||
1070 | if instance.have_mon_connection(): | |
1071 | instance.collect_cache = None | |
1072 | instance.collect_time = time.time() | |
1073 | instance.collect_cache = instance.collect() | |
1074 | cherrypy.response.headers['Content-Type'] = 'text/plain' | |
1075 | return instance.collect_cache | |
94b18763 FG |
1076 | else: |
1077 | raise cherrypy.HTTPError(503, 'No MON connection') | |
c07f9fc5 | 1078 | |
91327a77 | 1079 | # Make the cache timeout for collecting configurable |
eafe8130 TL |
1080 | self.collect_timeout = float(self.get_localized_module_option( |
1081 | 'scrape_interval', 5.0)) | |
91327a77 | 1082 | |
11fdf7f2 | 1083 | server_addr = self.get_localized_module_option( |
494da23a | 1084 | 'server_addr', get_default_addr()) |
11fdf7f2 TL |
1085 | server_port = self.get_localized_module_option( |
1086 | 'server_port', DEFAULT_PORT) | |
c07f9fc5 FG |
1087 | self.log.info( |
1088 | "server_addr: %s server_port: %s" % | |
1089 | (server_addr, server_port) | |
1090 | ) | |
c07f9fc5 | 1091 | |
94b18763 FG |
1092 | # Publish the URI that others may use to access the service we're |
1093 | # about to start serving | |
1094 | self.set_uri('http://{0}:{1}/'.format( | |
eafe8130 | 1095 | socket.getfqdn() if server_addr in ['::', '0.0.0.0'] else server_addr, |
94b18763 FG |
1096 | server_port |
1097 | )) | |
1098 | ||
c07f9fc5 FG |
1099 | cherrypy.config.update({ |
1100 | 'server.socket_host': server_addr, | |
3efd9988 | 1101 | 'server.socket_port': int(server_port), |
c07f9fc5 FG |
1102 | 'engine.autoreload.on': False |
1103 | }) | |
1104 | cherrypy.tree.mount(Root(), "/") | |
94b18763 | 1105 | self.log.info('Starting engine...') |
c07f9fc5 | 1106 | cherrypy.engine.start() |
94b18763 | 1107 | self.log.info('Engine started.') |
91327a77 AA |
1108 | # wait for the shutdown event |
1109 | self.shutdown_event.wait() | |
1110 | self.shutdown_event.clear() | |
1111 | cherrypy.engine.stop() | |
1112 | self.log.info('Engine stopped.') | |
11fdf7f2 | 1113 | self.shutdown_rbd_stats() |
94b18763 FG |
1114 | |
1115 | def shutdown(self): | |
1116 | self.log.info('Stopping engine...') | |
91327a77 | 1117 | self.shutdown_event.set() |
94b18763 FG |
1118 | |
1119 | ||
class StandbyModule(MgrStandbyModule):
    """Exporter variant that runs on standby mgr daemons.

    Serves a landing page redirecting scrapers to the active mgr's
    exporter URI and an empty /metrics endpoint, so every mgr answers
    HTTP even when it holds no data.
    """

    def __init__(self, *args, **kwargs):
        super(StandbyModule, self).__init__(*args, **kwargs)
        # Set by shutdown() to unblock the serve() loop.
        self.shutdown_event = threading.Event()

    def serve(self):
        """Run the standby HTTP server until shutdown is requested."""
        server_addr = self.get_localized_module_option(
            'server_addr', get_default_addr())
        server_port = self.get_localized_module_option(
            'server_port', DEFAULT_PORT)
        self.log.info("server_addr: %s server_port: %s" %
                      (server_addr, server_port))
        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })

        # Capture the module so the nested handlers can reach it.
        module = self

        class Root(object):
            @cherrypy.expose
            def index(self):
                # Point visitors at the active mgr's metrics endpoint.
                active_uri = module.get_active_uri()
                return '''<!DOCTYPE html>
<html>
<head><title>Ceph Exporter</title></head>
<body>
<h1>Ceph Exporter</h1>
<p><a href='{}metrics'>Metrics</a></p>
</body>
</html>'''.format(active_uri)

            @cherrypy.expose
            def metrics(self):
                # A standby holds no metrics; reply with an empty body.
                cherrypy.response.headers['Content-Type'] = 'text/plain'
                return ''

        cherrypy.tree.mount(Root(), '/', {})
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info('Engine started.')
        # Wait for shutdown event
        self.shutdown_event.wait()
        self.shutdown_event.clear()
        cherrypy.engine.stop()
        self.log.info('Engine stopped.')

    def shutdown(self):
        """Signal serve() to stop the standby HTTP server."""
        self.log.info("Stopping engine...")
        self.shutdown_event.set()
        self.log.info("Stopped engine")