# Source: git.proxmox.com Git - ceph.git/blob - ceph/src/pybind/mgr/zabbix/module.py
"""
Zabbix module for ceph-mgr

Collect statistics from Ceph cluster and every X seconds send data to a Zabbix
server using the zabbix_sender executable.
"""
9 from subprocess
import Popen
, PIPE
10 from threading
import Event
11 from mgr_module
import MgrModule
def avg(data):
    """Return the arithmetic mean of ``data`` as a float.

    :param data: non-empty sequence of numbers
    :raises ZeroDivisionError: if ``data`` is empty
    """
    # float() forces true division even for integer inputs on Python 2.
    return sum(data) / float(len(data))
class ZabbixSender(object):
    """Thin wrapper that pushes key/value items through ``zabbix_sender``."""

    def __init__(self, sender, host, port, log):
        """Store the connection settings used by :meth:`send`.

        :param sender: path of the zabbix_sender executable
        :param host: Zabbix server address, passed to ``-z``
        :param port: Zabbix server port, passed to ``-p``
        :param log: logger-like object providing ``debug()``
        """
        self.sender = sender
        self.host = host
        self.port = port
        self.log = log

    def send(self, hostname, data):
        """Send every item of ``data`` to Zabbix on behalf of ``hostname``.

        Each item is written to the sender's stdin as one line of the form
        ``<hostname> ceph.<key> <value>``.

        :param hostname: host name reported to Zabbix (``-s``)
        :param data: mapping of item key to value
        :raises RuntimeError: if the sender process exits non-zero
        """
        # Nothing to report: do not spawn a process for an empty payload.
        if not data:
            return

        cmd = [self.sender, '-z', self.host, '-p', str(self.port), '-s',
               hostname, '-vv', '-i', '-']
        self.log.debug('Executing: %s', cmd)

        payload = ''.join('{0} ceph.{1} {2}\n'.format(hostname, key, value)
                          for key, value in data.items())
        # The pipes are binary; encode text (Python 3) and leave an already
        # byte-typed str (Python 2) untouched.
        if not isinstance(payload, bytes):
            payload = payload.encode('utf-8')

        proc = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        # communicate() feeds stdin while draining stdout/stderr, which
        # avoids the deadlock possible with a bare stdin.write() when the
        # child fills its output pipe (-vv is chatty).
        stdout, stderr = proc.communicate(payload)

        if proc.returncode != 0:
            raise RuntimeError('%s exited non-zero: %s' % (
                self.sender, stderr.decode('utf-8', 'replace')))

        self.log.debug('Zabbix Sender: %s',
                       stdout.decode('utf-8', 'replace').rstrip())
50 class Module(MgrModule
):
53 ceph_health_mapping
= {'HEALTH_OK': 0, 'HEALTH_WARN': 1, 'HEALTH_ERR': 2}
@property
def config_keys(self):
    """Map each module option name to its default value.

    Entries of ``MODULE_OPTIONS`` that carry no ``'default'`` map to None.
    This is a property because the class reads it as an attribute
    (``self.config_keys.items()`` / ``self.config_keys.keys()``).
    """
    return dict((opt['name'], opt.get('default', None))
                for opt in self.MODULE_OPTIONS)
62 'name': 'zabbix_sender',
63 'default': '/usr/bin/zabbix_sender'
66 'name': 'zabbix_host',
70 'name': 'zabbix_port',
87 "cmd": "zabbix config-set name=key,type=CephString "
88 "name=value,type=CephString",
89 "desc": "Set a configuration value",
93 "cmd": "zabbix config-show",
94 "desc": "Show current configuration",
99 "desc": "Force sending data to Zabbix",
def __init__(self, *args, **kwargs):
    """Initialise the base manager module and the wake-up event."""
    super(Module, self).__init__(*args, **kwargs)
    # The serve loop sleeps on self.event.wait(interval); create the event
    # here so it exists before that loop runs.
    self.event = Event()
def init_module_config(self):
    """Record the cluster fsid and load every option into the in-memory config.

    Each key of ``self.config_keys`` is looked up with
    ``get_module_option`` (falling back to the option's default) and passed
    through ``set_config_option``, so any error that method raises
    propagates to the caller.
    """
    self.fsid = self.get('mon_map')['fsid']
    self.log.debug('Found Ceph fsid %s', self.fsid)

    for key, default in self.config_keys.items():
        self.set_config_option(key, self.get_module_option(key, default))
def set_config_option(self, option, value):
    """Validate ``value`` and store it in the in-memory configuration.

    :param option: option name; must be a key of ``self.config_keys``
    :param value: new value; coerced to int for ``zabbix_port`` and
        ``interval``
    :returns: True once the value has been stored
    :raises RuntimeError: for an unknown option, a non-integer port or
        interval, or an interval below 10 seconds
    """
    if option not in self.config_keys:
        raise RuntimeError('{0} is a unknown configuration '
                           'option'.format(option))

    if option in ('zabbix_port', 'interval'):
        try:
            value = int(value)
        except (ValueError, TypeError):
            raise RuntimeError('invalid {0} configured. Please specify '
                               'a valid integer'.format(option))

    # value is already an int here when option is 'interval'.
    if option == 'interval' and value < 10:
        raise RuntimeError('interval should be set to at least 10 seconds')

    self.log.debug('Setting in-memory config option %s to: %s', option,
                   value)
    self.config[option] = value
    # Callers test the result to decide whether to persist the option.
    return True
135 def get_pg_stats(self
):
138 pg_states
= ['active', 'peering', 'clean', 'scrubbing', 'undersized',
139 'backfilling', 'recovering', 'degraded', 'inconsistent',
140 'remapped', 'backfill_toofull', 'wait_backfill',
143 for state
in pg_states
:
144 stats
['num_pg_{0}'.format(state
)] = 0
146 pg_status
= self
.get('pg_status')
148 stats
['num_pg'] = pg_status
['num_pgs']
150 for state
in pg_status
['pgs_by_state']:
151 states
= state
['state_name'].split('+')
153 key
= 'num_pg_{0}'.format(s
)
155 stats
[key
] += state
['count']
162 health
= json
.loads(self
.get('health')['json'])
163 # 'status' is luminous+, 'overall_status' is legacy mode.
164 data
['overall_status'] = health
.get('status',
165 health
.get('overall_status'))
166 data
['overall_status_int'] = \
167 self
.ceph_health_mapping
.get(data
['overall_status'])
169 mon_status
= json
.loads(self
.get('mon_status')['json'])
170 data
['num_mon'] = len(mon_status
['monmap']['mons'])
173 data
['num_pools'] = len(df
['pools'])
174 data
['total_used_bytes'] = df
['stats']['total_used_bytes']
175 data
['total_bytes'] = df
['stats']['total_bytes']
176 data
['total_avail_bytes'] = df
['stats']['total_avail_bytes']
183 for pool
in df
['pools']:
184 wr_ops
+= pool
['stats']['wr']
185 rd_ops
+= pool
['stats']['rd']
186 wr_bytes
+= pool
['stats']['wr_bytes']
187 rd_bytes
+= pool
['stats']['rd_bytes']
189 data
['wr_ops'] = wr_ops
190 data
['rd_ops'] = rd_ops
191 data
['wr_bytes'] = wr_bytes
192 data
['rd_bytes'] = rd_bytes
194 osd_map
= self
.get('osd_map')
195 data
['num_osd'] = len(osd_map
['osds'])
196 data
['osd_nearfull_ratio'] = osd_map
['nearfull_ratio']
197 data
['osd_full_ratio'] = osd_map
['full_ratio']
198 data
['osd_backfillfull_ratio'] = osd_map
['backfillfull_ratio']
200 data
['num_pg_temp'] = len(osd_map
['pg_temp'])
204 for osd
in osd_map
['osds']:
211 data
['num_osd_up'] = num_up
212 data
['num_osd_in'] = num_in
216 osd_apply_latency_ns
= list()
217 osd_commit_latency_ns
= list()
219 osd_stats
= self
.get('osd_stats')
220 for osd
in osd_stats
['osd_stats']:
223 osd_fill
.append((float(osd
['kb_used']) / float(osd
['kb'])) * 100)
224 osd_pgs
.append(osd
['num_pgs'])
225 osd_apply_latency_ns
.append(osd
['perf_stat']['apply_latency_ns'])
226 osd_commit_latency_ns
.append(osd
['perf_stat']['commit_latency_ns'])
229 data
['osd_max_fill'] = max(osd_fill
)
230 data
['osd_min_fill'] = min(osd_fill
)
231 data
['osd_avg_fill'] = avg(osd_fill
)
232 data
['osd_max_pgs'] = max(osd_pgs
)
233 data
['osd_min_pgs'] = min(osd_pgs
)
234 data
['osd_avg_pgs'] = avg(osd_pgs
)
239 data
['osd_latency_apply_max'] = max(osd_apply_latency_ns
) / 1000000.0 # ns -> ms
240 data
['osd_latency_apply_min'] = min(osd_apply_latency_ns
) / 1000000.0 # ns -> ms
241 data
['osd_latency_apply_avg'] = avg(osd_apply_latency_ns
) / 1000000.0 # ns -> ms
243 data
['osd_latency_commit_max'] = max(osd_commit_latency_ns
) / 1000000.0 # ns -> ms
244 data
['osd_latency_commit_min'] = min(osd_commit_latency_ns
) / 1000000.0 # ns -> ms
245 data
['osd_latency_commit_avg'] = avg(osd_commit_latency_ns
) / 1000000.0 # ns -> ms
249 data
.update(self
.get_pg_stats())
254 data
= self
.get_data()
256 identifier
= self
.config
['identifier']
257 if identifier
is None or len(identifier
) == 0:
258 identifier
= 'ceph-{0}'.format(self
.fsid
)
260 if not self
.config
['zabbix_host']:
261 self
.log
.error('Zabbix server not set, please configure using: '
262 'ceph zabbix config-set zabbix_host <zabbix_host>')
263 self
.set_health_checks({
264 'MGR_ZABBIX_NO_SERVER': {
265 'severity': 'warning',
266 'summary': 'No Zabbix server configured',
267 'detail': ['Configuration value zabbix_host not configured']
274 'Sending data to Zabbix server %s as host/identifier %s',
275 self
.config
['zabbix_host'], identifier
)
278 zabbix
= ZabbixSender(self
.config
['zabbix_sender'],
279 self
.config
['zabbix_host'],
280 self
.config
['zabbix_port'], self
.log
)
282 zabbix
.send(identifier
, data
)
283 self
.set_health_checks(dict())
285 except Exception as exc
:
286 self
.log
.error('Exception when sending: %s', exc
)
287 self
.set_health_checks({
288 'MGR_ZABBIX_SEND_FAILED': {
289 'severity': 'warning',
290 'summary': 'Failed to send data to Zabbix',
297 def handle_command(self
, inbuf
, command
):
298 if command
['prefix'] == 'zabbix config-show':
299 return 0, json
.dumps(self
.config
), ''
300 elif command
['prefix'] == 'zabbix config-set':
302 value
= command
['value']
304 return -errno
.EINVAL
, '', 'Value should not be empty or None'
306 self
.log
.debug('Setting configuration option %s to %s', key
, value
)
307 if self
.set_config_option(key
, value
):
308 self
.set_module_option(key
, value
)
309 return 0, 'Configuration option {0} updated'.format(key
), ''
312 'Failed to update configuration option {0}'.format(key
), ''
314 elif command
['prefix'] == 'zabbix send':
316 return 0, 'Sending data to Zabbix', ''
318 return 1, 'Failed to send data to Zabbix', ''
320 return (-errno
.EINVAL
, '',
321 "Command not found '{0}'".format(command
['prefix']))
324 self
.log
.info('Stopping zabbix')
329 self
.log
.info('Zabbix module starting up')
332 self
.init_module_config()
335 self
.log
.debug('Waking up for new iteration')
339 except Exception as exc
:
340 # Shouldn't happen, but let's log it and retry next interval,
341 # rather than dying completely.
342 self
.log
.exception("Unexpected error during send():")
344 interval
= self
.config
['interval']
345 self
.log
.debug('Sleeping for %d seconds', interval
)
346 self
.event
.wait(interval
)
349 data
= self
.get_data()
351 if data
['overall_status'] not in self
.ceph_health_mapping
:
352 raise RuntimeError('No valid overall_status found in data')
354 int(data
['overall_status_int'])
356 if data
['num_mon'] < 1:
357 raise RuntimeError('num_mon is smaller than 1')