]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | """ |
2 | Telemetry module for ceph-mgr | |
3 | ||
4 | Collect statistics from the Ceph cluster and send them back to the Ceph project | |
5 | when the user has opted in | |
6 | """ | |
7 | import errno | |
8 | import json | |
9 | import re | |
10 | import requests | |
11 | import uuid | |
12 | import time | |
13 | from datetime import datetime | |
14 | from threading import Event | |
15 | from collections import defaultdict | |
16 | ||
17 | from mgr_module import MgrModule | |
18 | ||
19 | ||
class Module(MgrModule):
    """Telemetry module for ceph-mgr.

    Compiles a report describing the cluster (monitors, OSDs, pools,
    filesystems, usage, services and crash dumps) and, while the
    ``enabled`` option is set, uploads it to the configured ``url`` at
    most once every ``interval`` hours.
    """
    config = dict()

    # Seconds to wait on the telemetry endpoint before giving up.  Without
    # a timeout ``requests`` may block forever, which would stall both the
    # serve thread and the ``telemetry send`` command.
    SEND_TIMEOUT = 60

    # Daemon metadata fields tallied for both OSDs and monitors.
    metadata_keys = [
        "arch",
        "ceph_version",
        "os",
        "cpu",
        "kernel_description",
        "kernel_version",
        "distro_description",
        "distro"
    ]

    MODULE_OPTIONS = [
        {
            'name': 'url',
            'type': 'str',
            'default': 'https://telemetry.ceph.com/report'
        },
        {
            'name': 'enabled',
            'type': 'bool',
            'default': False
        },
        {
            'name': 'leaderboard',
            'type': 'bool',
            'default': False
        },
        {
            'name': 'description',
            'type': 'str',
            'default': None
        },
        {
            'name': 'contact',
            'type': 'str',
            'default': None
        },
        {
            'name': 'organization',
            'type': 'str',
            'default': None
        },
        {
            'name': 'proxy',
            'type': 'str',
            'default': None
        },
        {
            'name': 'interval',
            'type': 'int',
            'default': 24,
            'min': 8
        }
    ]

    COMMANDS = [
        {
            "cmd": "telemetry status",
            "desc": "Show current configuration",
            "perm": "r"
        },
        {
            "cmd": "telemetry send",
            "desc": "Force sending data to Ceph telemetry",
            "perm": "rw"
        },
        {
            "cmd": "telemetry show",
            "desc": "Show last report or report to be sent",
            "perm": "r"
        },
        {
            "cmd": "telemetry on",
            "desc": "Enable telemetry reports from this cluster",
            "perm": "rw",
        },
        {
            "cmd": "telemetry off",
            "desc": "Disable telemetry reports from this cluster",
            "perm": "rw",
        },
    ]

    @property
    def config_keys(self):
        """Return a dict mapping every module option name to its default."""
        return {o['name']: o.get('default', None)
                for o in self.MODULE_OPTIONS}

    def __init__(self, *args, **kwargs):
        super(Module, self).__init__(*args, **kwargs)
        self.event = Event()       # set by shutdown() to wake serve()
        self.run = False           # serve() loops while this is True
        self.last_upload = None    # epoch seconds of last successful upload
        self.last_report = dict()  # most recently compiled report
        self.report_id = None      # persistent id, populated by load()

    def config_notify(self):
        """Copy the current value of every module option onto ``self``."""
        for opt in self.MODULE_OPTIONS:
            name = opt['name']
            setattr(self, name, self.get_module_option(name))
            self.log.debug(' %s = %s', name, getattr(self, name))

    @staticmethod
    def parse_timestamp(timestamp):
        """Parse a ``YYYY-MM-DD HH:MM:SS.ffffff`` string into a datetime.

        Raises ValueError (from ``strptime``) if the string does not match.
        """
        return datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S.%f')

    def load(self):
        """Restore persisted state, creating a report id on first use."""
        self.last_upload = self.get_store('last_upload', None)
        if self.last_upload is not None:
            # Stored as a string by serve(); compared as an int there.
            self.last_upload = int(self.last_upload)

        self.report_id = self.get_store('report_id', None)
        if self.report_id is None:
            self.report_id = str(uuid.uuid4())
            self.set_store('report_id', self.report_id)

    def _count_metadata(self, svc_type, svc_ids, keys):
        """Tally how often each metadata value occurs across daemons.

        :param svc_type: daemon type passed to ``get_metadata``
        :param svc_ids: iterable of daemon ids of that type
        :param keys: metadata fields to tally; others are ignored
        :return: dict of ``key -> defaultdict(value -> count)``; every
                 requested key is present even when nothing was counted
        """
        metadata = dict((key, defaultdict(int)) for key in keys)

        for svc_id in svc_ids:
            daemon_md = self.get_metadata(svc_type, svc_id)
            if not daemon_md:
                # Nothing reported for this daemon; skip it rather than
                # failing the whole report.
                continue
            for k, v in daemon_md.items():
                if k in metadata:
                    metadata[k][v] += 1

        return metadata

    def gather_osd_metadata(self, osd_map):
        """Tally selected metadata fields over all OSDs in ``osd_map``."""
        keys = ["osd_objectstore", "rotational"] + self.metadata_keys
        osd_ids = [str(osd['osd']) for osd in osd_map['osds']]
        return self._count_metadata('osd', osd_ids, keys)

    def gather_mon_metadata(self, mon_map):
        """Tally selected metadata fields over all monitors in ``mon_map``."""
        mon_names = [mon['name'] for mon in mon_map['mons']]
        return self._count_metadata('mon', mon_names,
                                    list(self.metadata_keys))

    def gather_crashinfo(self):
        """Return a list of crash reports obtained from the crash module.

        Each entry is the decoded JSON returned by ``do_info`` with the
        ``utsname_hostname`` field removed.  An empty list is returned when
        the crash listing cannot be obtained; individual crashes whose info
        cannot be fetched are skipped.
        """
        crashlist = list()
        # Named ``rc`` so the imported ``errno`` module is not shadowed.
        rc, crashids, _err = self.remote('crash', 'do_ls', '', '')
        if rc:
            return crashlist
        for crashid in crashids.split():
            cmd = {'id': crashid}
            rc, crashinfo, _err = self.remote('crash', 'do_info', cmd, '')
            if rc:
                continue
            crash = json.loads(crashinfo)
            # Drop the hostname; tolerate reports that do not carry it.
            crash.pop('utsname_hostname', None)
            crashlist.append(crash)
        return crashlist

    def compile_report(self):
        """Build and return the telemetry report as a dict.

        The report is assembled from the mgr's ``mon_map``, ``osd_map``,
        ``service_map``, ``fs_map`` and ``df`` views plus the crash list.
        """
        report = {
            'leaderboard': bool(self.leaderboard),
            'report_version': 1,
            'report_timestamp': datetime.utcnow().isoformat()
        }

        for option in ['description', 'contact', 'organization']:
            report[option] = getattr(self, option)

        mon_map = self.get('mon_map')
        osd_map = self.get('osd_map')
        service_map = self.get('service_map')
        fs_map = self.get('fs_map')
        df = self.get('df')

        report['report_id'] = self.report_id
        report['created'] = \
            self.parse_timestamp(mon_map['created']).isoformat()

        report['mon'] = {
            'count': len(mon_map['mons']),
            'features': mon_map['features']
        }

        pools = osd_map['pools']
        num_pg = sum(pool['pg_num'] for pool in pools)
        report['pools'] = [
            {
                'pool': pool['pool'],
                'type': pool['type'],
                'pg_num': pool['pg_num'],
                'pgp_num': pool['pg_placement_num'],
                'size': pool['size'],
                'min_size': pool['min_size'],
                'crush_rule': pool['crush_rule']
            }
            for pool in pools
        ]

        report['osd'] = {
            'count': len(osd_map['osds']),
            'require_osd_release': osd_map['require_osd_release'],
            'require_min_compat_client':
                osd_map['require_min_compat_client']
        }

        report['fs'] = {
            'count': len(fs_map['filesystems'])
        }

        report['metadata'] = dict()
        report['metadata']['osd'] = self.gather_osd_metadata(osd_map)
        report['metadata']['mon'] = self.gather_mon_metadata(mon_map)

        report['usage'] = {
            'pools': len(df['pools']),
            # Key was previously misspelled as 'pg_num:' (stray colon).
            'pg_num': num_pg,
            'total_used_bytes': df['stats']['total_used_bytes'],
            'total_bytes': df['stats']['total_bytes'],
            'total_avail_bytes': df['stats']['total_avail_bytes']
        }

        # One entry per key of the service map.
        report['services'] = defaultdict(int)
        for key in service_map['services']:
            report['services'][key] += 1

        report['crashes'] = self.gather_crashinfo()

        return report

    def send(self, report):
        """Upload ``report`` as JSON via HTTP PUT to the configured URL.

        :param report: JSON-serialisable report dict
        :return: the ``requests`` response object; a non-OK response is
                 logged here but still returned to the caller
        :raises requests.exceptions.RequestException: on transport
                 failures, including exceeding ``SEND_TIMEOUT``
        """
        self.log.info('Upload report to: %s', self.url)
        proxies = dict()
        if self.proxy:
            self.log.info('Using HTTP(S) proxy: %s', self.proxy)
            proxies['http'] = self.proxy
            proxies['https'] = self.proxy

        resp = requests.put(url=self.url, json=report, proxies=proxies,
                            timeout=self.SEND_TIMEOUT)
        if not resp.ok:
            self.log.error("Report send failed: %d %s %s",
                           resp.status_code, resp.reason, resp.text)
        return resp

    def handle_command(self, inbuf, command):
        """Dispatch a ``telemetry ...`` command.

        :param inbuf: unused
        :param command: dict whose ``prefix`` entry names the command
        :return: ``(retval, stdout, stderr)`` tuple
        """
        prefix = command['prefix']

        if prefix == 'telemetry status':
            r = {}
            for opt in self.MODULE_OPTIONS:
                r[opt['name']] = getattr(self, opt['name'])
            return 0, json.dumps(r, indent=4), ''

        if prefix == 'telemetry on':
            self.set_module_option('enabled', True)
            return 0, '', ''

        if prefix == 'telemetry off':
            self.set_module_option('enabled', False)
            return 0, '', ''

        if prefix == 'telemetry send':
            self.last_report = self.compile_report()
            try:
                resp = self.send(self.last_report)
            except requests.exceptions.RequestException as e:
                # Report transport errors to the caller instead of letting
                # the exception escape the command handler.
                return 1, '', 'Failed to send report to %s: %s' % (
                    self.url, e)
            if resp.ok:
                return 0, 'Report sent to {0}'.format(self.url), ''
            return 1, '', 'Failed to send report to %s: %d %s %s' % (
                self.url,
                resp.status_code,
                resp.reason,
                resp.text
            )

        if prefix == 'telemetry show':
            report = self.last_report
            if not report:
                report = self.compile_report()
            return 0, json.dumps(report, indent=4), ''

        return (-errno.EINVAL, '',
                "Command not found '{0}'".format(prefix))

    def self_test(self):
        """Compile a report and sanity-check it.

        :return: True when the report looks sane
        :raises RuntimeError: if the report is empty or lacks a report_id
        """
        report = self.compile_report()
        if len(report) == 0:
            raise RuntimeError('Report is empty')

        if 'report_id' not in report:
            raise RuntimeError('report_id not found in report')

        return True

    def shutdown(self):
        """Ask serve() to stop and wake it from any pending wait."""
        self.run = False
        self.event.set()

    def _compile_and_upload(self, now):
        """Compile a fresh report and upload it, recording success.

        Failures are logged and swallowed so the serve loop keeps running.
        If compilation fails nothing is uploaded: the previously stored
        report would be stale (or empty on first run).

        :param now: epoch seconds to record as the upload time on success
        """
        try:
            self.last_report = self.compile_report()
        except Exception:
            self.log.exception('Exception while compiling report:')
            return

        try:
            resp = self.send(self.last_report)
            # self.send logs on failure; only update last_upload
            # if we succeed
            if resp.ok:
                self.last_upload = now
                self.set_store('last_upload', str(now))
        except Exception:
            self.log.exception('Exception while sending report:')

    def serve(self):
        """Main loop: periodically compile and upload the report.

        Runs until shutdown() clears ``self.run``.  While telemetry is
        disabled the loop only re-checks the setting every 30 minutes.
        """
        self.load()
        self.config_notify()
        self.run = True

        self.log.debug('Waiting for mgr to warm up')
        self.event.wait(10)

        while self.run:
            if not self.enabled:
                self.log.info('Not sending report until configured to do so')
                self.event.wait(1800)
                continue

            now = int(time.time())
            interval_secs = self.interval * 3600
            if not self.last_upload or \
                    (now - self.last_upload) > interval_secs:
                self.log.info('Compiling and sending report to %s',
                              self.url)
                self._compile_and_upload(now)
            else:
                self.log.info(
                    'Interval for sending new report has not expired')

            sleep = 3600
            self.log.debug('Sleeping for %d seconds', sleep)
            self.event.wait(sleep)

    @staticmethod
    def can_run():
        """Report that this module can always run."""
        return True, ''