]> git.proxmox.com Git - ceph.git/blame - ceph/src/pybind/mgr/telemetry/module.py
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / pybind / mgr / telemetry / module.py
CommitLineData
11fdf7f2
TL
1"""
Telemetry module for ceph-mgr

Collect statistics from the Ceph cluster and send them back to the Ceph
project when the user has opted in.
6"""
7import errno
8import json
9import re
10import requests
11import uuid
12import time
13from datetime import datetime
14from threading import Event
15from collections import defaultdict
16
17from mgr_module import MgrModule
18
19
class Module(MgrModule):
    """ceph-mgr module that compiles a cluster report and uploads it.

    The report is built from the mon/osd/fs/service maps, `df` statistics,
    per-daemon metadata counts and crash dumps (see `compile_report`).  It is
    uploaded with an HTTP PUT to the configured `url`, but only while the
    `enabled` option is true.
    """

    config = dict()

    # Seconds to wait on the telemetry endpoint before giving up.  Without a
    # timeout `requests` waits forever, which would stall the serve() thread.
    UPLOAD_TIMEOUT = 60

    # Daemon metadata fields whose value distribution is added to the report.
    metadata_keys = [
        "arch",
        "ceph_version",
        "os",
        "cpu",
        "kernel_description",
        "kernel_version",
        "distro_description",
        "distro"
    ]

    MODULE_OPTIONS = [
        {
            'name': 'url',
            'type': 'str',
            'default': 'https://telemetry.ceph.com/report'
        },
        {
            'name': 'enabled',
            'type': 'bool',
            'default': False
        },
        {
            'name': 'leaderboard',
            'type': 'bool',
            'default': False
        },
        {
            'name': 'description',
            'type': 'str',
            'default': None
        },
        {
            'name': 'contact',
            'type': 'str',
            'default': None
        },
        {
            'name': 'organization',
            'type': 'str',
            'default': None
        },
        {
            'name': 'proxy',
            'type': 'str',
            'default': None
        },
        {
            # Hours between uploads; serve() multiplies this by 3600.
            'name': 'interval',
            'type': 'int',
            'default': 24,
            'min': 8
        }
    ]

    COMMANDS = [
        {
            "cmd": "telemetry status",
            "desc": "Show current configuration",
            "perm": "r"
        },
        {
            "cmd": "telemetry send",
            "desc": "Force sending data to Ceph telemetry",
            "perm": "rw"
        },
        {
            "cmd": "telemetry show",
            "desc": "Show last report or report to be sent",
            "perm": "r"
        },
        {
            "cmd": "telemetry on",
            "desc": "Enable telemetry reports from this cluster",
            "perm": "rw",
        },
        {
            "cmd": "telemetry off",
            "desc": "Disable telemetry reports from this cluster",
            "perm": "rw",
        },
    ]

    @property
    def config_keys(self):
        """Return a dict mapping each option name to its declared default."""
        return {o['name']: o.get('default', None)
                for o in self.MODULE_OPTIONS}

    def __init__(self, *args, **kwargs):
        super(Module, self).__init__(*args, **kwargs)
        self.event = Event()
        self.run = False
        self.last_upload = None
        self.last_report = dict()
        self.report_id = None

        # Seed every option attribute with its declared default.  Otherwise
        # handle_command() / compile_report() raise AttributeError when they
        # run before serve() has called config_notify().
        for opt in self.MODULE_OPTIONS:
            setattr(self, opt['name'], opt.get('default', None))

    def config_notify(self):
        """Copy the current value of every module option onto `self`."""
        for opt in self.MODULE_OPTIONS:
            setattr(self,
                    opt['name'],
                    self.get_module_option(opt['name']))
            self.log.debug(' %s = %s', opt['name'], getattr(self, opt['name']))

    @staticmethod
    def parse_timestamp(timestamp):
        """Parse a 'YYYY-MM-DD HH:MM:SS.ffffff' string into a datetime.

        Raises ValueError (from strptime) if the string does not match.
        """
        return datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S.%f')

    def load(self):
        """Load `last_upload` and `report_id` from the module store.

        A fresh random report id is generated and stored when none exists yet.
        """
        self.last_upload = self.get_store('last_upload', None)
        if self.last_upload is not None:
            # serve() writes the value with str(), so convert it back.
            self.last_upload = int(self.last_upload)

        self.report_id = self.get_store('report_id', None)
        if self.report_id is None:
            self.report_id = str(uuid.uuid4())
            self.set_store('report_id', self.report_id)

    def _gather_metadata(self, svc_type, svc_ids, keys):
        """Count, per metadata key, how many daemons report each value.

        :param svc_type: daemon type passed to get_metadata ('osd', 'mon')
        :param svc_ids: iterable of daemon ids of that type
        :param keys: metadata fields to tally; all other fields are ignored
        :return: dict of key -> defaultdict(int) of value -> daemon count
        """
        metadata = {key: defaultdict(int) for key in keys}

        for svc_id in svc_ids:
            daemon_meta = self.get_metadata(svc_type, svc_id)
            # NOTE(review): get_metadata is assumed to return None (or an
            # empty dict) when nothing is known about a daemon; skip it
            # instead of failing the whole report.
            if not daemon_meta:
                self.log.debug('Could not get metadata for %s.%s',
                               svc_type, svc_id)
                continue

            for k, v in daemon_meta.items():
                if k in metadata:
                    metadata[k][v] += 1

        return metadata

    def gather_osd_metadata(self, osd_map):
        """Tally OSD metadata values for every OSD listed in `osd_map`.

        Covers `metadata_keys` plus "osd_objectstore" and "rotational".
        """
        keys = ["osd_objectstore", "rotational"] + self.metadata_keys
        osd_ids = [str(osd['osd']) for osd in osd_map['osds']]
        return self._gather_metadata('osd', osd_ids, keys)

    def gather_mon_metadata(self, mon_map):
        """Tally `metadata_keys` values for every monitor in `mon_map`."""
        mon_names = [mon['name'] for mon in mon_map['mons']]
        return self._gather_metadata('mon', mon_names, list(self.metadata_keys))

    def gather_crashinfo(self):
        """Collect crash records from the 'crash' module, minus hostnames.

        :return: list of crash dicts, or '' when the crash listing fails.
        """
        # The status is named `ret`, not `errno`, so it does not shadow the
        # imported errno module.
        ret, crashids, _ = self.remote('crash', 'do_ls', '', '')
        if ret:
            # NOTE(review): '' (not an empty list) is what has always been
            # put in the report on failure; kept to leave the report format
            # untouched.
            return ''

        crashlist = list()
        for crashid in crashids.split():
            cmd = {'id': crashid}
            ret, crashinfo, _ = self.remote('crash', 'do_info', cmd, '')
            if ret:
                continue
            c = json.loads(crashinfo)
            # Strip the hostname before upload; tolerate records without it.
            c.pop('utsname_hostname', None)
            crashlist.append(c)
        return crashlist

    def compile_report(self):
        """Build and return the telemetry report as a JSON-serializable dict."""
        report = {
            'leaderboard': bool(self.leaderboard),
            'report_version': 1,
            'report_timestamp': datetime.utcnow().isoformat()
        }

        for option in ['description', 'contact', 'organization']:
            report[option] = getattr(self, option)

        mon_map = self.get('mon_map')
        osd_map = self.get('osd_map')
        service_map = self.get('service_map')
        fs_map = self.get('fs_map')
        df = self.get('df')

        report['report_id'] = self.report_id
        report['created'] = self.parse_timestamp(mon_map['created']).isoformat()

        report['mon'] = {
            'count': len(mon_map['mons']),
            'features': mon_map['features']
        }

        num_pg = 0
        report['pools'] = list()
        for pool in osd_map['pools']:
            num_pg += pool['pg_num']
            report['pools'].append(
                {
                    'pool': pool['pool'],
                    'type': pool['type'],
                    'pg_num': pool['pg_num'],
                    'pgp_num': pool['pg_placement_num'],
                    'size': pool['size'],
                    'min_size': pool['min_size'],
                    'crush_rule': pool['crush_rule']
                }
            )

        report['osd'] = {
            'count': len(osd_map['osds']),
            'require_osd_release': osd_map['require_osd_release'],
            'require_min_compat_client': osd_map['require_min_compat_client']
        }

        report['fs'] = {
            'count': len(fs_map['filesystems'])
        }

        report['metadata'] = dict()
        report['metadata']['osd'] = self.gather_osd_metadata(osd_map)
        report['metadata']['mon'] = self.gather_mon_metadata(mon_map)

        report['usage'] = {
            'pools': len(df['pools']),
            # NOTE(review): the trailing colon in this key looks like a typo,
            # but it is part of the uploaded report format; confirm with the
            # receiving side before renaming it.
            'pg_num:': num_pg,
            'total_used_bytes': df['stats']['total_used_bytes'],
            'total_bytes': df['stats']['total_bytes'],
            'total_avail_bytes': df['stats']['total_avail_bytes']
        }

        # One entry per key of the service map; the mapped values are unused.
        report['services'] = defaultdict(int)
        for key in service_map['services']:
            report['services'][key] += 1

        report['crashes'] = self.gather_crashinfo()

        return report

    def send(self, report):
        """PUT `report` as JSON to `self.url`, through `self.proxy` if set.

        :return: the `requests` response; a non-OK status is logged, not
                 raised.
        :raises: whatever `requests.put` raises on connection failure or
                 timeout.
        """
        self.log.info('Upload report to: %s', self.url)
        proxies = dict()
        if self.proxy:
            self.log.info('Using HTTP(S) proxy: %s', self.proxy)
            proxies['http'] = self.proxy
            proxies['https'] = self.proxy

        resp = requests.put(url=self.url, json=report, proxies=proxies,
                            timeout=self.UPLOAD_TIMEOUT)
        if not resp.ok:
            self.log.error("Report send failed: %d %s %s",
                           resp.status_code, resp.reason, resp.text)
        return resp

    def handle_command(self, inbuf, command):
        """Dispatch a `telemetry ...` command.

        :return: (return code, stdout text, stderr text)
        """
        prefix = command['prefix']

        if prefix == 'telemetry status':
            r = {}
            for opt in self.MODULE_OPTIONS:
                r[opt['name']] = getattr(self, opt['name'])
            return 0, json.dumps(r, indent=4), ''
        elif prefix == 'telemetry on':
            self.set_module_option('enabled', True)
            return 0, '', ''
        elif prefix == 'telemetry off':
            self.set_module_option('enabled', False)
            return 0, '', ''
        elif prefix == 'telemetry send':
            self.last_report = self.compile_report()
            resp = self.send(self.last_report)
            if resp.ok:
                return 0, 'Report sent to {0}'.format(self.url), ''
            return 1, '', 'Failed to send report to %s: %d %s %s' % (
                self.url,
                resp.status_code,
                resp.reason,
                resp.text
            )
        elif prefix == 'telemetry show':
            # Show the last compiled report, or compile one if none exists.
            report = self.last_report
            if not report:
                report = self.compile_report()
            return 0, json.dumps(report, indent=4), ''
        else:
            return (-errno.EINVAL, '',
                    "Command not found '{0}'".format(prefix))

    def self_test(self):
        """Compile a report and sanity-check it.

        The class used to define `self_test` twice; the second definition
        replaced the first, so these checks never ran.  Both are merged here.

        :return: True on success
        :raises RuntimeError: if the report is empty or lacks 'report_id'
        """
        report = self.compile_report()
        if len(report) == 0:
            raise RuntimeError('Report is empty')

        if 'report_id' not in report:
            raise RuntimeError('report_id not found in report')

        return True

    def shutdown(self):
        """Ask serve() to stop and wake it from any pending wait."""
        self.run = False
        self.event.set()

    def _compile_and_send(self, now):
        """Compile a fresh report and upload it; failures are logged only.

        `last_upload` is set to `now` and stored only after a successful
        upload.  Nothing is uploaded when compilation fails, so a stale or
        empty report is never sent in its place.
        """
        try:
            report = self.compile_report()
        except Exception:
            self.log.exception('Exception while compiling report:')
            return
        self.last_report = report

        try:
            resp = self.send(report)
            # self.send logs on failure; only update last_upload
            # if we succeed
            if resp.ok:
                self.last_upload = now
                self.set_store('last_upload', str(now))
        except Exception:
            self.log.exception('Exception while sending report:')

    def serve(self):
        """Main loop: upload a report every `interval` hours while enabled.

        Runs until shutdown() clears `self.run`.
        """
        self.load()
        self.config_notify()
        self.run = True

        self.log.debug('Waiting for mgr to warm up')
        self.event.wait(10)

        while self.run:
            if not self.enabled:
                self.log.info('Not sending report until configured to do so')
                self.event.wait(1800)
                continue

            now = int(time.time())
            due = (not self.last_upload or
                   (now - self.last_upload) > self.interval * 3600)
            if due:
                self.log.info('Compiling and sending report to %s',
                              self.url)
                self._compile_and_send(now)
            else:
                self.log.info('Interval for sending new report has not expired')

            sleep = 3600
            self.log.debug('Sleeping for %d seconds', sleep)
            self.event.wait(sleep)

    @staticmethod
    def can_run():
        """Report that the module can always run: (True, '')."""
        return True, ''