]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | """ |
2 | Telemetry module for ceph-mgr | |
3 | ||
4 | Collect statistics from Ceph cluster and send this back to the Ceph project | |
5 | when user has opted-in | |
6 | """ | |
7 | import errno | |
eafe8130 | 8 | import hashlib |
11fdf7f2 | 9 | import json |
eafe8130 | 10 | import rbd |
11fdf7f2 TL |
11 | import re |
12 | import requests | |
13 | import uuid | |
14 | import time | |
eafe8130 | 15 | from datetime import datetime, timedelta |
11fdf7f2 TL |
16 | from threading import Event |
17 | from collections import defaultdict | |
18 | ||
19 | from mgr_module import MgrModule | |
20 | ||
21 | ||
eafe8130 TL |
22 | ALL_CHANNELS = ['basic', 'ident', 'crash', 'device'] |
23 | ||
24 | LICENSE='sharing-1-0' | |
25 | LICENSE_NAME='Community Data License Agreement - Sharing - Version 1.0' | |
26 | LICENSE_URL='https://cdla.io/sharing-1-0/' | |
27 | ||
28 | # If the telemetry revision has changed since this point, re-require | |
29 | # an opt-in. This should happen each time we add new information to | |
30 | # the telemetry report. | |
31 | LAST_REVISION_RE_OPT_IN = 2 | |
32 | ||
33 | # Latest revision of the telemetry report. Bump this each time we make | |
34 | # *any* change. | |
35 | REVISION = 3 | |
36 | ||
37 | # History of revisions | |
38 | # -------------------- | |
39 | # | |
40 | # Version 1: | |
41 | # Mimic and/or nautilus are lumped together here, since | |
42 | # we didn't track revisions yet. | |
43 | # | |
44 | # Version 2: | |
45 | # - added revision tracking, nagging, etc. | |
46 | # - added config option changes | |
47 | # - added channels | |
48 | # - added explicit license acknowledgement to the opt-in process | |
49 | # | |
50 | # Version 3: | |
51 | # - added device health metrics (i.e., SMART data, minus serial number) | |
52 | # - remove crush_rule | |
53 | # - added CephFS metadata (how many MDSs, fs features, how many data pools, | |
54 | # how much metadata is cached, rfiles, rbytes, rsnapshots) | |
55 | # - added more pool metadata (rep vs ec, cache tiering mode, ec profile) | |
56 | # - added host count, and counts for hosts with each of (mon, osd, mds, mgr) | |
57 | # - whether an OSD cluster network is in use | |
58 | # - rbd pool and image count, and rbd mirror mode (pool-level) | |
59 | # - rgw daemons, zones, zonegroups; which rgw frontends | |
60 | # - crush map stats | |
61 | ||
class Module(MgrModule):
    # NOTE(review): this class-level dict appears unused in this file;
    # option values are installed as instance attributes in config_notify().
    config = dict()

    # Daemon metadata fields aggregated for both OSDs and mons
    # (see gather_osd_metadata / gather_mon_metadata).
    metadata_keys = [
        "arch",
        "ceph_version",
        "os",
        "cpu",
        "kernel_description",
        "kernel_version",
        "distro_description",
        "distro"
    ]
76 | MODULE_OPTIONS = [ | |
77 | { | |
78 | 'name': 'url', | |
79 | 'type': 'str', | |
80 | 'default': 'https://telemetry.ceph.com/report' | |
81 | }, | |
eafe8130 TL |
82 | { |
83 | 'name': 'device_url', | |
84 | 'type': 'str', | |
85 | 'default': 'https://telemetry.ceph.com/device' | |
86 | }, | |
11fdf7f2 TL |
87 | { |
88 | 'name': 'enabled', | |
89 | 'type': 'bool', | |
90 | 'default': False | |
91 | }, | |
eafe8130 TL |
92 | { |
93 | 'name': 'last_opt_revision', | |
94 | 'type': 'int', | |
95 | 'default': 1, | |
96 | }, | |
11fdf7f2 TL |
97 | { |
98 | 'name': 'leaderboard', | |
99 | 'type': 'bool', | |
100 | 'default': False | |
101 | }, | |
102 | { | |
103 | 'name': 'description', | |
104 | 'type': 'str', | |
105 | 'default': None | |
106 | }, | |
107 | { | |
108 | 'name': 'contact', | |
109 | 'type': 'str', | |
110 | 'default': None | |
111 | }, | |
112 | { | |
113 | 'name': 'organization', | |
114 | 'type': 'str', | |
115 | 'default': None | |
116 | }, | |
117 | { | |
118 | 'name': 'proxy', | |
119 | 'type': 'str', | |
120 | 'default': None | |
121 | }, | |
122 | { | |
123 | 'name': 'interval', | |
124 | 'type': 'int', | |
125 | 'default': 24, | |
126 | 'min': 8 | |
eafe8130 TL |
127 | }, |
128 | { | |
129 | 'name': 'channel_basic', | |
130 | 'type': 'bool', | |
131 | 'default': True, | |
132 | 'desc': 'Share basic cluster information (size, version)', | |
133 | }, | |
134 | { | |
135 | 'name': 'channel_ident', | |
136 | 'type': 'bool', | |
137 | 'default': False, | |
138 | 'description': 'Share a user-provided description and/or contact email for the cluster', | |
139 | }, | |
140 | { | |
141 | 'name': 'channel_crash', | |
142 | 'type': 'bool', | |
143 | 'default': True, | |
144 | 'description': 'Share metadata about Ceph daemon crashes (version, stack straces, etc)', | |
145 | }, | |
146 | { | |
147 | 'name': 'channel_device', | |
148 | 'type': 'bool', | |
149 | 'default': True, | |
150 | 'description': 'Share device health metrics (e.g., SMART data, minus potentially identifying info like serial numbers)', | |
151 | }, | |
11fdf7f2 TL |
152 | ] |
153 | ||
    # CLI command specifications exposed by this module; each entry is
    # dispatched by prefix in handle_command().
    COMMANDS = [
        {
            "cmd": "telemetry status",
            "desc": "Show current configuration",
            "perm": "r"
        },
        {
            "cmd": "telemetry send "
                   "name=endpoint,type=CephChoices,strings=ceph|device,n=N,req=false "
                   "name=license,type=CephString,req=false",
            "desc": "Force sending data to Ceph telemetry",
            "perm": "rw"
        },
        {
            "cmd": "telemetry show "
                   "name=channels,type=CephString,n=N,req=False",
            "desc": "Show last report or report to be sent",
            "perm": "r"
        },
        {
            "cmd": "telemetry show-device",
            "desc": "Show last device report or device report to be sent",
            "perm": "r"
        },
        {
            "cmd": "telemetry on name=license,type=CephString,req=false",
            "desc": "Enable telemetry reports from this cluster",
            "perm": "rw",
        },
        {
            "cmd": "telemetry off",
            "desc": "Disable telemetry reports from this cluster",
            "perm": "rw",
        },
    ]
190 | @property | |
191 | def config_keys(self): | |
192 | return dict((o['name'], o.get('default', None)) for o in self.MODULE_OPTIONS) | |
193 | ||
    def __init__(self, *args, **kwargs):
        super(Module, self).__init__(*args, **kwargs)
        self.event = Event()        # wakes/interrupts the serve() loop
        self.run = False            # serve() loop control flag
        self.last_upload = None     # epoch seconds of last successful upload (see load())
        self.last_report = dict()   # most recently compiled report
        self.report_id = None       # persistent anonymous cluster id (see load())
        self.salt = None            # persistent salt for crash-entity anonymization
203 | def config_notify(self): | |
204 | for opt in self.MODULE_OPTIONS: | |
205 | setattr(self, | |
206 | opt['name'], | |
207 | self.get_module_option(opt['name'])) | |
208 | self.log.debug(' %s = %s', opt['name'], getattr(self, opt['name'])) | |
eafe8130 TL |
209 | # wake up serve() thread |
210 | self.event.set() | |
11fdf7f2 | 211 | |
    def load(self):
        """Restore persisted state (upload time, report id, salt) from the mgr store."""
        # timestamp (epoch seconds) of the last successful upload, or None
        self.last_upload = self.get_store('last_upload', None)
        if self.last_upload is not None:
            self.last_upload = int(self.last_upload)

        # stable anonymous cluster identifier; generated once and persisted
        self.report_id = self.get_store('report_id', None)
        if self.report_id is None:
            self.report_id = str(uuid.uuid4())
            self.set_store('report_id', self.report_id)

        # salt used to anonymize entity names in crash reports (gather_crashinfo)
        self.salt = self.get_store('salt', None)
        if not self.salt:
            self.salt = str(uuid.uuid4())
            self.set_store('salt', self.salt)
11fdf7f2 TL |
227 | def gather_osd_metadata(self, osd_map): |
228 | keys = ["osd_objectstore", "rotational"] | |
229 | keys += self.metadata_keys | |
230 | ||
231 | metadata = dict() | |
232 | for key in keys: | |
233 | metadata[key] = defaultdict(int) | |
234 | ||
235 | for osd in osd_map['osds']: | |
92f5a8d4 TL |
236 | res = self.get_metadata('osd', str(osd['osd'])).items() |
237 | if res is None: | |
238 | self.log.debug('Could not get metadata for osd.%s' % str(osd['osd'])) | |
239 | continue | |
240 | for k, v in res: | |
11fdf7f2 TL |
241 | if k not in keys: |
242 | continue | |
243 | ||
244 | metadata[k][v] += 1 | |
245 | ||
246 | return metadata | |
247 | ||
248 | def gather_mon_metadata(self, mon_map): | |
249 | keys = list() | |
250 | keys += self.metadata_keys | |
251 | ||
252 | metadata = dict() | |
253 | for key in keys: | |
254 | metadata[key] = defaultdict(int) | |
255 | ||
256 | for mon in mon_map['mons']: | |
92f5a8d4 TL |
257 | res = self.get_metadata('mon', mon['name']).items() |
258 | if res is None: | |
259 | self.log.debug('Could not get metadata for mon.%s' % (mon['name'])) | |
260 | continue | |
261 | for k, v in res: | |
11fdf7f2 TL |
262 | if k not in keys: |
263 | continue | |
264 | ||
265 | metadata[k][v] += 1 | |
266 | ||
267 | return metadata | |
268 | ||
eafe8130 TL |
269 | def gather_crush_info(self): |
270 | osdmap = self.get_osdmap() | |
271 | crush_raw = osdmap.get_crush() | |
272 | crush = crush_raw.dump() | |
273 | ||
274 | def inc(d, k): | |
275 | if k in d: | |
276 | d[k] += 1 | |
277 | else: | |
278 | d[k] = 1 | |
279 | ||
280 | device_classes = {} | |
281 | for dev in crush['devices']: | |
282 | inc(device_classes, dev.get('class', '')) | |
283 | ||
284 | bucket_algs = {} | |
285 | bucket_types = {} | |
286 | bucket_sizes = {} | |
287 | for bucket in crush['buckets']: | |
288 | if '~' in bucket['name']: # ignore shadow buckets | |
289 | continue | |
290 | inc(bucket_algs, bucket['alg']) | |
291 | inc(bucket_types, bucket['type_id']) | |
292 | inc(bucket_sizes, len(bucket['items'])) | |
293 | ||
294 | return { | |
295 | 'num_devices': len(crush['devices']), | |
296 | 'num_types': len(crush['types']), | |
297 | 'num_buckets': len(crush['buckets']), | |
298 | 'num_rules': len(crush['rules']), | |
299 | 'device_classes': list(device_classes.values()), | |
300 | 'tunables': crush['tunables'], | |
301 | 'compat_weight_set': '-1' in crush['choose_args'], | |
302 | 'num_weight_sets': len(crush['choose_args']), | |
303 | 'bucket_algs': bucket_algs, | |
304 | 'bucket_sizes': bucket_sizes, | |
305 | 'bucket_types': bucket_types, | |
306 | } | |
307 | ||
308 | def gather_configs(self): | |
309 | # cluster config options | |
310 | cluster = set() | |
311 | r, outb, outs = self.mon_command({ | |
312 | 'prefix': 'config dump', | |
313 | 'format': 'json' | |
314 | }); | |
315 | if r != 0: | |
316 | return {} | |
317 | try: | |
318 | dump = json.loads(outb) | |
319 | except json.decoder.JSONDecodeError: | |
320 | return {} | |
321 | for opt in dump: | |
322 | name = opt.get('name') | |
323 | if name: | |
324 | cluster.add(name) | |
325 | # daemon-reported options (which may include ceph.conf) | |
326 | active = set() | |
327 | ls = self.get("modified_config_options"); | |
328 | for opt in ls.get('options', {}): | |
329 | active.add(opt) | |
330 | return { | |
331 | 'cluster_changed': sorted(list(cluster)), | |
332 | 'active_changed': sorted(list(active)), | |
333 | } | |
334 | ||
11fdf7f2 TL |
335 | def gather_crashinfo(self): |
336 | crashlist = list() | |
eafe8130 | 337 | errno, crashids, err = self.remote('crash', 'ls') |
11fdf7f2 TL |
338 | if errno: |
339 | return '' | |
340 | for crashid in crashids.split(): | |
341 | cmd = {'id': crashid} | |
342 | errno, crashinfo, err = self.remote('crash', 'do_info', cmd, '') | |
343 | if errno: | |
344 | continue | |
345 | c = json.loads(crashinfo) | |
346 | del c['utsname_hostname'] | |
92f5a8d4 TL |
347 | # entity_name might have more than one '.', beware |
348 | (etype, eid) = c.get('entity_name', '').split('.', 1) | |
eafe8130 TL |
349 | m = hashlib.sha1() |
350 | m.update(self.salt.encode('utf-8')) | |
351 | m.update(eid.encode('utf-8')) | |
352 | m.update(self.salt.encode('utf-8')) | |
353 | c['entity_name'] = etype + '.' + m.hexdigest() | |
11fdf7f2 TL |
354 | crashlist.append(c) |
355 | return crashlist | |
356 | ||
eafe8130 TL |
357 | def get_active_channels(self): |
358 | r = [] | |
359 | if self.channel_basic: | |
360 | r.append('basic') | |
361 | if self.channel_crash: | |
362 | r.append('crash') | |
363 | if self.channel_device: | |
364 | r.append('device') | |
365 | return r | |
366 | ||
367 | def gather_device_report(self): | |
368 | try: | |
369 | time_format = self.remote('devicehealth', 'get_time_format') | |
370 | except: | |
371 | return None | |
372 | cutoff = datetime.utcnow() - timedelta(hours=self.interval * 2) | |
373 | min_sample = cutoff.strftime(time_format) | |
374 | ||
375 | devices = self.get('devices')['devices'] | |
376 | ||
377 | res = {} # anon-host-id -> anon-devid -> { timestamp -> record } | |
378 | for d in devices: | |
379 | devid = d['devid'] | |
380 | try: | |
381 | # this is a map of stamp -> {device info} | |
382 | m = self.remote('devicehealth', 'get_recent_device_metrics', | |
383 | devid, min_sample) | |
384 | except: | |
385 | continue | |
386 | ||
387 | # anonymize host id | |
388 | try: | |
389 | host = d['location'][0]['host'] | |
390 | except: | |
391 | continue | |
392 | anon_host = self.get_store('host-id/%s' % host) | |
393 | if not anon_host: | |
394 | anon_host = str(uuid.uuid1()) | |
395 | self.set_store('host-id/%s' % host, anon_host) | |
396 | for dev, rep in m.items(): | |
397 | rep['host_id'] = anon_host | |
398 | ||
399 | # anonymize device id | |
eafe8130 TL |
400 | anon_devid = self.get_store('devid-id/%s' % devid) |
401 | if not anon_devid: | |
9f95a23c | 402 | anon_devid = f"{devid.rsplit('_', 1)[0]}_{uuid.uuid1()}" |
eafe8130 TL |
403 | self.set_store('devid-id/%s' % devid, anon_devid) |
404 | self.log.info('devid %s / %s, host %s / %s' % (devid, anon_devid, | |
405 | host, anon_host)) | |
406 | ||
407 | # anonymize the smartctl report itself | |
92f5a8d4 TL |
408 | serial = devid.rsplit('_', 1)[1] |
409 | m_str = json.dumps(m) | |
410 | m = json.loads(m_str.replace(serial, 'deleted')) | |
eafe8130 TL |
411 | |
412 | if anon_host not in res: | |
413 | res[anon_host] = {} | |
414 | res[anon_host][anon_devid] = m | |
415 | return res | |
416 | ||
417 | def get_latest(self, daemon_type, daemon_name, stat): | |
418 | data = self.get_counter(daemon_type, daemon_name, stat)[stat] | |
419 | #self.log.error("get_latest {0} data={1}".format(stat, data)) | |
420 | if data: | |
421 | return data[-1][1] | |
422 | else: | |
423 | return 0 | |
424 | ||
    def compile_report(self, channels=[]):
        """Assemble the telemetry report dict for the given channels.

        If *channels* is empty/falsy, the currently enabled channels are
        used. The 'device' channel is intentionally excluded here; it is
        sent to a separate endpoint (see send()).
        """
        if not channels:
            channels = self.get_active_channels()
        report = {
            'leaderboard': False,
            'report_version': 1,
            'report_timestamp': datetime.utcnow().isoformat(),
            'report_id': self.report_id,
            'channels': channels,
            'channels_available': ALL_CHANNELS,
            'license': LICENSE,
        }

        if 'ident' in channels:
            if self.leaderboard:
                report['leaderboard'] = True
            for option in ['description', 'contact', 'organization']:
                report[option] = getattr(self, option)

        if 'basic' in channels:
            mon_map = self.get('mon_map')
            osd_map = self.get('osd_map')
            service_map = self.get('service_map')
            fs_map = self.get('fs_map')
            df = self.get('df')

            report['created'] = mon_map['created']

            # mons
            v1_mons = 0
            v2_mons = 0
            ipv4_mons = 0
            ipv6_mons = 0
            for mon in mon_map['mons']:
                for a in mon['public_addrs']['addrvec']:
                    if a['type'] == 'v2':
                        v2_mons += 1
                    elif a['type'] == 'v1':
                        v1_mons += 1
                    # IPv6 addresses are bracketed, e.g. "[::1]:6789"
                    if a['addr'].startswith('['):
                        ipv6_mons += 1
                    else:
                        ipv4_mons += 1
            report['mon'] = {
                'count': len(mon_map['mons']),
                'features': mon_map['features'],
                'min_mon_release': mon_map['min_mon_release'],
                'v1_addr_mons': v1_mons,
                'v2_addr_mons': v2_mons,
                'ipv4_addr_mons': ipv4_mons,
                'ipv6_addr_mons': ipv6_mons,
            }

            report['config'] = self.gather_configs()

            # pools
            report['rbd'] = {
                'num_pools': 0,
                'num_images_by_pool': [],
                'mirroring_by_pool': [],
            }
            num_pg = 0
            report['pools'] = list()
            for pool in osd_map['pools']:
                num_pg += pool['pg_num']
                ec_profile = {}
                if pool['erasure_code_profile']:
                    orig = osd_map['erasure_code_profiles'].get(
                        pool['erasure_code_profile'], {})
                    ec_profile = {
                        k: orig[k] for k in orig.keys()
                        if k in ['k', 'm', 'plugin', 'technique',
                                 'crush-failure-domain', 'l']
                    }
                report['pools'].append(
                    {
                        'pool': pool['pool'],
                        # NOTE(review): 'type' appears twice in this literal;
                        # the later human-readable entry below wins.
                        'type': pool['type'],
                        'pg_num': pool['pg_num'],
                        'pgp_num': pool['pg_placement_num'],
                        'size': pool['size'],
                        'min_size': pool['min_size'],
                        'pg_autoscale_mode': pool['pg_autoscale_mode'],
                        'target_max_bytes': pool['target_max_bytes'],
                        'target_max_objects': pool['target_max_objects'],
                        'type': ['', 'replicated', '', 'erasure'][pool['type']],
                        'erasure_code_profile': ec_profile,
                        'cache_mode': pool['cache_mode'],
                    }
                )
                if 'rbd' in pool['application_metadata']:
                    report['rbd']['num_pools'] += 1
                    ioctx = self.rados.open_ioctx(pool['pool_name'])
                    report['rbd']['num_images_by_pool'].append(
                        sum(1 for _ in rbd.RBD().list2(ioctx)))
                    report['rbd']['mirroring_by_pool'].append(
                        rbd.RBD().mirror_mode_get(ioctx) != rbd.RBD_MIRROR_MODE_DISABLED)

            # osds: detect a separate cluster network by comparing the first
            # up OSD's public vs cluster address
            cluster_network = False
            for osd in osd_map['osds']:
                if osd['up'] and not cluster_network:
                    front_ip = osd['public_addrs']['addrvec'][0]['addr'].split(':')[0]
                    back_ip = osd['cluster_addrs']['addrvec'][0]['addr'].split(':')[0]
                    if front_ip != back_ip:
                        cluster_network = True
            report['osd'] = {
                'count': len(osd_map['osds']),
                'require_osd_release': osd_map['require_osd_release'],
                'require_min_compat_client': osd_map['require_min_compat_client'],
                'cluster_network': cluster_network,
            }

            # crush
            report['crush'] = self.gather_crush_info()

            # cephfs
            report['fs'] = {
                'count': len(fs_map['filesystems']),
                'feature_flags': fs_map['feature_flags'],
                'num_standby_mds': len(fs_map['standbys']),
                'filesystems': [],
            }
            num_mds = len(fs_map['standbys'])
            for fsm in fs_map['filesystems']:
                fs = fsm['mdsmap']
                num_sessions = 0
                cached_ino = 0
                cached_dn = 0
                cached_cap = 0
                subtrees = 0
                rfiles = 0
                rbytes = 0
                rsnaps = 0
                for gid, mds in fs['info'].items():
                    num_sessions += self.get_latest('mds', mds['name'],
                                                    'mds_sessions.session_count')
                    cached_ino += self.get_latest('mds', mds['name'],
                                                  'mds_mem.ino')
                    cached_dn += self.get_latest('mds', mds['name'],
                                                 'mds_mem.dn')
                    cached_cap += self.get_latest('mds', mds['name'],
                                                  'mds_mem.cap')
                    subtrees += self.get_latest('mds', mds['name'],
                                                'mds.subtrees')
                    # root recursive stats only make sense on rank 0
                    if mds['rank'] == 0:
                        rfiles = self.get_latest('mds', mds['name'],
                                                 'mds.root_rfiles')
                        rbytes = self.get_latest('mds', mds['name'],
                                                 'mds.root_rbytes')
                        rsnaps = self.get_latest('mds', mds['name'],
                                                 'mds.root_rsnaps')
                report['fs']['filesystems'].append({
                    'max_mds': fs['max_mds'],
                    'ever_allowed_features': fs['ever_allowed_features'],
                    'explicitly_allowed_features': fs['explicitly_allowed_features'],
                    'num_in': len(fs['in']),
                    'num_up': len(fs['up']),
                    'num_standby_replay': len(
                        [mds for gid, mds in fs['info'].items()
                         if mds['state'] == 'up:standby-replay']),
                    'num_mds': len(fs['info']),
                    'num_sessions': num_sessions,
                    'cached_inos': cached_ino,
                    'cached_dns': cached_dn,
                    'cached_caps': cached_cap,
                    'cached_subtrees': subtrees,
                    'balancer_enabled': len(fs['balancer']) > 0,
                    'num_data_pools': len(fs['data_pools']),
                    'standby_count_wanted': fs['standby_count_wanted'],
                    # truncate creation time to year-month
                    'approx_ctime': fs['created'][0:7],
                    'files': rfiles,
                    'bytes': rbytes,
                    'snaps': rsnaps,
                })
                num_mds += len(fs['info'])
            report['fs']['total_num_mds'] = num_mds

            # daemons
            report['metadata'] = dict()
            report['metadata']['osd'] = self.gather_osd_metadata(osd_map)
            report['metadata']['mon'] = self.gather_mon_metadata(mon_map)

            # host counts
            servers = self.list_servers()
            self.log.debug('servers %s' % servers)
            report['hosts'] = {
                'num': len([h for h in servers if h['hostname']]),
            }
            for t in ['mon', 'mds', 'osd', 'mgr']:
                report['hosts']['num_with_' + t] = len(
                    [h for h in servers
                     if len([s for s in h['services'] if s['type'] == t])]
                )

            report['usage'] = {
                'pools': len(df['pools']),
                'pg_num': num_pg,
                'total_used_bytes': df['stats']['total_used_bytes'],
                'total_bytes': df['stats']['total_bytes'],
                'total_avail_bytes': df['stats']['total_avail_bytes']
            }

            report['services'] = defaultdict(int)
            for key, value in service_map['services'].items():
                report['services'][key] += 1
                if key == 'rgw':
                    report['rgw'] = {
                        'count': 0,
                    }
                    zones = set()
                    # NOTE(review): `realms` is collected nowhere and never
                    # reported — appears to be dead code.
                    realms = set()
                    zonegroups = set()
                    frontends = set()
                    d = value.get('daemons', dict())

                    for k, v in d.items():
                        if k == 'summary' and v:
                            report['rgw'][k] = v
                        elif isinstance(v, dict) and 'metadata' in v:
                            report['rgw']['count'] += 1
                            zones.add(v['metadata']['zone_id'])
                            zonegroups.add(v['metadata']['zonegroup_id'])
                            frontends.add(v['metadata']['frontend_type#0'])

                            # we could actually iterate over all the keys of
                            # the dict and check for how many frontends there
                            # are, but it is unlikely that one would be running
                            # more than 2 supported ones
                            f2 = v['metadata'].get('frontend_type#1', None)
                            if f2:
                                frontends.add(f2)

                    report['rgw']['zones'] = len(zones)
                    report['rgw']['zonegroups'] = len(zonegroups)
                    report['rgw']['frontends'] = list(frontends)  # sets aren't json-serializable

            try:
                report['balancer'] = self.remote('balancer', 'gather_telemetry')
            except ImportError:
                report['balancer'] = {
                    'active': False
                }

        if 'crash' in channels:
            report['crashes'] = self.gather_crashinfo()

        # NOTE: We do not include the 'device' channel in this report; it is
        # sent to a different endpoint.

        return report
9f95a23c TL |
677 | def _try_post(self, what, url, report): |
678 | self.log.info('Sending %s to: %s' % (what, url)) | |
679 | proxies = dict() | |
680 | if self.proxy: | |
681 | self.log.info('Send using HTTP(S) proxy: %s', self.proxy) | |
682 | proxies['http'] = self.proxy | |
683 | proxies['https'] = self.proxy | |
684 | try: | |
685 | resp = requests.put(url=url, json=report, proxies=proxies) | |
686 | resp.raise_for_status() | |
687 | except Exception as e: | |
688 | fail_reason = 'Failed to send %s to %s: %s' % (what, url, str(e)) | |
689 | self.log.error(fail_reason) | |
690 | return fail_reason | |
691 | return None | |
692 | ||
eafe8130 TL |
693 | def send(self, report, endpoint=None): |
694 | if not endpoint: | |
695 | endpoint = ['ceph', 'device'] | |
696 | failed = [] | |
697 | success = [] | |
eafe8130 | 698 | self.log.debug('Send endpoints %s' % endpoint) |
eafe8130 TL |
699 | for e in endpoint: |
700 | if e == 'ceph': | |
9f95a23c TL |
701 | fail_reason = self._try_post('ceph report', self.url, report) |
702 | if fail_reason: | |
703 | failed.append(fail_reason) | |
eafe8130 TL |
704 | else: |
705 | now = int(time.time()) | |
706 | self.last_upload = now | |
707 | self.set_store('last_upload', str(now)) | |
708 | success.append('Ceph report sent to {0}'.format(self.url)) | |
709 | self.log.info('Sent report to {0}'.format(self.url)) | |
710 | elif e == 'device': | |
711 | if 'device' in self.get_active_channels(): | |
eafe8130 TL |
712 | devices = self.gather_device_report() |
713 | num_devs = 0 | |
714 | num_hosts = 0 | |
715 | for host, ls in devices.items(): | |
716 | self.log.debug('host %s devices %s' % (host, ls)) | |
717 | if not len(ls): | |
718 | continue | |
9f95a23c TL |
719 | fail_reason = self._try_post('devices', self.device_url, |
720 | ls) | |
721 | if fail_reason: | |
722 | failed.append(fail_reason) | |
eafe8130 TL |
723 | else: |
724 | num_devs += len(ls) | |
725 | num_hosts += 1 | |
726 | if num_devs: | |
727 | success.append('Reported %d devices across %d hosts' % ( | |
728 | num_devs, len(devices))) | |
729 | if failed: | |
730 | return 1, '', '\n'.join(success + failed) | |
731 | return 0, '', '\n'.join(success) | |
11fdf7f2 TL |
732 | |
733 | def handle_command(self, inbuf, command): | |
734 | if command['prefix'] == 'telemetry status': | |
735 | r = {} | |
736 | for opt in self.MODULE_OPTIONS: | |
737 | r[opt['name']] = getattr(self, opt['name']) | |
9f95a23c TL |
738 | r['last_upload'] = time.ctime(self.last_upload) if self.last_upload else self.last_upload |
739 | return 0, json.dumps(r, indent=4, sort_keys=True), '' | |
11fdf7f2 | 740 | elif command['prefix'] == 'telemetry on': |
eafe8130 TL |
741 | if command.get('license') != LICENSE: |
742 | return -errno.EPERM, '', "Telemetry data is licensed under the " + LICENSE_NAME + " (" + LICENSE_URL + ").\nTo enable, add '--license " + LICENSE + "' to the 'ceph telemetry on' command." | |
11fdf7f2 | 743 | self.set_module_option('enabled', True) |
eafe8130 | 744 | self.set_module_option('last_opt_revision', REVISION) |
11fdf7f2 TL |
745 | return 0, '', '' |
746 | elif command['prefix'] == 'telemetry off': | |
747 | self.set_module_option('enabled', False) | |
9f95a23c | 748 | self.set_module_option('last_opt_revision', 1) |
11fdf7f2 TL |
749 | return 0, '', '' |
750 | elif command['prefix'] == 'telemetry send': | |
9f95a23c TL |
751 | if self.last_opt_revision < LAST_REVISION_RE_OPT_IN and command.get('license') != LICENSE: |
752 | self.log.debug('A telemetry send attempt while opted-out. Asking for license agreement') | |
753 | return -errno.EPERM, '', "Telemetry data is licensed under the " + LICENSE_NAME + " (" + LICENSE_URL + ").\nTo manually send telemetry data, add '--license " + LICENSE + "' to the 'ceph telemetry send' command.\nPlease consider enabling the telemetry module with 'ceph telemetry on'." | |
11fdf7f2 | 754 | self.last_report = self.compile_report() |
eafe8130 | 755 | return self.send(self.last_report, command.get('endpoint')) |
11fdf7f2 TL |
756 | |
757 | elif command['prefix'] == 'telemetry show': | |
eafe8130 TL |
758 | report = self.compile_report( |
759 | channels=command.get('channels', None) | |
760 | ) | |
9f95a23c | 761 | report = json.dumps(report, indent=4, sort_keys=True) |
92f5a8d4 TL |
762 | if self.channel_device: |
763 | report += '\n \nDevice report is generated separately. To see it run \'ceph telemetry show-device\'.' | |
764 | return 0, report, '' | |
765 | elif command['prefix'] == 'telemetry show-device': | |
766 | return 0, json.dumps(self.gather_device_report(), indent=4, sort_keys=True), '' | |
11fdf7f2 TL |
767 | else: |
768 | return (-errno.EINVAL, '', | |
769 | "Command not found '{0}'".format(command['prefix'])) | |
770 | ||
    def self_test(self):
        """Compile a report and sanity-check it.

        NOTE(review): this definition is shadowed by a second `self_test`
        defined later in this class; only the later definition is effective.
        """
        report = self.compile_report()
        if len(report) == 0:
            raise RuntimeError('Report is empty')

        if 'report_id' not in report:
            raise RuntimeError('report_id not found in report')
    def shutdown(self):
        """Stop serve(): clear the run flag and wake the sleeping loop."""
        self.run = False
        self.event.set()
eafe8130 TL |
783 | def refresh_health_checks(self): |
784 | health_checks = {} | |
785 | if self.enabled and self.last_opt_revision < LAST_REVISION_RE_OPT_IN: | |
786 | health_checks['TELEMETRY_CHANGED'] = { | |
787 | 'severity': 'warning', | |
788 | 'summary': 'Telemetry requires re-opt-in', | |
789 | 'detail': [ | |
790 | 'telemetry report includes new information; must re-opt-in (or out)' | |
791 | ] | |
792 | } | |
793 | self.set_health_checks(health_checks) | |
794 | ||
11fdf7f2 TL |
795 | def serve(self): |
796 | self.load() | |
797 | self.config_notify() | |
798 | self.run = True | |
799 | ||
800 | self.log.debug('Waiting for mgr to warm up') | |
801 | self.event.wait(10) | |
802 | ||
803 | while self.run: | |
eafe8130 TL |
804 | self.event.clear() |
805 | ||
806 | self.refresh_health_checks() | |
807 | ||
808 | if self.last_opt_revision < LAST_REVISION_RE_OPT_IN: | |
809 | self.log.debug('Not sending report until user re-opts-in') | |
810 | self.event.wait(1800) | |
811 | continue | |
11fdf7f2 | 812 | if not self.enabled: |
eafe8130 | 813 | self.log.debug('Not sending report until configured to do so') |
11fdf7f2 TL |
814 | self.event.wait(1800) |
815 | continue | |
816 | ||
817 | now = int(time.time()) | |
818 | if not self.last_upload or (now - self.last_upload) > \ | |
819 | self.interval * 3600: | |
820 | self.log.info('Compiling and sending report to %s', | |
821 | self.url) | |
822 | ||
823 | try: | |
824 | self.last_report = self.compile_report() | |
825 | except: | |
826 | self.log.exception('Exception while compiling report:') | |
827 | ||
eafe8130 | 828 | self.send(self.last_report) |
11fdf7f2 | 829 | else: |
eafe8130 | 830 | self.log.debug('Interval for sending new report has not expired') |
11fdf7f2 TL |
831 | |
832 | sleep = 3600 | |
833 | self.log.debug('Sleeping for %d seconds', sleep) | |
834 | self.event.wait(sleep) | |
835 | ||
836 | def self_test(self): | |
837 | self.compile_report() | |
838 | return True | |
839 | ||
    @staticmethod
    def can_run():
        # This module has no special runtime requirements.
        return True, ''