]> git.proxmox.com Git - ceph.git/blame - ceph/src/pybind/mgr/telemetry/module.py
d/control: depend on python3-yaml for ceph-mgr
[ceph.git] / ceph / src / pybind / mgr / telemetry / module.py
CommitLineData
11fdf7f2
TL
1"""
2Telemetry module for ceph-mgr
3
4Collect statistics from Ceph cluster and send this back to the Ceph project
5when user has opted-in
6"""
7import errno
eafe8130 8import hashlib
11fdf7f2 9import json
eafe8130 10import rbd
11fdf7f2
TL
11import re
12import requests
13import uuid
14import time
eafe8130 15from datetime import datetime, timedelta
11fdf7f2
TL
16from threading import Event
17from collections import defaultdict
18
19from mgr_module import MgrModule
20
21
eafe8130
TL
# Channels a cluster can ever report on; the user opts in per-channel.
ALL_CHANNELS = ['basic', 'ident', 'crash', 'device']

LICENSE='sharing-1-0'
LICENSE_NAME='Community Data License Agreement - Sharing - Version 1.0'
LICENSE_URL='https://cdla.io/sharing-1-0/'

# If the telemetry revision has changed since this point, re-require
# an opt-in. This should happen each time we add new information to
# the telemetry report.
LAST_REVISION_RE_OPT_IN = 2

# Latest revision of the telemetry report. Bump this each time we make
# *any* change.
REVISION = 3

# History of revisions
# --------------------
#
# Version 1:
#   Mimic and/or nautilus are lumped together here, since
#   we didn't track revisions yet.
#
# Version 2:
#   - added revision tracking, nagging, etc.
#   - added config option changes
#   - added channels
#   - added explicit license acknowledgement to the opt-in process
#
# Version 3:
#   - added device health metrics (i.e., SMART data, minus serial number)
#   - remove crush_rule
#   - added CephFS metadata (how many MDSs, fs features, how many data pools,
#     how much metadata is cached, rfiles, rbytes, rsnapshots)
#   - added more pool metadata (rep vs ec, cache tiering mode, ec profile)
#   - added host count, and counts for hosts with each of (mon, osd, mds, mgr)
#   - whether an OSD cluster network is in use
#   - rbd pool and image count, and rbd mirror mode (pool-level)
#   - rgw daemons, zones, zonegroups; which rgw frontends
#   - crush map stats
11fdf7f2
TL
class Module(MgrModule):
    """Telemetry mgr module: periodically collects anonymized cluster
    statistics and submits them to the Ceph project once the user has
    opted in."""

    # cached option values; refreshed by config_notify()
    config = dict()

    # daemon metadata fields aggregated (as value -> count histograms)
    # for both mons and osds
    metadata_keys = [
        "arch",
        "ceph_version",
        "os",
        "cpu",
        "kernel_description",
        "kernel_version",
        "distro_description",
        "distro"
    ]

    MODULE_OPTIONS = [
        {
            'name': 'url',
            'type': 'str',
            'default': 'https://telemetry.ceph.com/report'
        },
        {
            'name': 'device_url',
            'type': 'str',
            'default': 'https://telemetry.ceph.com/device'
        },
        {
            'name': 'enabled',
            'type': 'bool',
            'default': False
        },
        {
            'name': 'last_opt_revision',
            'type': 'int',
            'default': 1,
        },
        {
            'name': 'leaderboard',
            'type': 'bool',
            'default': False
        },
        {
            'name': 'description',
            'type': 'str',
            'default': None
        },
        {
            'name': 'contact',
            'type': 'str',
            'default': None
        },
        {
            'name': 'organization',
            'type': 'str',
            'default': None
        },
        {
            'name': 'proxy',
            'type': 'str',
            'default': None
        },
        {
            'name': 'interval',
            'type': 'int',
            'default': 24,
            'min': 8
        },
        {
            'name': 'channel_basic',
            'type': 'bool',
            'default': True,
            'desc': 'Share basic cluster information (size, version)',
        },
        # BUG FIX: the three options below used the key 'description',
        # which the mgr option machinery does not recognize (the correct
        # key, used by channel_basic above, is 'desc'), so their help
        # text was silently dropped.
        {
            'name': 'channel_ident',
            'type': 'bool',
            'default': False,
            'desc': 'Share a user-provided description and/or contact email for the cluster',
        },
        {
            'name': 'channel_crash',
            'type': 'bool',
            'default': True,
            'desc': 'Share metadata about Ceph daemon crashes (version, stack straces, etc)',
        },
        {
            'name': 'channel_device',
            'type': 'bool',
            'default': True,
            'desc': 'Share device health metrics (e.g., SMART data, minus potentially identifying info like serial numbers)',
        },
    ]
153
    # Self-describing CLI commands exposed by this module; dispatched in
    # handle_command() below.
    COMMANDS = [
        {
            "cmd": "telemetry status",
            "desc": "Show current configuration",
            "perm": "r"
        },
        {
            "cmd": "telemetry send "
                   "name=endpoint,type=CephChoices,strings=ceph|device,n=N,req=false "
                   "name=license,type=CephString,req=false",
            "desc": "Force sending data to Ceph telemetry",
            "perm": "rw"
        },
        {
            "cmd": "telemetry show "
                   "name=channels,type=CephString,n=N,req=False",
            "desc": "Show last report or report to be sent",
            "perm": "r"
        },
        {
            "cmd": "telemetry show-device",
            "desc": "Show last device report or device report to be sent",
            "perm": "r"
        },
        {
            "cmd": "telemetry on name=license,type=CephString,req=false",
            "desc": "Enable telemetry reports from this cluster",
            "perm": "rw",
        },
        {
            "cmd": "telemetry off",
            "desc": "Disable telemetry reports from this cluster",
            "perm": "rw",
        },
    ]
189
190 @property
191 def config_keys(self):
192 return dict((o['name'], o.get('default', None)) for o in self.MODULE_OPTIONS)
193
    def __init__(self, *args, **kwargs):
        # Only in-memory defaults here; persisted values (report_id, salt,
        # last_upload) are loaded from the mgr store in load().
        super(Module, self).__init__(*args, **kwargs)
        self.event = Event()        # wakes/interrupts the serve() loop
        self.run = False            # serve() loop control flag
        self.last_upload = None     # unix timestamp of last successful upload
        self.last_report = dict()   # most recently compiled report
        self.report_id = None       # persistent anonymous cluster identifier
        self.salt = None            # persistent salt for hashing entity names
203 def config_notify(self):
204 for opt in self.MODULE_OPTIONS:
205 setattr(self,
206 opt['name'],
207 self.get_module_option(opt['name']))
208 self.log.debug(' %s = %s', opt['name'], getattr(self, opt['name']))
eafe8130
TL
209 # wake up serve() thread
210 self.event.set()
11fdf7f2 211
11fdf7f2
TL
212 def load(self):
213 self.last_upload = self.get_store('last_upload', None)
214 if self.last_upload is not None:
215 self.last_upload = int(self.last_upload)
216
217 self.report_id = self.get_store('report_id', None)
218 if self.report_id is None:
219 self.report_id = str(uuid.uuid4())
220 self.set_store('report_id', self.report_id)
221
eafe8130
TL
222 self.salt = self.get_store('salt', None)
223 if not self.salt:
224 self.salt = str(uuid.uuid4())
225 self.set_store('salt', self.salt)
226
11fdf7f2
TL
227 def gather_osd_metadata(self, osd_map):
228 keys = ["osd_objectstore", "rotational"]
229 keys += self.metadata_keys
230
231 metadata = dict()
232 for key in keys:
233 metadata[key] = defaultdict(int)
234
235 for osd in osd_map['osds']:
92f5a8d4
TL
236 res = self.get_metadata('osd', str(osd['osd'])).items()
237 if res is None:
238 self.log.debug('Could not get metadata for osd.%s' % str(osd['osd']))
239 continue
240 for k, v in res:
11fdf7f2
TL
241 if k not in keys:
242 continue
243
244 metadata[k][v] += 1
245
246 return metadata
247
248 def gather_mon_metadata(self, mon_map):
249 keys = list()
250 keys += self.metadata_keys
251
252 metadata = dict()
253 for key in keys:
254 metadata[key] = defaultdict(int)
255
256 for mon in mon_map['mons']:
92f5a8d4
TL
257 res = self.get_metadata('mon', mon['name']).items()
258 if res is None:
259 self.log.debug('Could not get metadata for mon.%s' % (mon['name']))
260 continue
261 for k, v in res:
11fdf7f2
TL
262 if k not in keys:
263 continue
264
265 metadata[k][v] += 1
266
267 return metadata
268
eafe8130
TL
269 def gather_crush_info(self):
270 osdmap = self.get_osdmap()
271 crush_raw = osdmap.get_crush()
272 crush = crush_raw.dump()
273
274 def inc(d, k):
275 if k in d:
276 d[k] += 1
277 else:
278 d[k] = 1
279
280 device_classes = {}
281 for dev in crush['devices']:
282 inc(device_classes, dev.get('class', ''))
283
284 bucket_algs = {}
285 bucket_types = {}
286 bucket_sizes = {}
287 for bucket in crush['buckets']:
288 if '~' in bucket['name']: # ignore shadow buckets
289 continue
290 inc(bucket_algs, bucket['alg'])
291 inc(bucket_types, bucket['type_id'])
292 inc(bucket_sizes, len(bucket['items']))
293
294 return {
295 'num_devices': len(crush['devices']),
296 'num_types': len(crush['types']),
297 'num_buckets': len(crush['buckets']),
298 'num_rules': len(crush['rules']),
299 'device_classes': list(device_classes.values()),
300 'tunables': crush['tunables'],
301 'compat_weight_set': '-1' in crush['choose_args'],
302 'num_weight_sets': len(crush['choose_args']),
303 'bucket_algs': bucket_algs,
304 'bucket_sizes': bucket_sizes,
305 'bucket_types': bucket_types,
306 }
307
308 def gather_configs(self):
309 # cluster config options
310 cluster = set()
311 r, outb, outs = self.mon_command({
312 'prefix': 'config dump',
313 'format': 'json'
314 });
315 if r != 0:
316 return {}
317 try:
318 dump = json.loads(outb)
319 except json.decoder.JSONDecodeError:
320 return {}
321 for opt in dump:
322 name = opt.get('name')
323 if name:
324 cluster.add(name)
325 # daemon-reported options (which may include ceph.conf)
326 active = set()
327 ls = self.get("modified_config_options");
328 for opt in ls.get('options', {}):
329 active.add(opt)
330 return {
331 'cluster_changed': sorted(list(cluster)),
332 'active_changed': sorted(list(active)),
333 }
334
11fdf7f2
TL
335 def gather_crashinfo(self):
336 crashlist = list()
eafe8130 337 errno, crashids, err = self.remote('crash', 'ls')
11fdf7f2
TL
338 if errno:
339 return ''
340 for crashid in crashids.split():
341 cmd = {'id': crashid}
342 errno, crashinfo, err = self.remote('crash', 'do_info', cmd, '')
343 if errno:
344 continue
345 c = json.loads(crashinfo)
346 del c['utsname_hostname']
92f5a8d4
TL
347 # entity_name might have more than one '.', beware
348 (etype, eid) = c.get('entity_name', '').split('.', 1)
eafe8130
TL
349 m = hashlib.sha1()
350 m.update(self.salt.encode('utf-8'))
351 m.update(eid.encode('utf-8'))
352 m.update(self.salt.encode('utf-8'))
353 c['entity_name'] = etype + '.' + m.hexdigest()
11fdf7f2
TL
354 crashlist.append(c)
355 return crashlist
356
eafe8130
TL
357 def get_active_channels(self):
358 r = []
359 if self.channel_basic:
360 r.append('basic')
361 if self.channel_crash:
362 r.append('crash')
363 if self.channel_device:
364 r.append('device')
365 return r
366
367 def gather_device_report(self):
368 try:
369 time_format = self.remote('devicehealth', 'get_time_format')
370 except:
371 return None
372 cutoff = datetime.utcnow() - timedelta(hours=self.interval * 2)
373 min_sample = cutoff.strftime(time_format)
374
375 devices = self.get('devices')['devices']
376
377 res = {} # anon-host-id -> anon-devid -> { timestamp -> record }
378 for d in devices:
379 devid = d['devid']
380 try:
381 # this is a map of stamp -> {device info}
382 m = self.remote('devicehealth', 'get_recent_device_metrics',
383 devid, min_sample)
384 except:
385 continue
386
387 # anonymize host id
388 try:
389 host = d['location'][0]['host']
390 except:
391 continue
392 anon_host = self.get_store('host-id/%s' % host)
393 if not anon_host:
394 anon_host = str(uuid.uuid1())
395 self.set_store('host-id/%s' % host, anon_host)
396 for dev, rep in m.items():
397 rep['host_id'] = anon_host
398
399 # anonymize device id
eafe8130
TL
400 anon_devid = self.get_store('devid-id/%s' % devid)
401 if not anon_devid:
9f95a23c 402 anon_devid = f"{devid.rsplit('_', 1)[0]}_{uuid.uuid1()}"
eafe8130
TL
403 self.set_store('devid-id/%s' % devid, anon_devid)
404 self.log.info('devid %s / %s, host %s / %s' % (devid, anon_devid,
405 host, anon_host))
406
407 # anonymize the smartctl report itself
92f5a8d4
TL
408 serial = devid.rsplit('_', 1)[1]
409 m_str = json.dumps(m)
410 m = json.loads(m_str.replace(serial, 'deleted'))
eafe8130
TL
411
412 if anon_host not in res:
413 res[anon_host] = {}
414 res[anon_host][anon_devid] = m
415 return res
416
417 def get_latest(self, daemon_type, daemon_name, stat):
418 data = self.get_counter(daemon_type, daemon_name, stat)[stat]
419 #self.log.error("get_latest {0} data={1}".format(stat, data))
420 if data:
421 return data[-1][1]
422 else:
423 return 0
424
425 def compile_report(self, channels=[]):
426 if not channels:
427 channels = self.get_active_channels()
11fdf7f2
TL
428 report = {
429 'leaderboard': False,
430 'report_version': 1,
eafe8130
TL
431 'report_timestamp': datetime.utcnow().isoformat(),
432 'report_id': self.report_id,
433 'channels': channels,
434 'channels_available': ALL_CHANNELS,
435 'license': LICENSE,
11fdf7f2
TL
436 }
437
eafe8130
TL
438 if 'ident' in channels:
439 if self.leaderboard:
440 report['leaderboard'] = True
441 for option in ['description', 'contact', 'organization']:
442 report[option] = getattr(self, option)
443
444 if 'basic' in channels:
445 mon_map = self.get('mon_map')
446 osd_map = self.get('osd_map')
447 service_map = self.get('service_map')
448 fs_map = self.get('fs_map')
449 df = self.get('df')
450
9f95a23c 451 report['created'] = mon_map['created']
eafe8130
TL
452
453 # mons
454 v1_mons = 0
455 v2_mons = 0
456 ipv4_mons = 0
457 ipv6_mons = 0
458 for mon in mon_map['mons']:
459 for a in mon['public_addrs']['addrvec']:
460 if a['type'] == 'v2':
461 v2_mons += 1
462 elif a['type'] == 'v1':
463 v1_mons += 1
464 if a['addr'].startswith('['):
465 ipv6_mons += 1
466 else:
467 ipv4_mons += 1
468 report['mon'] = {
469 'count': len(mon_map['mons']),
470 'features': mon_map['features'],
471 'min_mon_release': mon_map['min_mon_release'],
472 'v1_addr_mons': v1_mons,
473 'v2_addr_mons': v2_mons,
474 'ipv4_addr_mons': ipv4_mons,
475 'ipv6_addr_mons': ipv6_mons,
476 }
477
478 report['config'] = self.gather_configs()
479
480 # pools
481 report['rbd'] = {
482 'num_pools': 0,
483 'num_images_by_pool': [],
484 'mirroring_by_pool': [],
485 }
486 num_pg = 0
487 report['pools'] = list()
488 for pool in osd_map['pools']:
489 num_pg += pool['pg_num']
490 ec_profile = {}
491 if pool['erasure_code_profile']:
492 orig = osd_map['erasure_code_profiles'].get(
493 pool['erasure_code_profile'], {})
494 ec_profile = {
495 k: orig[k] for k in orig.keys()
496 if k in ['k', 'm', 'plugin', 'technique',
497 'crush-failure-domain', 'l']
498 }
499 report['pools'].append(
500 {
501 'pool': pool['pool'],
502 'type': pool['type'],
503 'pg_num': pool['pg_num'],
504 'pgp_num': pool['pg_placement_num'],
505 'size': pool['size'],
506 'min_size': pool['min_size'],
507 'pg_autoscale_mode': pool['pg_autoscale_mode'],
508 'target_max_bytes': pool['target_max_bytes'],
509 'target_max_objects': pool['target_max_objects'],
510 'type': ['', 'replicated', '', 'erasure'][pool['type']],
511 'erasure_code_profile': ec_profile,
512 'cache_mode': pool['cache_mode'],
513 }
514 )
515 if 'rbd' in pool['application_metadata']:
516 report['rbd']['num_pools'] += 1
517 ioctx = self.rados.open_ioctx(pool['pool_name'])
518 report['rbd']['num_images_by_pool'].append(
519 sum(1 for _ in rbd.RBD().list2(ioctx)))
520 report['rbd']['mirroring_by_pool'].append(
521 rbd.RBD().mirror_mode_get(ioctx) != rbd.RBD_MIRROR_MODE_DISABLED)
522
523 # osds
524 cluster_network = False
525 for osd in osd_map['osds']:
526 if osd['up'] and not cluster_network:
527 front_ip = osd['public_addrs']['addrvec'][0]['addr'].split(':')[0]
92f5a8d4 528 back_ip = osd['cluster_addrs']['addrvec'][0]['addr'].split(':')[0]
eafe8130
TL
529 if front_ip != back_ip:
530 cluster_network = True
531 report['osd'] = {
532 'count': len(osd_map['osds']),
533 'require_osd_release': osd_map['require_osd_release'],
534 'require_min_compat_client': osd_map['require_min_compat_client'],
535 'cluster_network': cluster_network,
536 }
537
538 # crush
539 report['crush'] = self.gather_crush_info()
540
541 # cephfs
542 report['fs'] = {
543 'count': len(fs_map['filesystems']),
544 'feature_flags': fs_map['feature_flags'],
545 'num_standby_mds': len(fs_map['standbys']),
546 'filesystems': [],
547 }
548 num_mds = len(fs_map['standbys'])
549 for fsm in fs_map['filesystems']:
550 fs = fsm['mdsmap']
551 num_sessions = 0
552 cached_ino = 0
553 cached_dn = 0
554 cached_cap = 0
555 subtrees = 0
556 rfiles = 0
557 rbytes = 0
558 rsnaps = 0
559 for gid, mds in fs['info'].items():
560 num_sessions += self.get_latest('mds', mds['name'],
561 'mds_sessions.session_count')
562 cached_ino += self.get_latest('mds', mds['name'],
563 'mds_mem.ino')
564 cached_dn += self.get_latest('mds', mds['name'],
565 'mds_mem.dn')
566 cached_cap += self.get_latest('mds', mds['name'],
567 'mds_mem.cap')
568 subtrees += self.get_latest('mds', mds['name'],
569 'mds.subtrees')
570 if mds['rank'] == 0:
571 rfiles = self.get_latest('mds', mds['name'],
572 'mds.root_rfiles')
573 rbytes = self.get_latest('mds', mds['name'],
574 'mds.root_rbytes')
575 rsnaps = self.get_latest('mds', mds['name'],
576 'mds.root_rsnaps')
577 report['fs']['filesystems'].append({
578 'max_mds': fs['max_mds'],
579 'ever_allowed_features': fs['ever_allowed_features'],
580 'explicitly_allowed_features': fs['explicitly_allowed_features'],
581 'num_in': len(fs['in']),
582 'num_up': len(fs['up']),
583 'num_standby_replay': len(
584 [mds for gid, mds in fs['info'].items()
585 if mds['state'] == 'up:standby-replay']),
586 'num_mds': len(fs['info']),
587 'num_sessions': num_sessions,
588 'cached_inos': cached_ino,
589 'cached_dns': cached_dn,
590 'cached_caps': cached_cap,
591 'cached_subtrees': subtrees,
592 'balancer_enabled': len(fs['balancer']) > 0,
593 'num_data_pools': len(fs['data_pools']),
594 'standby_count_wanted': fs['standby_count_wanted'],
595 'approx_ctime': fs['created'][0:7],
596 'files': rfiles,
597 'bytes': rbytes,
598 'snaps': rsnaps,
599 })
600 num_mds += len(fs['info'])
601 report['fs']['total_num_mds'] = num_mds
602
603 # daemons
604 report['metadata'] = dict()
605 report['metadata']['osd'] = self.gather_osd_metadata(osd_map)
606 report['metadata']['mon'] = self.gather_mon_metadata(mon_map)
607
608 # host counts
609 servers = self.list_servers()
610 self.log.debug('servers %s' % servers)
611 report['hosts'] = {
612 'num': len([h for h in servers if h['hostname']]),
613 }
614 for t in ['mon', 'mds', 'osd', 'mgr']:
615 report['hosts']['num_with_' + t] = len(
616 [h for h in servers
617 if len([s for s in h['services'] if s['type'] == t])]
618 )
619
620 report['usage'] = {
621 'pools': len(df['pools']),
92f5a8d4 622 'pg_num': num_pg,
eafe8130
TL
623 'total_used_bytes': df['stats']['total_used_bytes'],
624 'total_bytes': df['stats']['total_bytes'],
625 'total_avail_bytes': df['stats']['total_avail_bytes']
626 }
627
628 report['services'] = defaultdict(int)
629 for key, value in service_map['services'].items():
630 report['services'][key] += 1
631 if key == 'rgw':
632 report['rgw'] = {
633 'count': 0,
634 }
635 zones = set()
636 realms = set()
637 zonegroups = set()
638 frontends = set()
639 d = value.get('daemons', dict())
640
641 for k,v in d.items():
642 if k == 'summary' and v:
643 report['rgw'][k] = v
644 elif isinstance(v, dict) and 'metadata' in v:
645 report['rgw']['count'] += 1
646 zones.add(v['metadata']['zone_id'])
647 zonegroups.add(v['metadata']['zonegroup_id'])
648 frontends.add(v['metadata']['frontend_type#0'])
649
650 # we could actually iterate over all the keys of
651 # the dict and check for how many frontends there
652 # are, but it is unlikely that one would be running
653 # more than 2 supported ones
654 f2 = v['metadata'].get('frontend_type#1', None)
655 if f2:
656 frontends.add(f2)
657
658 report['rgw']['zones'] = len(zones)
659 report['rgw']['zonegroups'] = len(zonegroups)
660 report['rgw']['frontends'] = list(frontends) # sets aren't json-serializable
661
662 try:
663 report['balancer'] = self.remote('balancer', 'gather_telemetry')
664 except ImportError:
665 report['balancer'] = {
666 'active': False
11fdf7f2 667 }
11fdf7f2 668
eafe8130
TL
669 if 'crash' in channels:
670 report['crashes'] = self.gather_crashinfo()
11fdf7f2 671
eafe8130
TL
672 # NOTE: We do not include the 'device' channel in this report; it is
673 # sent to a different endpoint.
11fdf7f2
TL
674
675 return report
676
9f95a23c
TL
677 def _try_post(self, what, url, report):
678 self.log.info('Sending %s to: %s' % (what, url))
679 proxies = dict()
680 if self.proxy:
681 self.log.info('Send using HTTP(S) proxy: %s', self.proxy)
682 proxies['http'] = self.proxy
683 proxies['https'] = self.proxy
684 try:
685 resp = requests.put(url=url, json=report, proxies=proxies)
686 resp.raise_for_status()
687 except Exception as e:
688 fail_reason = 'Failed to send %s to %s: %s' % (what, url, str(e))
689 self.log.error(fail_reason)
690 return fail_reason
691 return None
692
eafe8130
TL
693 def send(self, report, endpoint=None):
694 if not endpoint:
695 endpoint = ['ceph', 'device']
696 failed = []
697 success = []
eafe8130 698 self.log.debug('Send endpoints %s' % endpoint)
eafe8130
TL
699 for e in endpoint:
700 if e == 'ceph':
9f95a23c
TL
701 fail_reason = self._try_post('ceph report', self.url, report)
702 if fail_reason:
703 failed.append(fail_reason)
eafe8130
TL
704 else:
705 now = int(time.time())
706 self.last_upload = now
707 self.set_store('last_upload', str(now))
708 success.append('Ceph report sent to {0}'.format(self.url))
709 self.log.info('Sent report to {0}'.format(self.url))
710 elif e == 'device':
711 if 'device' in self.get_active_channels():
eafe8130
TL
712 devices = self.gather_device_report()
713 num_devs = 0
714 num_hosts = 0
715 for host, ls in devices.items():
716 self.log.debug('host %s devices %s' % (host, ls))
717 if not len(ls):
718 continue
9f95a23c
TL
719 fail_reason = self._try_post('devices', self.device_url,
720 ls)
721 if fail_reason:
722 failed.append(fail_reason)
eafe8130
TL
723 else:
724 num_devs += len(ls)
725 num_hosts += 1
726 if num_devs:
727 success.append('Reported %d devices across %d hosts' % (
728 num_devs, len(devices)))
729 if failed:
730 return 1, '', '\n'.join(success + failed)
731 return 0, '', '\n'.join(success)
11fdf7f2
TL
732
    def handle_command(self, inbuf, command):
        """Dispatch one of the 'telemetry ...' CLI commands declared in
        COMMANDS.  Returns the usual (retval, stdout, stderr) tuple."""
        if command['prefix'] == 'telemetry status':
            # dump the live value of every module option, plus last upload time
            r = {}
            for opt in self.MODULE_OPTIONS:
                r[opt['name']] = getattr(self, opt['name'])
            r['last_upload'] = time.ctime(self.last_upload) if self.last_upload else self.last_upload
            return 0, json.dumps(r, indent=4, sort_keys=True), ''
        elif command['prefix'] == 'telemetry on':
            # opting in requires explicit acknowledgement of the data license
            if command.get('license') != LICENSE:
                return -errno.EPERM, '', "Telemetry data is licensed under the " + LICENSE_NAME + " (" + LICENSE_URL + ").\nTo enable, add '--license " + LICENSE + "' to the 'ceph telemetry on' command."
            self.set_module_option('enabled', True)
            self.set_module_option('last_opt_revision', REVISION)
            return 0, '', ''
        elif command['prefix'] == 'telemetry off':
            self.set_module_option('enabled', False)
            # reset so a future 'telemetry on' re-acknowledges everything
            self.set_module_option('last_opt_revision', 1)
            return 0, '', ''
        elif command['prefix'] == 'telemetry send':
            # manual send while effectively opted out also requires the license
            if self.last_opt_revision < LAST_REVISION_RE_OPT_IN and command.get('license') != LICENSE:
                self.log.debug('A telemetry send attempt while opted-out. Asking for license agreement')
                return -errno.EPERM, '', "Telemetry data is licensed under the " + LICENSE_NAME + " (" + LICENSE_URL + ").\nTo manually send telemetry data, add '--license " + LICENSE + "' to the 'ceph telemetry send' command.\nPlease consider enabling the telemetry module with 'ceph telemetry on'."
            self.last_report = self.compile_report()
            return self.send(self.last_report, command.get('endpoint'))

        elif command['prefix'] == 'telemetry show':
            report = self.compile_report(
                channels=command.get('channels', None)
            )
            report = json.dumps(report, indent=4, sort_keys=True)
            if self.channel_device:
                report += '\n \nDevice report is generated separately. To see it run \'ceph telemetry show-device\'.'
            return 0, report, ''
        elif command['prefix'] == 'telemetry show-device':
            return 0, json.dumps(self.gather_device_report(), indent=4, sort_keys=True), ''
        else:
            return (-errno.EINVAL, '',
                    "Command not found '{0}'".format(command['prefix']))
770
    def self_test(self):
        # NOTE(review): this definition is shadowed by a second
        # 'def self_test' later in this class, so this validating version
        # is never the one bound at runtime -- the duplicate definitions
        # should be reconciled.
        report = self.compile_report()
        if len(report) == 0:
            raise RuntimeError('Report is empty')

        if 'report_id' not in report:
            raise RuntimeError('report_id not found in report')
778
    def shutdown(self):
        # Stop the serve() loop and wake it so it exits promptly.
        self.run = False
        self.event.set()
782
eafe8130
TL
783 def refresh_health_checks(self):
784 health_checks = {}
785 if self.enabled and self.last_opt_revision < LAST_REVISION_RE_OPT_IN:
786 health_checks['TELEMETRY_CHANGED'] = {
787 'severity': 'warning',
788 'summary': 'Telemetry requires re-opt-in',
789 'detail': [
790 'telemetry report includes new information; must re-opt-in (or out)'
791 ]
792 }
793 self.set_health_checks(health_checks)
794
11fdf7f2
TL
795 def serve(self):
796 self.load()
797 self.config_notify()
798 self.run = True
799
800 self.log.debug('Waiting for mgr to warm up')
801 self.event.wait(10)
802
803 while self.run:
eafe8130
TL
804 self.event.clear()
805
806 self.refresh_health_checks()
807
808 if self.last_opt_revision < LAST_REVISION_RE_OPT_IN:
809 self.log.debug('Not sending report until user re-opts-in')
810 self.event.wait(1800)
811 continue
11fdf7f2 812 if not self.enabled:
eafe8130 813 self.log.debug('Not sending report until configured to do so')
11fdf7f2
TL
814 self.event.wait(1800)
815 continue
816
817 now = int(time.time())
818 if not self.last_upload or (now - self.last_upload) > \
819 self.interval * 3600:
820 self.log.info('Compiling and sending report to %s',
821 self.url)
822
823 try:
824 self.last_report = self.compile_report()
825 except:
826 self.log.exception('Exception while compiling report:')
827
eafe8130 828 self.send(self.last_report)
11fdf7f2 829 else:
eafe8130 830 self.log.debug('Interval for sending new report has not expired')
11fdf7f2
TL
831
832 sleep = 3600
833 self.log.debug('Sleeping for %d seconds', sleep)
834 self.event.wait(sleep)
835
836 def self_test(self):
837 self.compile_report()
838 return True
839
    @staticmethod
    def can_run():
        # Telemetry has no special runtime prerequisites.
        return True, ''