# ceph/src/pybind/mgr/telemetry/module.py (ceph quincy 17.2.1)
"""
Telemetry module for ceph-mgr

Collect statistics from the Ceph cluster and send them back to the Ceph
project when the user has opted in.
"""
7 import logging
8 import numbers
9 import enum
10 import errno
11 import hashlib
12 import json
13 import rbd
14 import requests
15 import uuid
16 import time
17 from datetime import datetime, timedelta
18 from prettytable import PrettyTable
19 from threading import Event, Lock
20 from collections import defaultdict
21 from typing import cast, Any, DefaultDict, Dict, List, Optional, Tuple, TypeVar, TYPE_CHECKING, Union
22
23 from mgr_module import CLICommand, CLIReadCommand, MgrModule, Option, OptionValue, ServiceInfoT
24
25
# All channel names a cluster can opt in to; mirrored by the channel_* options.
ALL_CHANNELS = ['basic', 'ident', 'crash', 'device', 'perf']

# License the user must explicitly acknowledge when opting in.
LICENSE = 'sharing-1-0'
LICENSE_NAME = 'Community Data License Agreement - Sharing - Version 1.0'
LICENSE_URL = 'https://cdla.io/sharing-1-0/'
# Module-level counter used to build throwaway fallback salts when no
# cluster salt is available (see anonymize_entity_name).
NO_SALT_CNT = 0

# Latest revision of the telemetry report. Bump this each time we make
# *any* change.
REVISION = 3
36
37 # History of revisions
38 # --------------------
39 #
40 # Version 1:
41 # Mimic and/or nautilus are lumped together here, since
42 # we didn't track revisions yet.
43 #
44 # Version 2:
45 # - added revision tracking, nagging, etc.
46 # - added config option changes
47 # - added channels
48 # - added explicit license acknowledgement to the opt-in process
49 #
50 # Version 3:
51 # - added device health metrics (i.e., SMART data, minus serial number)
52 # - remove crush_rule
53 # - added CephFS metadata (how many MDSs, fs features, how many data pools,
54 # how much metadata is cached, rfiles, rbytes, rsnapshots)
55 # - added more pool metadata (rep vs ec, cache tiering mode, ec profile)
56 # - added host count, and counts for hosts with each of (mon, osd, mds, mgr)
57 # - whether an OSD cluster network is in use
58 # - rbd pool and image count, and rbd mirror mode (pool-level)
59 # - rgw daemons, zones, zonegroups; which rgw frontends
60 # - crush map stats
61
class Collection(str, enum.Enum):
    """Named telemetry data collections a cluster can opt in to.

    Member names follow a '<channel>_<name>' convention; see
    MODULE_COLLECTION for each collection's channel, description and
    nag flag.
    """
    basic_base = 'basic_base'
    device_base = 'device_base'
    crash_base = 'crash_base'
    ident_base = 'ident_base'
    perf_perf = 'perf_perf'
    basic_mds_metadata = 'basic_mds_metadata'
    basic_pool_usage = 'basic_pool_usage'
    basic_usage_by_class = 'basic_usage_by_class'
    basic_rook_v01 = 'basic_rook_v01'
72
# Registry of all collections: the Collection member, a human-readable
# description, the channel that carries the collection, and whether the
# user should be nagged to opt in ("nag").
MODULE_COLLECTION: List[Dict[str, Any]] = [
    {
        "name": Collection.basic_base,
        "description": "Basic information about the cluster (capacity, number and type of daemons, version, etc.)",
        "channel": "basic",
        "nag": False
    },
    {
        "name": Collection.device_base,
        "description": "Information about device health metrics",
        "channel": "device",
        "nag": False
    },
    {
        "name": Collection.crash_base,
        "description": "Information about daemon crashes (daemon type and version, backtrace, etc.)",
        "channel": "crash",
        "nag": False
    },
    {
        "name": Collection.ident_base,
        "description": "User-provided identifying information about the cluster",
        "channel": "ident",
        "nag": False
    },
    {
        "name": Collection.perf_perf,
        "description": "Information about performance counters of the cluster",
        "channel": "perf",
        "nag": True
    },
    {
        "name": Collection.basic_mds_metadata,
        "description": "MDS metadata",
        "channel": "basic",
        "nag": False
    },
    {
        "name": Collection.basic_pool_usage,
        "description": "Default pool application and usage statistics",
        "channel": "basic",
        "nag": False
    },
    {
        "name": Collection.basic_usage_by_class,
        "description": "Default device class usage statistics",
        "channel": "basic",
        "nag": False
    },
    {
        "name": Collection.basic_rook_v01,
        "description": "Basic Rook deployment data",
        "channel": "basic",
        "nag": True
    },
]
129
# KV-store keys reported for Rook deployments, each tagged with the
# collection that gates its reporting (currently all basic_rook_v01).
ROOK_KEYS_BY_COLLECTION: List[Tuple[str, Collection]] = [
    # Note: a key cannot be both a node and a leaf, e.g.
    # "rook/a/b"
    # "rook/a/b/c"
    ("rook/version", Collection.basic_rook_v01),
    ("rook/kubernetes/version", Collection.basic_rook_v01),
    ("rook/csi/version", Collection.basic_rook_v01),
    ("rook/node/count/kubernetes-total", Collection.basic_rook_v01),
    ("rook/node/count/with-ceph-daemons", Collection.basic_rook_v01),
    ("rook/node/count/with-csi-rbd-plugin", Collection.basic_rook_v01),
    ("rook/node/count/with-csi-cephfs-plugin", Collection.basic_rook_v01),
    ("rook/node/count/with-csi-nfs-plugin", Collection.basic_rook_v01),
    ("rook/usage/storage-class/count/total", Collection.basic_rook_v01),
    ("rook/usage/storage-class/count/rbd", Collection.basic_rook_v01),
    ("rook/usage/storage-class/count/cephfs", Collection.basic_rook_v01),
    ("rook/usage/storage-class/count/nfs", Collection.basic_rook_v01),
    ("rook/usage/storage-class/count/bucket", Collection.basic_rook_v01),
    ("rook/cluster/storage/device-set/count/total", Collection.basic_rook_v01),
    ("rook/cluster/storage/device-set/count/portable", Collection.basic_rook_v01),
    ("rook/cluster/storage/device-set/count/non-portable", Collection.basic_rook_v01),
    ("rook/cluster/mon/count", Collection.basic_rook_v01),
    ("rook/cluster/mon/allow-multiple-per-node", Collection.basic_rook_v01),
    ("rook/cluster/mon/max-id", Collection.basic_rook_v01),
    ("rook/cluster/mon/pvc/enabled", Collection.basic_rook_v01),
    ("rook/cluster/mon/stretch/enabled", Collection.basic_rook_v01),
    ("rook/cluster/network/provider", Collection.basic_rook_v01),
    ("rook/cluster/external-mode", Collection.basic_rook_v01),
]
158
159 class Module(MgrModule):
    # Daemon metadata fields histogrammed by the gather_*_metadata() helpers.
    metadata_keys = [
        "arch",
        "ceph_version",
        "os",
        "cpu",
        "kernel_description",
        "kernel_version",
        "distro_description",
        "distro"
    ]
170
    # Config options for this module; config_update_module_option() mirrors
    # each one onto an attribute of the same name.
    MODULE_OPTIONS = [
        Option(name='url',
               type='str',
               default='https://telemetry.ceph.com/report'),
        Option(name='device_url',
               type='str',
               default='https://telemetry.ceph.com/device'),
        Option(name='enabled',
               type='bool',
               default=False),
        Option(name='last_opt_revision',
               type='int',
               default=1),
        Option(name='leaderboard',
               type='bool',
               default=False),
        Option(name='description',
               type='str',
               default=None),
        Option(name='contact',
               type='str',
               default=None),
        Option(name='organization',
               type='str',
               default=None),
        Option(name='proxy',
               type='str',
               default=None),
        Option(name='interval',
               type='int',
               default=24,
               min=8),
        Option(name='channel_basic',
               type='bool',
               default=True,
               desc='Share basic cluster information (size, version)'),
        Option(name='channel_ident',
               type='bool',
               default=False,
               desc='Share a user-provided description and/or contact email for the cluster'),
        Option(name='channel_crash',
               type='bool',
               default=True,
               desc='Share metadata about Ceph daemon crashes (version, stack straces, etc)'),
        Option(name='channel_device',
               type='bool',
               default=True,
               desc=('Share device health metrics '
                     '(e.g., SMART data, minus potentially identifying info like serial numbers)')),
        Option(name='channel_perf',
               type='bool',
               default=False,
               desc='Share various performance metrics of a cluster'),
    ]
225
226 @property
227 def config_keys(self) -> Dict[str, OptionValue]:
228 return dict((o['name'], o.get('default', None)) for o in self.MODULE_OPTIONS)
229
    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Initialize in-memory state; persisted values are filled in by load()."""
        super(Module, self).__init__(*args, **kwargs)
        self.event = Event()  # wakes the serve() thread on config changes
        self.run = False
        # None means "not loaded from the store yet"
        self.db_collection: Optional[List[str]] = None
        self.last_opted_in_ceph_version: Optional[int] = None
        self.last_opted_out_ceph_version: Optional[int] = None
        self.last_upload: Optional[int] = None
        self.last_report: Dict[str, Any] = dict()
        self.report_id: Optional[str] = None
        self.salt: Optional[str] = None
        self.get_report_lock = Lock()
        self.config_update_module_option()
        # for mypy which does not run the code
        if TYPE_CHECKING:
            self.url = ''
            self.device_url = ''
            self.enabled = False
            self.last_opt_revision = 0
            self.leaderboard = ''
            self.interval = 0
            self.proxy = ''
            self.channel_basic = True
            self.channel_ident = False
            self.channel_crash = True
            self.channel_device = True
            self.channel_perf = False
            self.db_collection = ['basic_base', 'device_base']
            self.last_opted_in_ceph_version = 17
            self.last_opted_out_ceph_version = 0
260
261 def config_update_module_option(self) -> None:
262 for opt in self.MODULE_OPTIONS:
263 setattr(self,
264 opt['name'],
265 self.get_module_option(opt['name']))
266 self.log.debug(' %s = %s', opt['name'], getattr(self, opt['name']))
267
268 def config_notify(self) -> None:
269 self.config_update_module_option()
270 # wake up serve() thread
271 self.event.set()
272
273 def load(self) -> None:
274 last_upload = self.get_store('last_upload', None)
275 if last_upload is None:
276 self.last_upload = None
277 else:
278 self.last_upload = int(last_upload)
279
280 report_id = self.get_store('report_id', None)
281 if report_id is None:
282 self.report_id = str(uuid.uuid4())
283 self.set_store('report_id', self.report_id)
284 else:
285 self.report_id = report_id
286
287 salt = self.get_store('salt', None)
288 if salt is None:
289 self.salt = str(uuid.uuid4())
290 self.set_store('salt', self.salt)
291 else:
292 self.salt = salt
293
294 self.init_collection()
295
296 last_opted_in_ceph_version = self.get_store('last_opted_in_ceph_version', None)
297 if last_opted_in_ceph_version is None:
298 self.last_opted_in_ceph_version = None
299 else:
300 self.last_opted_in_ceph_version = int(last_opted_in_ceph_version)
301
302 last_opted_out_ceph_version = self.get_store('last_opted_out_ceph_version', None)
303 if last_opted_out_ceph_version is None:
304 self.last_opted_out_ceph_version = None
305 else:
306 self.last_opted_out_ceph_version = int(last_opted_out_ceph_version)
307
308 def gather_osd_metadata(self,
309 osd_map: Dict[str, List[Dict[str, int]]]) -> Dict[str, Dict[str, int]]:
310 keys = ["osd_objectstore", "rotational"]
311 keys += self.metadata_keys
312
313 metadata: Dict[str, Dict[str, int]] = dict()
314 for key in keys:
315 metadata[key] = defaultdict(int)
316
317 for osd in osd_map['osds']:
318 res = self.get_metadata('osd', str(osd['osd']))
319 if res is None:
320 self.log.debug('Could not get metadata for osd.%s' % str(osd['osd']))
321 continue
322 for k, v in res.items():
323 if k not in keys:
324 continue
325
326 metadata[k][v] += 1
327
328 return metadata
329
330 def gather_mon_metadata(self,
331 mon_map: Dict[str, List[Dict[str, str]]]) -> Dict[str, Dict[str, int]]:
332 keys = list()
333 keys += self.metadata_keys
334
335 metadata: Dict[str, Dict[str, int]] = dict()
336 for key in keys:
337 metadata[key] = defaultdict(int)
338
339 for mon in mon_map['mons']:
340 res = self.get_metadata('mon', mon['name'])
341 if res is None:
342 self.log.debug('Could not get metadata for mon.%s' % (mon['name']))
343 continue
344 for k, v in res.items():
345 if k not in keys:
346 continue
347
348 metadata[k][v] += 1
349
350 return metadata
351
352 def gather_mds_metadata(self) -> Dict[str, Dict[str, int]]:
353 metadata: Dict[str, Dict[str, int]] = dict()
354
355 res = self.get('mds_metadata') # metadata of *all* mds daemons
356 if res is None or not res:
357 self.log.debug('Could not get metadata for mds daemons')
358 return metadata
359
360 keys = list()
361 keys += self.metadata_keys
362
363 for key in keys:
364 metadata[key] = defaultdict(int)
365
366 for mds in res.values():
367 for k, v in mds.items():
368 if k not in keys:
369 continue
370
371 metadata[k][v] += 1
372
373 return metadata
374
    def gather_crush_info(self) -> Dict[str, Union[int,
                                                   bool,
                                                   List[int],
                                                   Dict[str, int],
                                                   Dict[int, int]]]:
        """Summarize the CRUSH map: device/type/bucket/rule counts and histograms."""
        osdmap = self.get_osdmap()
        crush_raw = osdmap.get_crush()
        crush = crush_raw.dump()

        BucketKeyT = TypeVar('BucketKeyT', int, str)

        def inc(d: Dict[BucketKeyT, int], k: BucketKeyT) -> None:
            # defaultdict-style increment without changing the dict type
            if k in d:
                d[k] += 1
            else:
                d[k] = 1

        device_classes: Dict[str, int] = {}
        for dev in crush['devices']:
            inc(device_classes, dev.get('class', ''))

        bucket_algs: Dict[str, int] = {}
        # NOTE(review): keys here come from bucket['type_id'], which looks
        # like an int, so Dict[int, int] is presumably the right annotation
        # (the return annotation's Union already allows it) — TODO confirm.
        bucket_types: Dict[str, int] = {}
        bucket_sizes: Dict[int, int] = {}
        for bucket in crush['buckets']:
            if '~' in bucket['name']:  # ignore shadow buckets
                continue
            inc(bucket_algs, bucket['alg'])
            inc(bucket_types, bucket['type_id'])
            inc(bucket_sizes, len(bucket['items']))

        return {
            'num_devices': len(crush['devices']),
            'num_types': len(crush['types']),
            'num_buckets': len(crush['buckets']),
            'num_rules': len(crush['rules']),
            # only per-class counts are reported, not the class names
            'device_classes': list(device_classes.values()),
            'tunables': crush['tunables'],
            'compat_weight_set': '-1' in crush['choose_args'],
            'num_weight_sets': len(crush['choose_args']),
            'bucket_algs': bucket_algs,
            'bucket_sizes': bucket_sizes,
            'bucket_types': bucket_types,
        }
419
420 def gather_configs(self) -> Dict[str, List[str]]:
421 # cluster config options
422 cluster = set()
423 r, outb, outs = self.mon_command({
424 'prefix': 'config dump',
425 'format': 'json'
426 })
427 if r != 0:
428 return {}
429 try:
430 dump = json.loads(outb)
431 except json.decoder.JSONDecodeError:
432 return {}
433 for opt in dump:
434 name = opt.get('name')
435 if name:
436 cluster.add(name)
437 # daemon-reported options (which may include ceph.conf)
438 active = set()
439 ls = self.get("modified_config_options")
440 for opt in ls.get('options', {}):
441 active.add(opt)
442 return {
443 'cluster_changed': sorted(list(cluster)),
444 'active_changed': sorted(list(active)),
445 }
446
447 def anonymize_entity_name(self, entity_name:str) -> str:
448 if '.' not in entity_name:
449 self.log.debug(f"Cannot split entity name ({entity_name}), no '.' is found")
450 return entity_name
451
452 (etype, eid) = entity_name.split('.', 1)
453 m = hashlib.sha1()
454 salt = ''
455 if self.salt is not None:
456 salt = self.salt
457 # avoid asserting that salt exists
458 if not self.salt:
459 # do not set self.salt to a temp value
460 salt = f"no_salt_found_{NO_SALT_CNT}"
461 NO_SALT_CNT += 1
462 self.log.debug(f"No salt found, created a temp one: {salt}")
463 m.update(salt.encode('utf-8'))
464 m.update(eid.encode('utf-8'))
465 m.update(salt.encode('utf-8'))
466
467 return etype + '.' + m.hexdigest()
468
    def get_heap_stats(self) -> Dict[str, dict]:
        """Collect tcmalloc heap stats from every OSD via the 'heap stats' command.

        Returns {'osd.<id>': {category: value}}; OSDs that fail the command
        or whose output does not contain tcmalloc stats are skipped.
        """
        # Initialize result dict
        result: Dict[str, dict] = defaultdict(lambda: defaultdict(int))

        # Get list of osd ids from the metadata
        osd_metadata = self.get('osd_metadata')

        # Grab output from the "osd.x heap stats" command
        for osd_id in osd_metadata:
            cmd_dict = {
                'prefix': 'heap',
                'heapcmd': 'stats',
                'id': str(osd_id),
            }
            r, outb, outs = self.osd_command(cmd_dict)
            if r != 0:
                self.log.debug("Invalid command dictionary.")
                continue
            else:
                if 'tcmalloc heap stats' in outs:
                    # pull every integer token out of the textual report, in order
                    values = [int(i) for i in outs.split() if i.isdigit()]
                    # `categories` must be ordered this way for the correct output to be parsed
                    categories = ['use_by_application',
                                  'page_heap_freelist',
                                  'central_cache_freelist',
                                  'transfer_cache_freelist',
                                  'thread_cache_freelists',
                                  'malloc_metadata',
                                  'actual_memory_used',
                                  'released_to_os',
                                  'virtual_address_space_used',
                                  'spans_in_use',
                                  'thread_heaps_in_use',
                                  'tcmalloc_page_size']
                    if len(values) != len(categories):
                        self.log.debug('Received unexpected output from osd.{}; number of values should match the number of expected categories:\n' \
                                       'values: len={} {} ~ categories: len={} {} ~ outs: {}'.format(osd_id, len(values), values, len(categories), categories, outs))
                        continue
                    osd = 'osd.' + str(osd_id)
                    result[osd] = dict(zip(categories, values))
                else:
                    self.log.debug('No heap stats available on osd.{}: {}'.format(osd_id, outs))
                    continue

        return result
514
515 def get_mempool(self, mode: str = 'separated') -> Dict[str, dict]:
516 # Initialize result dict
517 result: Dict[str, dict] = defaultdict(lambda: defaultdict(int))
518
519 # Get list of osd ids from the metadata
520 osd_metadata = self.get('osd_metadata')
521
522 # Grab output from the "osd.x dump_mempools" command
523 for osd_id in osd_metadata:
524 cmd_dict = {
525 'prefix': 'dump_mempools',
526 'id': str(osd_id),
527 'format': 'json'
528 }
529 r, outb, outs = self.osd_command(cmd_dict)
530 if r != 0:
531 self.log.debug("Invalid command dictionary.")
532 continue
533 else:
534 try:
535 # This is where the mempool will land.
536 dump = json.loads(outb)
537 if mode == 'separated':
538 result["osd." + str(osd_id)] = dump['mempool']['by_pool']
539 elif mode == 'aggregated':
540 for mem_type in dump['mempool']['by_pool']:
541 result[mem_type]['bytes'] += dump['mempool']['by_pool'][mem_type]['bytes']
542 result[mem_type]['items'] += dump['mempool']['by_pool'][mem_type]['items']
543 else:
544 self.log.debug("Incorrect mode specified in get_mempool")
545 except (json.decoder.JSONDecodeError, KeyError) as e:
546 self.log.debug("Error caught on osd.{}: {}".format(osd_id, e))
547 continue
548
549 return result
550
    def get_osd_histograms(self, mode: str = 'separated') -> List[Dict[str, dict]]:
        """Collect 2D perf histograms from every OSD via 'perf histogram dump'.

        Histograms are grouped by their axis configuration: the stringified
        axes description is used as a grouping key while building the result
        and dropped before returning (only the values are kept).
        mode='separated' records per-OSD value grids; mode='aggregated' sums
        the grids element-wise across OSDs. Returns [] on an invalid mode.
        """
        # Initialize result dict
        result: Dict[str, dict] = defaultdict(lambda: defaultdict(
                                              lambda: defaultdict(
                                              lambda: defaultdict(
                                              lambda: defaultdict(
                                              lambda: defaultdict(int))))))

        # Get list of osd ids from the metadata
        osd_metadata = self.get('osd_metadata')

        # Grab output from the "osd.x perf histogram dump" command
        for osd_id in osd_metadata:
            cmd_dict = {
                'prefix': 'perf histogram dump',
                'id': str(osd_id),
                'format': 'json'
            }
            r, outb, outs = self.osd_command(cmd_dict)
            # Check for invalid calls
            if r != 0:
                self.log.debug("Invalid command dictionary.")
                continue
            else:
                try:
                    # This is where the histograms will land if there are any.
                    dump = json.loads(outb)

                    for histogram in dump['osd']:
                        # Log axis information. There are two axes, each represented
                        # as a dictionary. Both dictionaries are contained inside a
                        # list called 'axes'.
                        axes = []
                        for axis in dump['osd'][histogram]['axes']:

                            # This is the dict that contains information for an individual
                            # axis. It will be appended to the 'axes' list at the end.
                            axis_dict: Dict[str, Any] = defaultdict()

                            # Collecting information for buckets, min, name, etc.
                            axis_dict['buckets'] = axis['buckets']
                            axis_dict['min'] = axis['min']
                            axis_dict['name'] = axis['name']
                            axis_dict['quant_size'] = axis['quant_size']
                            axis_dict['scale_type'] = axis['scale_type']

                            # Collecting ranges; placing them in lists to
                            # improve readability later on.
                            ranges = []
                            for _range in axis['ranges']:
                                _max, _min = None, None
                                if 'max' in _range:
                                    _max = _range['max']
                                if 'min' in _range:
                                    _min = _range['min']
                                ranges.append([_min, _max])
                            axis_dict['ranges'] = ranges

                            # Now that 'axis_dict' contains all the appropriate
                            # information for the current axis, append it to the 'axes' list.
                            # There will end up being two axes in the 'axes' list, since the
                            # histograms are 2D.
                            axes.append(axis_dict)

                        # Add the 'axes' list, containing both axes, to result.
                        # At this point, you will see that the name of the key is the string
                        # form of our axes list (str(axes)). This is there so that histograms
                        # with different axis configs will not be combined.
                        # These key names are later dropped when only the values are returned.
                        result[str(axes)][histogram]['axes'] = axes

                        # Collect current values and make sure they are in
                        # integer form.
                        values = []
                        for value_list in dump['osd'][histogram]['values']:
                            values.append([int(v) for v in value_list])

                        if mode == 'separated':
                            if 'osds' not in result[str(axes)][histogram]:
                                result[str(axes)][histogram]['osds'] = []
                            result[str(axes)][histogram]['osds'].append({'osd_id': int(osd_id), 'values': values})

                        elif mode == 'aggregated':
                            # Aggregate values. If 'values' have already been initialized,
                            # we can safely add.
                            if 'values' in result[str(axes)][histogram]:
                                for i in range(0, len(values)):
                                    for j in range(0, len(values[i])):
                                        values[i][j] += result[str(axes)][histogram]['values'][i][j]

                            # Add the values to result.
                            result[str(axes)][histogram]['values'] = values

                            # Update num_combined_osds
                            if 'num_combined_osds' not in result[str(axes)][histogram]:
                                result[str(axes)][histogram]['num_combined_osds'] = 1
                            else:
                                result[str(axes)][histogram]['num_combined_osds'] += 1
                        else:
                            self.log.error('Incorrect mode specified in get_osd_histograms: {}'.format(mode))
                            return list()

                # Sometimes, json errors occur if you give it an empty string.
                # I am also putting in a catch for a KeyError since it could
                # happen where the code is assuming that a key exists in the
                # schema when it doesn't. In either case, we'll handle that
                # by continuing and collecting what we can from other osds.
                except (json.decoder.JSONDecodeError, KeyError) as e:
                    self.log.debug("Error caught on osd.{}: {}".format(osd_id, e))
                    continue

        return list(result.values())
663
664 def get_io_rate(self) -> dict:
665 return self.get('io_rate')
666
667 def get_stats_per_pool(self) -> dict:
668 result = self.get('pg_dump')['pool_stats']
669
670 # collect application metadata from osd_map
671 osd_map = self.get('osd_map')
672 application_metadata = {pool['pool']: pool['application_metadata'] for pool in osd_map['pools']}
673
674 # add application to each pool from pg_dump
675 for pool in result:
676 pool['application'] = []
677 # Only include default applications
678 for application in application_metadata[pool['poolid']]:
679 if application in ['cephfs', 'mgr', 'rbd', 'rgw']:
680 pool['application'].append(application)
681
682 return result
683
684 def get_stats_per_pg(self) -> dict:
685 return self.get('pg_dump')['pg_stats']
686
687 def get_rocksdb_stats(self) -> Dict[str, str]:
688 # Initalizers
689 result: Dict[str, str] = defaultdict()
690 version = self.get_rocksdb_version()
691
692 # Update result
693 result['version'] = version
694
695 return result
696
697 def gather_crashinfo(self) -> List[Dict[str, str]]:
698 crashlist: List[Dict[str, str]] = list()
699 errno, crashids, err = self.remote('crash', 'ls')
700 if errno:
701 return crashlist
702 for crashid in crashids.split():
703 errno, crashinfo, err = self.remote('crash', 'do_info', crashid)
704 if errno:
705 continue
706 c = json.loads(crashinfo)
707
708 # redact hostname
709 del c['utsname_hostname']
710
711 # entity_name might have more than one '.', beware
712 (etype, eid) = c.get('entity_name', '').split('.', 1)
713 m = hashlib.sha1()
714 assert self.salt
715 m.update(self.salt.encode('utf-8'))
716 m.update(eid.encode('utf-8'))
717 m.update(self.salt.encode('utf-8'))
718 c['entity_name'] = etype + '.' + m.hexdigest()
719
720 # redact final line of python tracebacks, as the exception
721 # payload may contain identifying information
722 if 'mgr_module' in c and 'backtrace' in c:
723 # backtrace might be empty
724 if len(c['backtrace']) > 0:
725 c['backtrace'][-1] = '<redacted>'
726
727 crashlist.append(c)
728 return crashlist
729
    def gather_perf_counters(self, mode: str = 'separated') -> Dict[str, dict]:
        """Collect perf counters of all daemons via get_all_perf_counters().

        mode='separated' keys the result by daemon name (anonymized for all
        daemon types except osds); mode='aggregated' sums counters per
        daemon type. Returns {} on an invalid mode.
        """
        # Extract perf counter data with get_all_perf_counters(), a method
        # from mgr/mgr_module.py. This method returns a nested dictionary that
        # looks a lot like perf schema, except with some additional fields.
        #
        # Example of output, a snapshot of a mon daemon:
        #   "mon.b": {
        #       "bluestore.kv_flush_lat": {
        #           "count": 2431,
        #           "description": "Average kv_thread flush latency",
        #           "nick": "fl_l",
        #           "priority": 8,
        #           "type": 5,
        #           "units": 1,
        #           "value": 88814109
        #       },
        #   },
        all_perf_counters = self.get_all_perf_counters()

        # Initialize 'result' dict
        result: Dict[str, dict] = defaultdict(lambda: defaultdict(
            lambda: defaultdict(lambda: defaultdict(int))))

        # 'separated' mode
        anonymized_daemon_dict = {}

        for daemon, all_perf_counters_by_daemon in all_perf_counters.items():
            daemon_type = daemon[0:3]  # i.e. 'mds', 'osd', 'rgw'

            if mode == 'separated':
                # anonymize individual daemon names except osds
                if (daemon_type != 'osd'):
                    anonymized_daemon = self.anonymize_entity_name(daemon)
                    anonymized_daemon_dict[anonymized_daemon] = daemon
                    daemon = anonymized_daemon

            # Calculate num combined daemon types if in aggregated mode
            if mode == 'aggregated':
                if 'num_combined_daemons' not in result[daemon_type]:
                    result[daemon_type]['num_combined_daemons'] = 1
                else:
                    result[daemon_type]['num_combined_daemons'] += 1

            for collection in all_perf_counters_by_daemon:
                # Split the collection to avoid redundancy in final report; i.e.:
                #   bluestore.kv_flush_lat, bluestore.kv_final_lat -->
                #   bluestore: kv_flush_lat, kv_final_lat
                col_0, col_1 = collection.split('.')

                # Debug log for empty keys. This initially was a problem for prioritycache
                # perf counters, where the col_0 was empty for certain mon counters:
                #
                # "mon.a": {                             instead of    "mon.a": {
                #      "": {                                               "prioritycache": {
                #        "cache_bytes": {...},                                   "cache_bytes": {...},
                #
                # This log is here to detect any future instances of a similar issue.
                if (daemon == "") or (col_0 == "") or (col_1 == ""):
                    self.log.debug("Instance of an empty key: {}{}".format(daemon, collection))

                if mode == 'separated':
                    # Add value to result
                    result[daemon][col_0][col_1]['value'] = \
                        all_perf_counters_by_daemon[collection]['value']

                    # Check that 'count' exists, as not all counters have a count field.
                    if 'count' in all_perf_counters_by_daemon[collection]:
                        result[daemon][col_0][col_1]['count'] = \
                            all_perf_counters_by_daemon[collection]['count']
                elif mode == 'aggregated':
                    # Not every rgw daemon has the same schema. Specifically, each rgw daemon
                    # has a uniquely-named collection that starts off identically (i.e.
                    # "objecter-0x...") then diverges (i.e. "...55f4e778e140.op_rmw").
                    # This bit of code combines these unique counters all under one rgw instance.
                    # Without this check, the schema would remain separeted out in the final report.
                    if col_0[0:11] == "objecter-0x":
                        col_0 = "objecter-0x"

                    # Check that the value can be incremented. In some cases,
                    # the files are of type 'pair' (real-integer-pair, integer-integer pair).
                    # In those cases, the value is a dictionary, and not a number.
                    #   i.e. throttle-msgr_dispatch_throttler-hbserver["wait"]
                    if isinstance(all_perf_counters_by_daemon[collection]['value'], numbers.Number):
                        result[daemon_type][col_0][col_1]['value'] += \
                            all_perf_counters_by_daemon[collection]['value']

                    # Check that 'count' exists, as not all counters have a count field.
                    if 'count' in all_perf_counters_by_daemon[collection]:
                        result[daemon_type][col_0][col_1]['count'] += \
                            all_perf_counters_by_daemon[collection]['count']
                else:
                    self.log.error('Incorrect mode specified in gather_perf_counters: {}'.format(mode))
                    return {}

        if mode == 'separated':
            # for debugging purposes only, this data is never reported
            self.log.debug('Anonymized daemon mapping for telemetry perf_counters (anonymized: real): {}'.format(anonymized_daemon_dict))

        return result
829
830 def get_active_channels(self) -> List[str]:
831 r = []
832 if self.channel_basic:
833 r.append('basic')
834 if self.channel_crash:
835 r.append('crash')
836 if self.channel_device:
837 r.append('device')
838 if self.channel_ident:
839 r.append('ident')
840 if self.channel_perf:
841 r.append('perf')
842 return r
843
    def gather_device_report(self) -> Dict[str, Dict[str, Dict[str, str]]]:
        """Collect recent device health metrics, anonymized.

        Host and device ids are replaced by random UUIDs persisted in the
        module store (so they stay stable across reports), and the device
        serial number is scrubbed from the SMART payload. Returns
        {anon-host-id: {anon-devid: {timestamp: record}}}, or {} when the
        devicehealth module or device list is unavailable.
        """
        try:
            time_format = self.remote('devicehealth', 'get_time_format')
        except Exception as e:
            self.log.debug('Unable to format time: {}'.format(e))
            return {}
        # only look back twice the reporting interval
        cutoff = datetime.utcnow() - timedelta(hours=self.interval * 2)
        min_sample = cutoff.strftime(time_format)

        devices = self.get('devices')['devices']
        if not devices:
            self.log.debug('Unable to get device info from the mgr.')
            return {}

        # anon-host-id -> anon-devid -> { timestamp -> record }
        res: Dict[str, Dict[str, Dict[str, str]]] = {}
        for d in devices:
            devid = d['devid']
            try:
                # this is a map of stamp -> {device info}
                m = self.remote('devicehealth', 'get_recent_device_metrics',
                                devid, min_sample)
            except Exception as e:
                self.log.debug('Unable to get recent metrics from device with id "{}": {}'.format(devid, e))
                continue

            # anonymize host id
            try:
                host = d['location'][0]['host']
            except (KeyError, IndexError) as e:
                self.log.debug('Unable to get host from device with id "{}": {}'.format(devid, e))
                continue
            anon_host = self.get_store('host-id/%s' % host)
            if not anon_host:
                anon_host = str(uuid.uuid1())
                self.set_store('host-id/%s' % host, anon_host)
            serial = None
            for dev, rep in m.items():
                rep['host_id'] = anon_host
                if serial is None and 'serial_number' in rep:
                    serial = rep['serial_number']

            # anonymize device id
            anon_devid = self.get_store('devid-id/%s' % devid)
            if not anon_devid:
                # ideally devid is 'vendor_model_serial',
                # but can also be 'model_serial', 'serial'
                if '_' in devid:
                    anon_devid = f"{devid.rsplit('_', 1)[0]}_{uuid.uuid1()}"
                else:
                    anon_devid = str(uuid.uuid1())
                self.set_store('devid-id/%s' % devid, anon_devid)
            self.log.info('devid %s / %s, host %s / %s' % (devid, anon_devid,
                                                           host, anon_host))

            # anonymize the smartctl report itself
            if serial:
                m_str = json.dumps(m)
                m = json.loads(m_str.replace(serial, 'deleted'))

            if anon_host not in res:
                res[anon_host] = {}
            res[anon_host][anon_devid] = m
        return res
908
909 def get_latest(self, daemon_type: str, daemon_name: str, stat: str) -> int:
910 data = self.get_counter(daemon_type, daemon_name, stat)[stat]
911 if data:
912 return data[-1][1]
913 else:
914 return 0
915
    def compile_report(self, channels: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Assemble the telemetry report dict for the given channels.

        :param channels: channel names to include; defaults to the currently
                         active channels.
        :returns: the report dict. The 'device' channel is intentionally
                  excluded here -- device data is sent to a different endpoint
                  (see the NOTE at the bottom).
        """
        if not channels:
            channels = self.get_active_channels()
        # header fields common to every report, regardless of channels
        report = {
            'leaderboard': self.leaderboard,
            'report_version': 1,
            'report_timestamp': datetime.utcnow().isoformat(),
            'report_id': self.report_id,
            'channels': channels,
            'channels_available': ALL_CHANNELS,
            'license': LICENSE,
            'collections_available': [c['name'].name for c in MODULE_COLLECTION],
            'collections_opted_in': [c['name'].name for c in MODULE_COLLECTION if self.is_enabled_collection(c['name'])],
        }

        # ident channel: free-form contact info the admin chose to share
        if 'ident' in channels:
            for option in ['description', 'contact', 'organization']:
                report[option] = getattr(self, option)

        # basic channel: cluster composition (mons, pools, osds, fs, hosts,
        # services) and usage totals
        if 'basic' in channels:
            mon_map = self.get('mon_map')
            osd_map = self.get('osd_map')
            service_map = self.get('service_map')
            fs_map = self.get('fs_map')
            df = self.get('df')
            # index df pool stats by pool id for the per-pool loop below
            df_pools = {pool['id']: pool for pool in df['pools']}

            report['created'] = mon_map['created']

            # mons: tally address protocol (v1/v2) and IP family per
            # addrvec entry (a mon may contribute to more than one counter)
            v1_mons = 0
            v2_mons = 0
            ipv4_mons = 0
            ipv6_mons = 0
            for mon in mon_map['mons']:
                for a in mon['public_addrs']['addrvec']:
                    if a['type'] == 'v2':
                        v2_mons += 1
                    elif a['type'] == 'v1':
                        v1_mons += 1
                    # IPv6 addresses are bracketed, e.g. "[::1]:6789"
                    if a['addr'].startswith('['):
                        ipv6_mons += 1
                    else:
                        ipv4_mons += 1
            report['mon'] = {
                'count': len(mon_map['mons']),
                'features': mon_map['features'],
                'min_mon_release': mon_map['min_mon_release'],
                'v1_addr_mons': v1_mons,
                'v2_addr_mons': v2_mons,
                'ipv4_addr_mons': ipv4_mons,
                'ipv6_addr_mons': ipv6_mons,
            }

            report['config'] = self.gather_configs()

            # pools

            rbd_num_pools = 0
            rbd_num_images_by_pool = []
            rbd_mirroring_by_pool = []
            num_pg = 0
            report['pools'] = list()
            for pool in osd_map['pools']:
                num_pg += pool['pg_num']
                ec_profile = {}
                if pool['erasure_code_profile']:
                    orig = osd_map['erasure_code_profiles'].get(
                        pool['erasure_code_profile'], {})
                    # only report a whitelisted subset of EC profile keys
                    ec_profile = {
                        k: orig[k] for k in orig.keys()
                        if k in ['k', 'm', 'plugin', 'technique',
                                 'crush-failure-domain', 'l']
                    }
                pool_data = {
                    'pool': pool['pool'],
                    'pg_num': pool['pg_num'],
                    'pgp_num': pool['pg_placement_num'],
                    'size': pool['size'],
                    'min_size': pool['min_size'],
                    'pg_autoscale_mode': pool['pg_autoscale_mode'],
                    'target_max_bytes': pool['target_max_bytes'],
                    'target_max_objects': pool['target_max_objects'],
                    # osd_map pool type codes: 1=replicated, 3=erasure
                    'type': ['', 'replicated', '', 'erasure'][pool['type']],
                    'erasure_code_profile': ec_profile,
                    'cache_mode': pool['cache_mode'],
                }

                # basic_pool_usage collection
                if self.is_enabled_collection(Collection.basic_pool_usage):
                    pool_data['application'] = []
                    for application in pool['application_metadata']:
                        # Only include default applications
                        if application in ['cephfs', 'mgr', 'rbd', 'rgw']:
                            pool_data['application'].append(application)
                    pool_stats = df_pools[pool['pool']]['stats']
                    pool_data['stats'] = {  # filter out kb_used
                        'avail_raw': pool_stats['avail_raw'],
                        'bytes_used': pool_stats['bytes_used'],
                        'compress_bytes_used': pool_stats['compress_bytes_used'],
                        'compress_under_bytes': pool_stats['compress_under_bytes'],
                        'data_bytes_used': pool_stats['data_bytes_used'],
                        'dirty': pool_stats['dirty'],
                        'max_avail': pool_stats['max_avail'],
                        'objects': pool_stats['objects'],
                        'omap_bytes_used': pool_stats['omap_bytes_used'],
                        'percent_used': pool_stats['percent_used'],
                        'quota_bytes': pool_stats['quota_bytes'],
                        'quota_objects': pool_stats['quota_objects'],
                        'rd': pool_stats['rd'],
                        'rd_bytes': pool_stats['rd_bytes'],
                        'stored': pool_stats['stored'],
                        'stored_data': pool_stats['stored_data'],
                        'stored_omap': pool_stats['stored_omap'],
                        'stored_raw': pool_stats['stored_raw'],
                        'wr': pool_stats['wr'],
                        'wr_bytes': pool_stats['wr_bytes']
                    }

                cast(List[Dict[str, Any]], report['pools']).append(pool_data)
                # per-pool RBD image counts and mirroring state
                if 'rbd' in pool['application_metadata']:
                    rbd_num_pools += 1
                    ioctx = self.rados.open_ioctx(pool['pool_name'])
                    rbd_num_images_by_pool.append(
                        sum(1 for _ in rbd.RBD().list2(ioctx)))
                    rbd_mirroring_by_pool.append(
                        rbd.RBD().mirror_mode_get(ioctx) != rbd.RBD_MIRROR_MODE_DISABLED)
            report['rbd'] = {
                'num_pools': rbd_num_pools,
                'num_images_by_pool': rbd_num_images_by_pool,
                'mirroring_by_pool': rbd_mirroring_by_pool}

            # osds: a separate cluster network is inferred when any up OSD
            # has differing public and cluster front IPs
            cluster_network = False
            for osd in osd_map['osds']:
                if osd['up'] and not cluster_network:
                    front_ip = osd['public_addrs']['addrvec'][0]['addr'].split(':')[0]
                    back_ip = osd['cluster_addrs']['addrvec'][0]['addr'].split(':')[0]
                    if front_ip != back_ip:
                        cluster_network = True
            report['osd'] = {
                'count': len(osd_map['osds']),
                'require_osd_release': osd_map['require_osd_release'],
                'require_min_compat_client': osd_map['require_min_compat_client'],
                'cluster_network': cluster_network,
            }

            # crush
            report['crush'] = self.gather_crush_info()

            # cephfs
            report['fs'] = {
                'count': len(fs_map['filesystems']),
                'feature_flags': fs_map['feature_flags'],
                'num_standby_mds': len(fs_map['standbys']),
                'filesystems': [],
            }
            # standbys count toward the MDS total
            num_mds = len(fs_map['standbys'])
            for fsm in fs_map['filesystems']:
                fs = fsm['mdsmap']
                num_sessions = 0
                cached_ino = 0
                cached_dn = 0
                cached_cap = 0
                subtrees = 0
                rfiles = 0
                rbytes = 0
                rsnaps = 0
                # aggregate perf counters across this filesystem's MDS daemons
                for gid, mds in fs['info'].items():
                    num_sessions += self.get_latest('mds', mds['name'],
                                                    'mds_sessions.session_count')
                    cached_ino += self.get_latest('mds', mds['name'],
                                                  'mds_mem.ino')
                    cached_dn += self.get_latest('mds', mds['name'],
                                                 'mds_mem.dn')
                    cached_cap += self.get_latest('mds', mds['name'],
                                                  'mds_mem.cap')
                    subtrees += self.get_latest('mds', mds['name'],
                                                'mds.subtrees')
                    # root recursive stats are only meaningful on rank 0
                    if mds['rank'] == 0:
                        rfiles = self.get_latest('mds', mds['name'],
                                                 'mds.root_rfiles')
                        rbytes = self.get_latest('mds', mds['name'],
                                                 'mds.root_rbytes')
                        rsnaps = self.get_latest('mds', mds['name'],
                                                 'mds.root_rsnaps')
                report['fs']['filesystems'].append({  # type: ignore
                    'max_mds': fs['max_mds'],
                    'ever_allowed_features': fs['ever_allowed_features'],
                    'explicitly_allowed_features': fs['explicitly_allowed_features'],
                    'num_in': len(fs['in']),
                    'num_up': len(fs['up']),
                    'num_standby_replay': len(
                        [mds for gid, mds in fs['info'].items()
                         if mds['state'] == 'up:standby-replay']),
                    'num_mds': len(fs['info']),
                    'num_sessions': num_sessions,
                    'cached_inos': cached_ino,
                    'cached_dns': cached_dn,
                    'cached_caps': cached_cap,
                    'cached_subtrees': subtrees,
                    'balancer_enabled': len(fs['balancer']) > 0,
                    'num_data_pools': len(fs['data_pools']),
                    'standby_count_wanted': fs['standby_count_wanted'],
                    # only year-month ("YYYY-MM") of the creation timestamp
                    'approx_ctime': fs['created'][0:7],
                    'files': rfiles,
                    'bytes': rbytes,
                    'snaps': rsnaps,
                })
                num_mds += len(fs['info'])
            report['fs']['total_num_mds'] = num_mds  # type: ignore

            # daemons
            report['metadata'] = dict(osd=self.gather_osd_metadata(osd_map),
                                      mon=self.gather_mon_metadata(mon_map))

            if self.is_enabled_collection(Collection.basic_mds_metadata):
                report['metadata']['mds'] = self.gather_mds_metadata()  # type: ignore

            # host counts
            servers = self.list_servers()
            self.log.debug('servers %s' % servers)
            hosts = {
                'num': len([h for h in servers if h['hostname']]),
            }
            # how many hosts run at least one daemon of each core type
            for t in ['mon', 'mds', 'osd', 'mgr']:
                nr_services = sum(1 for host in servers if
                                  any(service for service in cast(List[ServiceInfoT],
                                                                  host['services'])
                                      if service['type'] == t))
                hosts['num_with_' + t] = nr_services
            report['hosts'] = hosts

            report['usage'] = {
                'pools': len(df['pools']),
                'pg_num': num_pg,
                'total_used_bytes': df['stats']['total_used_bytes'],
                'total_bytes': df['stats']['total_bytes'],
                'total_avail_bytes': df['stats']['total_avail_bytes']
            }
            # basic_usage_by_class collection
            if self.is_enabled_collection(Collection.basic_usage_by_class):
                report['usage']['stats_by_class'] = {}  # type: ignore
                # only the well-known device classes; custom classes are
                # omitted to avoid leaking user-chosen names
                for device_class in df['stats_by_class']:
                    if device_class in ['hdd', 'ssd', 'nvme']:
                        report['usage']['stats_by_class'][device_class] = df['stats_by_class'][device_class]  # type: ignore

            services: DefaultDict[str, int] = defaultdict(int)
            for key, value in service_map['services'].items():
                services[key] += 1
                # rgw gets extra detail: zone/zonegroup/frontend summary
                if key == 'rgw':
                    rgw = {}
                    zones = set()
                    zonegroups = set()
                    frontends = set()
                    count = 0
                    d = value.get('daemons', dict())
                    for k, v in d.items():
                        if k == 'summary' and v:
                            rgw[k] = v
                        elif isinstance(v, dict) and 'metadata' in v:
                            count += 1
                            zones.add(v['metadata']['zone_id'])
                            zonegroups.add(v['metadata']['zonegroup_id'])
                            frontends.add(v['metadata']['frontend_type#0'])

                            # we could actually iterate over all the keys of
                            # the dict and check for how many frontends there
                            # are, but it is unlikely that one would be running
                            # more than 2 supported ones
                            f2 = v['metadata'].get('frontend_type#1', None)
                            if f2:
                                frontends.add(f2)

                    rgw['count'] = count
                    rgw['zones'] = len(zones)
                    rgw['zonegroups'] = len(zonegroups)
                    rgw['frontends'] = list(frontends)  # sets aren't json-serializable
                    report['rgw'] = rgw
            report['services'] = services

            # balancer module may not be loadable; report inactive in that case
            try:
                report['balancer'] = self.remote('balancer', 'gather_telemetry')
            except ImportError:
                report['balancer'] = {
                    'active': False
                }

            # Rook
            self.get_rook_data(report)

        if 'crash' in channels:
            report['crashes'] = self.gather_crashinfo()

        if 'perf' in channels:
            if self.is_enabled_collection(Collection.perf_perf):
                report['perf_counters'] = self.gather_perf_counters('separated')
                report['stats_per_pool'] = self.get_stats_per_pool()
                report['stats_per_pg'] = self.get_stats_per_pg()
                report['io_rate'] = self.get_io_rate()
                report['osd_perf_histograms'] = self.get_osd_histograms('separated')
                report['mempool'] = self.get_mempool('separated')
                report['heap_stats'] = self.get_heap_stats()
                report['rocksdb_stats'] = self.get_rocksdb_stats()

        # NOTE: We do not include the 'device' channel in this report; it is
        # sent to a different endpoint.

        return report
1225
1226 def get_rook_data(self, report: Dict[str, object]) -> None:
1227 r, outb, outs = self.mon_command({
1228 'prefix': 'config-key dump',
1229 'format': 'json'
1230 })
1231 if r != 0:
1232 return
1233 try:
1234 config_kv_dump = json.loads(outb)
1235 except json.decoder.JSONDecodeError:
1236 return
1237
1238 for elem in ROOK_KEYS_BY_COLLECTION:
1239 # elem[0] is the full key path (e.g. "rook/node/count/with-csi-nfs-plugin")
1240 # elem[1] is the Collection this key belongs to
1241 if self.is_enabled_collection(elem[1]):
1242 self.add_kv_to_report(report, elem[0], config_kv_dump.get(elem[0]))
1243
1244 def add_kv_to_report(self, report: Dict[str, object], key_path: str, value: Any) -> None:
1245 last_node = key_path.split('/')[-1]
1246 for node in key_path.split('/')[0:-1]:
1247 if node not in report:
1248 report[node] = {}
1249 report = report[node] # type: ignore
1250
1251 # sanity check of keys correctness
1252 if not isinstance(report, dict):
1253 self.log.error(f"'{key_path}' is an invalid key, expected type 'dict' but got {type(report)}")
1254 return
1255
1256 if last_node in report:
1257 self.log.error(f"'{key_path}' is an invalid key, last part must not exist at this point")
1258 return
1259
1260 report[last_node] = value
1261
1262 def _try_post(self, what: str, url: str, report: Dict[str, Dict[str, str]]) -> Optional[str]:
1263 self.log.info('Sending %s to: %s' % (what, url))
1264 proxies = dict()
1265 if self.proxy:
1266 self.log.info('Send using HTTP(S) proxy: %s', self.proxy)
1267 proxies['http'] = self.proxy
1268 proxies['https'] = self.proxy
1269 try:
1270 resp = requests.put(url=url, json=report, proxies=proxies)
1271 resp.raise_for_status()
1272 except Exception as e:
1273 fail_reason = 'Failed to send %s to %s: %s' % (what, url, str(e))
1274 self.log.error(fail_reason)
1275 return fail_reason
1276 return None
1277
    class EndPoint(enum.Enum):
        """Upload destinations for send(): the main telemetry endpoint
        ('ceph') or the separate device-report endpoint ('device')."""
        ceph = 'ceph'
        device = 'device'
1281
1282 def collection_delta(self, channels: Optional[List[str]] = None) -> Optional[List[Collection]]:
1283 '''
1284 Find collections that are available in the module, but are not in the db
1285 '''
1286 if self.db_collection is None:
1287 return None
1288
1289 if not channels:
1290 channels = ALL_CHANNELS
1291 else:
1292 for ch in channels:
1293 if ch not in ALL_CHANNELS:
1294 self.log.debug(f"invalid channel name: {ch}")
1295 return None
1296
1297 new_collection : List[Collection] = []
1298
1299 for c in MODULE_COLLECTION:
1300 if c['name'].name not in self.db_collection:
1301 if c['channel'] in channels:
1302 new_collection.append(c['name'])
1303
1304 return new_collection
1305
1306 def is_major_upgrade(self) -> bool:
1307 '''
1308 Returns True only if the user last opted-in to an older major
1309 '''
1310 if self.last_opted_in_ceph_version is None or self.last_opted_in_ceph_version == 0:
1311 # we do not know what Ceph version was when the user last opted-in,
1312 # thus we do not wish to nag in case of a major upgrade
1313 return False
1314
1315 mon_map = self.get('mon_map')
1316 mon_min = mon_map.get("min_mon_release", 0)
1317
1318 if mon_min - self.last_opted_in_ceph_version > 0:
1319 self.log.debug(f"major upgrade: mon_min is: {mon_min} and user last opted-in in {self.last_opted_in_ceph_version}")
1320 return True
1321
1322 return False
1323
1324 def is_opted_in(self) -> bool:
1325 # If len is 0 it means that the user is either opted-out (never
1326 # opted-in, or invoked `telemetry off`), or they upgraded from a
1327 # telemetry revision 1 or 2, which required to re-opt in to revision 3,
1328 # regardless, hence is considered as opted-out
1329 if self.db_collection is None:
1330 return False
1331 return len(self.db_collection) > 0
1332
1333 def should_nag(self) -> bool:
1334 # Find delta between opted-in collections and module collections;
1335 # nag only if module has a collection which is not in db, and nag == True.
1336
1337 # We currently do not nag if the user is opted-out (or never opted-in).
1338 # If we wish to do this in the future, we need to have a tri-mode state
1339 # (opted in, opted out, no action yet), and it needs to be guarded by a
1340 # config option (so that nagging can be turned off via config).
1341 # We also need to add a last_opted_out_ceph_version variable, for the
1342 # major upgrade check.
1343
1344 # check if there are collections the user is not opt-in to
1345 # that we should nag about
1346 if self.db_collection is not None:
1347 for c in MODULE_COLLECTION:
1348 if c['name'].name not in self.db_collection:
1349 if c['nag'] == True:
1350 self.log.debug(f"The collection: {c['name']} is not reported")
1351 return True
1352
1353 # user might be opted-in to the most recent collection, or there is no
1354 # new collection which requires nagging about; thus nag in case it's a
1355 # major upgrade and there are new collections
1356 # (which their own nag == False):
1357 new_collections = False
1358 col_delta = self.collection_delta()
1359 if col_delta is not None and len(col_delta) > 0:
1360 new_collections = True
1361
1362 return self.is_major_upgrade() and new_collections
1363
    def init_collection(self) -> None:
        """
        Initialize self.db_collection from the persistent 'collection' key,
        performing the one-time migration from the pre-collection (revision-
        based) opt-in scheme when the key does not exist yet.
        """
        # We fetch from db the collections the user had already opted-in to.
        # During the transition the results will be empty, but the user might
        # be opted-in to an older version (e.g. revision = 3)

        collection = self.get_store('collection')

        if collection is not None:
            self.db_collection = json.loads(collection)

        if self.db_collection is None:
            # happens once on upgrade
            if not self.enabled:
                # user is not opted-in
                self.set_store('collection', json.dumps([]))
                self.log.debug("user is not opted-in")
            else:
                # user is opted-in, verify the revision:
                if self.last_opt_revision == REVISION:
                    self.log.debug(f"telemetry revision is {REVISION}")
                    # carry the opt-in over to the base collections
                    base_collection = [Collection.basic_base.name, Collection.device_base.name, Collection.crash_base.name, Collection.ident_base.name]
                    self.set_store('collection', json.dumps(base_collection))
                else:
                    # user is opted-in to an older version, meaning they need
                    # to re-opt in regardless
                    self.set_store('collection', json.dumps([]))
                    self.log.debug(f"user is opted-in but revision is old ({self.last_opt_revision}), needs to re-opt-in")

            # reload collection after setting
            collection = self.get_store('collection')
            if collection is not None:
                self.db_collection = json.loads(collection)
            else:
                # we just wrote the key above, so this should be impossible
                raise RuntimeError('collection is None after initial setting')
        else:
            # user has already upgraded
            self.log.debug(f"user has upgraded already: collection: {self.db_collection}")
1401
1402 def is_enabled_collection(self, collection: Collection) -> bool:
1403 if self.db_collection is None:
1404 return False
1405 return collection.name in self.db_collection
1406
1407 def opt_in_all_collections(self) -> None:
1408 """
1409 Opt-in to all collections; Update db with the currently available collections in the module
1410 """
1411 if self.db_collection is None:
1412 raise RuntimeError('db_collection is None after initial setting')
1413
1414 for c in MODULE_COLLECTION:
1415 if c['name'].name not in self.db_collection:
1416 self.db_collection.append(c['name'])
1417
1418 self.set_store('collection', json.dumps(self.db_collection))
1419
    def send(self,
             report: Dict[str, Dict[str, str]],
             endpoint: Optional[List[EndPoint]] = None) -> Tuple[int, str, str]:
        """
        Upload *report* to the requested endpoint(s) (both by default).

        The main report goes to self.url; when the 'device' channel is
        active, per-host device reports go to self.device_url.

        :returns: (retval, stdout, stderr)-style tuple; retval is 1 if any
                  upload failed, 0 otherwise, with a summary message in the
                  last element.
        """
        if not endpoint:
            endpoint = [self.EndPoint.ceph, self.EndPoint.device]
        failed = []
        success = []
        self.log.debug('Send endpoints %s' % endpoint)
        for e in endpoint:
            if e == self.EndPoint.ceph:
                fail_reason = self._try_post('ceph report', self.url, report)
                if fail_reason:
                    failed.append(fail_reason)
                else:
                    # remember when the last successful upload happened
                    now = int(time.time())
                    self.last_upload = now
                    self.set_store('last_upload', str(now))
                    success.append('Ceph report sent to {0}'.format(self.url))
                    self.log.info('Sent report to {0}'.format(self.url))
            elif e == self.EndPoint.device:
                if 'device' in self.get_active_channels():
                    devices = self.gather_device_report()
                    if devices:
                        num_devs = 0
                        num_hosts = 0
                        # one POST per host; hosts with no devices are skipped
                        for host, ls in devices.items():
                            self.log.debug('host %s devices %s' % (host, ls))
                            if not len(ls):
                                continue
                            fail_reason = self._try_post('devices', self.device_url,
                                                         ls)
                            if fail_reason:
                                failed.append(fail_reason)
                            else:
                                num_devs += len(ls)
                                num_hosts += 1
                        if num_devs:
                            success.append('Reported %d devices from %d hosts across a total of %d hosts' % (
                                num_devs, num_hosts, len(devices)))
                    else:
                        fail_reason = 'Unable to send device report: Device channel is on, but the generated report was empty.'
                        failed.append(fail_reason)
                        self.log.error(fail_reason)
        if failed:
            return 1, '', '\n'.join(success + failed)
        return 0, '', '\n'.join(success)
1466
1467 def format_perf_histogram(self, report: Dict[str, Any]) -> None:
1468 # Formatting the perf histograms so they are human-readable. This will change the
1469 # ranges and values, which are currently in list form, into strings so that
1470 # they are displayed horizontally instead of vertically.
1471 try:
1472 # Formatting ranges and values in osd_perf_histograms
1473 mode = 'osd_perf_histograms'
1474 for config in report[mode]:
1475 for histogram in config:
1476 # Adjust ranges by converting lists into strings
1477 for axis in config[histogram]['axes']:
1478 for i in range(0, len(axis['ranges'])):
1479 axis['ranges'][i] = str(axis['ranges'][i])
1480
1481 for osd in config[histogram]['osds']:
1482 for i in range(0, len(osd['values'])):
1483 osd['values'][i] = str(osd['values'][i])
1484 except KeyError:
1485 # If the perf channel is not enabled, there should be a KeyError since
1486 # 'osd_perf_histograms' would not be present in the report. In that case,
1487 # the show function should pass as usual without trying to format the
1488 # histograms.
1489 pass
1490
1491 def toggle_channel(self, action: str, channels: Optional[List[str]] = None) -> Tuple[int, str, str]:
1492 '''
1493 Enable or disable a list of channels
1494 '''
1495 if not self.enabled:
1496 # telemetry should be on for channels to be toggled
1497 msg = 'Telemetry is off. Please consider opting-in with `ceph telemetry on`.\n' \
1498 'Preview sample reports with `ceph telemetry preview`.'
1499 return 0, msg, ''
1500
1501 if channels is None:
1502 msg = f'Please provide a channel name. Available channels: {ALL_CHANNELS}.'
1503 return 0, msg, ''
1504
1505 state = action == 'enable'
1506 msg = ''
1507 for c in channels:
1508 if c not in ALL_CHANNELS:
1509 msg = f"{msg}{c} is not a valid channel name. "\
1510 f"Available channels: {ALL_CHANNELS}.\n"
1511 else:
1512 self.set_module_option(f"channel_{c}", state)
1513 setattr(self,
1514 f"channel_{c}",
1515 state)
1516 msg = f"{msg}channel_{c} is {action}d\n"
1517
1518 return 0, msg, ''
1519
1520 @CLIReadCommand('telemetry status')
1521 def status(self) -> Tuple[int, str, str]:
1522 '''
1523 Show current configuration
1524 '''
1525 r = {}
1526 for opt in self.MODULE_OPTIONS:
1527 r[opt['name']] = getattr(self, opt['name'])
1528 r['last_upload'] = (time.ctime(self.last_upload)
1529 if self.last_upload else self.last_upload)
1530 return 0, json.dumps(r, indent=4, sort_keys=True), ''
1531
1532 @CLIReadCommand('telemetry diff')
1533 def diff(self) -> Tuple[int, str, str]:
1534 '''
1535 Show the diff between opted-in collection and available collection
1536 '''
1537 diff = []
1538 keys = ['nag']
1539
1540 for c in MODULE_COLLECTION:
1541 if not self.is_enabled_collection(c['name']):
1542 diff.append({key: val for key, val in c.items() if key not in keys})
1543
1544 r = None
1545 if diff == []:
1546 r = "Telemetry is up to date"
1547 else:
1548 r = json.dumps(diff, indent=4, sort_keys=True)
1549
1550 return 0, r, ''
1551
1552 @CLICommand('telemetry on')
1553 def on(self, license: Optional[str] = None) -> Tuple[int, str, str]:
1554 '''
1555 Enable telemetry reports from this cluster
1556 '''
1557 if license != LICENSE:
1558 return -errno.EPERM, '', f'''Telemetry data is licensed under the {LICENSE_NAME} ({LICENSE_URL}).
1559 To enable, add '--license {LICENSE}' to the 'ceph telemetry on' command.'''
1560 else:
1561 self.set_module_option('enabled', True)
1562 self.enabled = True
1563 self.opt_in_all_collections()
1564
1565 # for major releases upgrade nagging
1566 mon_map = self.get('mon_map')
1567 mon_min = mon_map.get("min_mon_release", 0)
1568 self.set_store('last_opted_in_ceph_version', str(mon_min))
1569 self.last_opted_in_ceph_version = mon_min
1570
1571 msg = 'Telemetry is on.'
1572 disabled_channels = ''
1573 active_channels = self.get_active_channels()
1574 for c in ALL_CHANNELS:
1575 if c not in active_channels and c != 'ident':
1576 disabled_channels = f"{disabled_channels} {c}"
1577
1578 if len(disabled_channels) > 0:
1579 msg = f"{msg}\nSome channels are disabled, please enable with:\n"\
1580 f"`ceph telemetry enable channel{disabled_channels}`"
1581
1582 return 0, msg, ''
1583
1584 @CLICommand('telemetry off')
1585 def off(self) -> Tuple[int, str, str]:
1586 '''
1587 Disable telemetry reports from this cluster
1588 '''
1589 if not self.enabled:
1590 # telemetry is already off
1591 msg = 'Telemetry is currently not enabled, nothing to turn off. '\
1592 'Please consider opting-in with `ceph telemetry on`.\n' \
1593 'Preview sample reports with `ceph telemetry preview`.'
1594 return 0, msg, ''
1595
1596 self.set_module_option('enabled', False)
1597 self.enabled = False
1598 self.set_store('collection', json.dumps([]))
1599 self.db_collection = []
1600
1601 # we might need this info in the future, in case
1602 # of nagging when user is opted-out
1603 mon_map = self.get('mon_map')
1604 mon_min = mon_map.get("min_mon_release", 0)
1605 self.set_store('last_opted_out_ceph_version', str(mon_min))
1606 self.last_opted_out_ceph_version = mon_min
1607
1608 msg = 'Telemetry is now disabled.'
1609 return 0, msg, ''
1610
    @CLIReadCommand('telemetry enable channel all')
    def enable_channel_all(self, channels: List[str] = ALL_CHANNELS) -> Tuple[int, str, str]:
        '''
        Enable all channels
        '''
        # NOTE(review): the default binds the module-level ALL_CHANNELS list;
        # safe as long as it is never mutated -- confirm.
        return self.toggle_channel('enable', channels)
1617
    @CLIReadCommand('telemetry enable channel')
    def enable_channel(self, channels: Optional[List[str]] = None) -> Tuple[int, str, str]:
        '''
        Enable a list of channels
        '''
        # delegate to the shared toggle helper (handles validation/messages)
        return self.toggle_channel('enable', channels)
1624
    @CLIReadCommand('telemetry disable channel all')
    def disable_channel_all(self, channels: List[str] = ALL_CHANNELS) -> Tuple[int, str, str]:
        '''
        Disable all channels
        '''
        # NOTE(review): the default binds the module-level ALL_CHANNELS list;
        # safe as long as it is never mutated -- confirm.
        return self.toggle_channel('disable', channels)
1631
    @CLIReadCommand('telemetry disable channel')
    def disable_channel(self, channels: Optional[List[str]] = None) -> Tuple[int, str, str]:
        '''
        Disable a list of channels
        '''
        # delegate to the shared toggle helper (handles validation/messages)
        return self.toggle_channel('disable', channels)
1638
1639 @CLIReadCommand('telemetry channel ls')
1640 def channel_ls(self) -> Tuple[int, str, str]:
1641 '''
1642 List all channels
1643 '''
1644 table = PrettyTable(
1645 [
1646 'NAME', 'ENABLED', 'DEFAULT', 'DESC',
1647 ],
1648 border=False)
1649 table.align['NAME'] = 'l'
1650 table.align['ENABLED'] = 'l'
1651 table.align['DEFAULT'] = 'l'
1652 table.align['DESC'] = 'l'
1653 table.left_padding_width = 0
1654 table.right_padding_width = 4
1655
1656 for c in ALL_CHANNELS:
1657 enabled = "ON" if getattr(self, f"channel_{c}") else "OFF"
1658 for o in self.MODULE_OPTIONS:
1659 if o['name'] == f"channel_{c}":
1660 default = "ON" if o.get('default', None) else "OFF"
1661 desc = o.get('desc', None)
1662
1663 table.add_row((
1664 c,
1665 enabled,
1666 default,
1667 desc,
1668 ))
1669
1670 return 0, table.get_string(sortby="NAME"), ''
1671
    @CLIReadCommand('telemetry collection ls')
    def collection_ls(self) -> Tuple[int, str, str]:
        '''
        List all collections
        '''
        # lead with a notice when new collections exist that the user has
        # not opted in to yet
        col_delta = self.collection_delta()
        msg = ''
        if col_delta is not None and len(col_delta) > 0:
            msg = f"New collections are available:\n" \
                  f"{sorted([c.name for c in col_delta])}\n" \
                  f"Run `ceph telemetry on` to opt-in to these collections.\n"

        table = PrettyTable(
            [
                'NAME', 'STATUS', 'DESC',
            ],
            border=False)
        table.align['NAME'] = 'l'
        table.align['STATUS'] = 'l'
        table.align['DESC'] = 'l'
        table.left_padding_width = 0
        table.right_padding_width = 4

        for c in MODULE_COLLECTION:
            name = c['name']
            opted_in = self.is_enabled_collection(name)
            channel_enabled = getattr(self, f"channel_{c['channel']}")

            # a collection is reported only when both the user opted in to it
            # AND its channel is enabled; otherwise explain which is missing
            status = ''
            if channel_enabled and opted_in:
                status = "REPORTING"
            else:
                why = ''
                delimiter = ''

                if not opted_in:
                    why += "NOT OPTED-IN"
                    delimiter = ', '
                if not channel_enabled:
                    why += f"{delimiter}CHANNEL {c['channel']} IS OFF"

                status = f"NOT REPORTING: {why}"

            desc = c['description']

            table.add_row((
                name,
                status,
                desc,
            ))

        if len(msg):
            # add a new line between message and table output
            msg = f"{msg} \n"

        return 0, f'{msg}{table.get_string(sortby="NAME")}', ''
1728
1729 @CLICommand('telemetry send')
1730 def do_send(self,
1731 endpoint: Optional[List[EndPoint]] = None,
1732 license: Optional[str] = None) -> Tuple[int, str, str]:
1733 '''
1734 Send a sample report
1735 '''
1736 if not self.is_opted_in() and license != LICENSE:
1737 self.log.debug(('A telemetry send attempt while opted-out. '
1738 'Asking for license agreement'))
1739 return -errno.EPERM, '', f'''Telemetry data is licensed under the {LICENSE_NAME} ({LICENSE_URL}).
1740 To manually send telemetry data, add '--license {LICENSE}' to the 'ceph telemetry send' command.
1741 Please consider enabling the telemetry module with 'ceph telemetry on'.'''
1742 else:
1743 self.last_report = self.compile_report()
1744 return self.send(self.last_report, endpoint)
1745
1746 @CLIReadCommand('telemetry show')
1747 def show(self, channels: Optional[List[str]] = None) -> Tuple[int, str, str]:
1748 '''
1749 Show a sample report of opted-in collections (except for 'device')
1750 '''
1751 if not self.enabled:
1752 # if telemetry is off, no report is being sent, hence nothing to show
1753 msg = 'Telemetry is off. Please consider opting-in with `ceph telemetry on`.\n' \
1754 'Preview sample reports with `ceph telemetry preview`.'
1755 return 0, msg, ''
1756
1757 report = self.get_report_locked(channels=channels)
1758 self.format_perf_histogram(report)
1759 report = json.dumps(report, indent=4, sort_keys=True)
1760
1761 if self.channel_device:
1762 report += '''\nDevice report is generated separately. To see it run 'ceph telemetry show-device'.'''
1763
1764 return 0, report, ''
1765
    @CLIReadCommand('telemetry preview')
    def preview(self, channels: Optional[List[str]] = None) -> Tuple[int, str, str]:
        '''
        Preview a sample report of the most recent collections available (except for 'device')
        '''
        report = {}

        # We use a lock to prevent a scenario where the user wishes to preview
        # the report, and at the same time the module hits the interval of
        # sending a report with the opted-in collection, which has less data
        # than in the preview report.
        col_delta = self.collection_delta()
        with self.get_report_lock:
            if col_delta is not None and len(col_delta) == 0:
                # user is already opted-in to the most recent collection
                msg = 'Telemetry is up to date, see report with `ceph telemetry show`.'
                return 0, msg, ''
            else:
                # there are collections the user is not opted-in to
                next_collection = []

                for c in MODULE_COLLECTION:
                    next_collection.append(c['name'].name)

                # temporarily swap in the full collection list so the report
                # includes everything, then restore the user's real opt-in
                # state; the lock above guards this swap
                opted_in_collection = self.db_collection
                self.db_collection = next_collection
                report = self.get_report(channels=channels)
                self.db_collection = opted_in_collection

        self.format_perf_histogram(report)
        report = json.dumps(report, indent=4, sort_keys=True)

        if self.channel_device:
            report += '''\nDevice report is generated separately. To see it run 'ceph telemetry preview-device'.'''

        return 0, report, ''
1802
1803 @CLIReadCommand('telemetry show-device')
1804 def show_device(self) -> Tuple[int, str, str]:
1805 '''
1806 Show a sample device report
1807 '''
1808 if not self.enabled:
1809 # if telemetry is off, no report is being sent, hence nothing to show
1810 msg = 'Telemetry is off. Please consider opting-in with `ceph telemetry on`.\n' \
1811 'Preview sample device reports with `ceph telemetry preview-device`.'
1812 return 0, msg, ''
1813
1814 if not self.channel_device:
1815 # if device channel is off, device report is not being sent, hence nothing to show
1816 msg = 'device channel is off. Please enable with `ceph telemetry enable channel device`.\n' \
1817 'Preview sample device reports with `ceph telemetry preview-device`.'
1818 return 0, msg, ''
1819
1820 return 0, json.dumps(self.get_report_locked('device'), indent=4, sort_keys=True), ''
1821
    @CLIReadCommand('telemetry preview-device')
    def preview_device(self) -> Tuple[int, str, str]:
        '''
        Preview a sample device report of the most recent device collection
        '''
        report = {}

        # lock for the same reason as preview(): avoid racing an interval
        # send while db_collection is temporarily swapped below
        device_col_delta = self.collection_delta(['device'])
        with self.get_report_lock:
            if device_col_delta is not None and len(device_col_delta) == 0 and self.channel_device:
                # user is already opted-in to the most recent device collection,
                # and device channel is on, thus `show-device` should be called
                msg = 'device channel is on and up to date, see report with `ceph telemetry show-device`.'
                return 0, msg, ''

            # either the user is not opted-in at all, or there are collections
            # they are not opted-in to
            next_collection = []

            for c in MODULE_COLLECTION:
                next_collection.append(c['name'].name)

            # temporarily swap in the full collection list, generate the
            # preview, then restore the user's real opt-in state
            opted_in_collection = self.db_collection
            self.db_collection = next_collection
            report = self.get_report('device')
            self.db_collection = opted_in_collection

        report = json.dumps(report, indent=4, sort_keys=True)
        return 0, report, ''
1851
1852 @CLIReadCommand('telemetry show-all')
1853 def show_all(self) -> Tuple[int, str, str]:
1854 '''
1855 Show a sample report of all enabled channels (including 'device' channel)
1856 '''
1857 if not self.enabled:
1858 # if telemetry is off, no report is being sent, hence nothing to show
1859 msg = 'Telemetry is off. Please consider opting-in with `ceph telemetry on`.\n' \
1860 'Preview sample reports with `ceph telemetry preview`.'
1861 return 0, msg, ''
1862
1863 if not self.channel_device:
1864 # device channel is off, no need to display its report
1865 return 0, json.dumps(self.get_report_locked('default'), indent=4, sort_keys=True), ''
1866
1867 # telemetry is on and device channel is enabled, show both
1868 return 0, json.dumps(self.get_report_locked('all'), indent=4, sort_keys=True), ''
1869
1870 @CLIReadCommand('telemetry preview-all')
1871 def preview_all(self) -> Tuple[int, str, str]:
1872 '''
1873 Preview a sample report of the most recent collections available of all channels (including 'device')
1874 '''
1875 report = {}
1876
1877 col_delta = self.collection_delta()
1878 with self.get_report_lock:
1879 if col_delta is not None and len(col_delta) == 0:
1880 # user is already opted-in to the most recent collection
1881 msg = 'Telemetry is up to date, see report with `ceph telemetry show`.'
1882 return 0, msg, ''
1883
1884 # there are collections the user is not opted-in to
1885 next_collection = []
1886
1887 for c in MODULE_COLLECTION:
1888 next_collection.append(c['name'].name)
1889
1890 opted_in_collection = self.db_collection
1891 self.db_collection = next_collection
1892 report = self.get_report('all')
1893 self.db_collection = opted_in_collection
1894
1895 self.format_perf_histogram(report)
1896 report = json.dumps(report, indent=4, sort_keys=True)
1897
1898 return 0, report, ''
1899
1900 def get_report_locked(self,
1901 report_type: str = 'default',
1902 channels: Optional[List[str]] = None) -> Dict[str, Any]:
1903 '''
1904 A wrapper around get_report to allow for compiling a report of the most recent module collections
1905 '''
1906 with self.get_report_lock:
1907 return self.get_report(report_type, channels)
1908
1909 def get_report(self,
1910 report_type: str = 'default',
1911 channels: Optional[List[str]] = None) -> Dict[str, Any]:
1912 if report_type == 'default':
1913 return self.compile_report(channels=channels)
1914 elif report_type == 'device':
1915 return self.gather_device_report()
1916 elif report_type == 'all':
1917 return {'report': self.compile_report(channels=channels),
1918 'device_report': self.gather_device_report()}
1919 return {}
1920
1921 def self_test(self) -> None:
1922 report = self.compile_report()
1923 if len(report) == 0:
1924 raise RuntimeError('Report is empty')
1925
1926 if 'report_id' not in report:
1927 raise RuntimeError('report_id not found in report')
1928
1929 def shutdown(self) -> None:
1930 self.run = False
1931 self.event.set()
1932
1933 def refresh_health_checks(self) -> None:
1934 health_checks = {}
1935 # TODO do we want to nag also in case the user is not opted-in?
1936 if self.enabled and self.should_nag():
1937 health_checks['TELEMETRY_CHANGED'] = {
1938 'severity': 'warning',
1939 'summary': 'Telemetry requires re-opt-in',
1940 'detail': [
1941 'telemetry module includes new collections; please re-opt-in to new collections with `ceph telemetry on`'
1942 ]
1943 }
1944 self.set_health_checks(health_checks)
1945
1946 def serve(self) -> None:
1947 self.load()
1948 self.run = True
1949
1950 self.log.debug('Waiting for mgr to warm up')
1951 time.sleep(10)
1952
1953 while self.run:
1954 self.event.clear()
1955
1956 self.refresh_health_checks()
1957
1958 if not self.is_opted_in():
1959 self.log.debug('Not sending report until user re-opts-in')
1960 self.event.wait(1800)
1961 continue
1962 if not self.enabled:
1963 self.log.debug('Not sending report until configured to do so')
1964 self.event.wait(1800)
1965 continue
1966
1967 now = int(time.time())
1968 if not self.last_upload or \
1969 (now - self.last_upload) > self.interval * 3600:
1970 self.log.info('Compiling and sending report to %s',
1971 self.url)
1972
1973 try:
1974 self.last_report = self.compile_report()
1975 except Exception:
1976 self.log.exception('Exception while compiling report:')
1977
1978 self.send(self.last_report)
1979 else:
1980 self.log.debug('Interval for sending new report has not expired')
1981
1982 sleep = 3600
1983 self.log.debug('Sleeping for %d seconds', sleep)
1984 self.event.wait(sleep)
1985
1986 @staticmethod
1987 def can_run() -> Tuple[bool, str]:
1988 return True, ''