# ceph/src/pybind/mgr/prometheus/module.py (ceph.git, sources for v12.2.3)
1 import cherrypy
2 import json
3 import errno
4 import math
5 import os
6 from collections import OrderedDict
7 from mgr_module import MgrModule
8
# Defaults for the Prometheus HTTP server. Can also set in config-key
# see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
# for Prometheus exporter port registry

DEFAULT_ADDR = '::'    # bind to all interfaces (IPv4 and IPv6)
DEFAULT_PORT = 9283    # port registered for the Ceph exporter
15
16
# cherrypy likes to sys.exit on error. don't let it take us down too!
def os_exit_noop(*args, **kwargs):
    """No-op stand-in for os._exit so cherrypy cannot kill the mgr."""
    return None


os._exit = os_exit_noop
23
24
25 # to access things in class Module from subclass Root. Because
26 # it's a dict, the writer doesn't need to declare 'global' for access
27
28 _global_instance = {'plugin': None}
29
30
31 def global_instance():
32 assert _global_instance['plugin'] is not None
33 return _global_instance['plugin']
34
35
def health_status_to_number(status):
    """Map a Ceph health status string to its numeric metric value.

    Unknown statuses map to None, matching the original fall-through.
    """
    return {
        'HEALTH_OK': 0,
        'HEALTH_WARN': 1,
        'HEALTH_ERR': 2,
    }.get(status)
44
# PG states we export; compound pg_summary keys (e.g. 'active+clean') are
# split on '+' and each component is matched against this list.
PG_STATES = ['creating', 'active', 'clean', 'down', 'scrubbing', 'deep', 'degraded',
             'inconsistent', 'peering', 'repair', 'recovering', 'forced-recovery',
             'backfill', 'forced-backfill', 'wait-backfill', 'backfill-toofull',
             'incomplete', 'stale', 'remapped', 'undersized', 'peered']

# Cluster-wide fields exported from the 'df' report's 'stats' section
DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_objects']

# Per-pool fields exported from the 'df' report's per-pool 'stats'
DF_POOL = ['max_avail', 'bytes_used', 'raw_bytes_used', 'objects', 'dirty',
           'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes']

# Label names for the osd_metadata metric
OSD_METADATA = ('cluster_addr', 'device_class', 'id', 'public_addr')

# Per-OSD status fields read from the OSD map entries
OSD_STATUS = ['weight', 'up', 'in']

# Per-OSD perf_stat fields exported from osd_stats
OSD_STATS = ['apply_latency_ms', 'commit_latency_ms']

# Label names for the pool_metadata metric
POOL_METADATA = ('pool_id', 'name')

# Label names for disk_occupation; kept separate from OSD_METADATA so we
# can use the same tag names the Prometheus node_exporter does
DISK_OCCUPATION = ('instance', 'device', 'ceph_daemon')
64
65
class Metric(object):
    """One Prometheus metric family and its labelled sample values."""

    def __init__(self, mtype, name, desc, labels=None):
        self.mtype = mtype
        self.name = name
        self.desc = desc
        self.labelnames = labels  # tuple of label names, or None
        self.value = {}           # maps label-value tuples -> sample value

    def set(self, value, labelvalues=None):
        """Record a sample. labelvalues must be a tuple when supplied."""
        key = labelvalues if labelvalues else ('',)
        self.value[key] = value

    def str_expfmt(self):
        """Render this metric in the Prometheus text exposition format."""

        def promethize(path):
            """Replace characters that are illegal in metric names."""
            sanitized = path.replace('.', '_').replace('+', '_plus').replace('::', '_')

            # Hyphens usually turn into underscores, unless they are
            # trailing
            if sanitized.endswith("-"):
                sanitized = sanitized[:-1] + "_minus"
            else:
                sanitized = sanitized.replace("-", "_")

            return "ceph_{0}".format(sanitized)

        def floatstr(value):
            """Represent a value as a Go-compatible float string."""
            as_float = float(value)
            if as_float == float('inf'):
                return '+Inf'
            if as_float == float('-inf'):
                return '-Inf'
            if math.isnan(as_float):
                return 'NaN'
            return repr(as_float)

        name = promethize(self.name)
        out = '''
# HELP {name} {desc}
# TYPE {name} {mtype}'''.format(name=name, desc=self.desc, mtype=self.mtype)

        for labelvalues, value in self.value.items():
            if self.labelnames:
                pairs = ('%s="%s"' % kv for kv in zip(self.labelnames, labelvalues))
                labels = ','.join(pairs)
            else:
                labels = ''
            if labels:
                out += '\n{name}{{{labels}}} {value}'.format(
                    name=name, labels=labels, value=floatstr(value))
            else:
                out += '\n{name} {value}'.format(
                    name=name, value=floatstr(value))
        return out
129
130
class Module(MgrModule):
    """ceph-mgr module exporting cluster statistics to Prometheus.

    Serves the Prometheus text exposition format over an embedded cherrypy
    HTTP server (default [::]:9283). Metrics are re-collected from the mgr
    state on every scrape of /metrics.
    """

    COMMANDS = [
        {
            "cmd": "prometheus self-test",
            "desc": "Run a self test on the prometheus module",
            "perm": "rw"
        },
    ]

    def __init__(self, *args, **kwargs):
        super(Module, self).__init__(*args, **kwargs)
        self.notified = False
        self.serving = False
        # Static metric families are created up front; per-daemon perf
        # counter metrics are added lazily in collect().
        self.metrics = self._setup_static_metrics()
        self.schema = OrderedDict()
        # Register ourselves so the cherrypy Root handler can reach us.
        _global_instance['plugin'] = self

    def _stattype_to_str(self, stattype):
        """Map a perf counter type bitmask to a Prometheus metric type.

        Returns '' for counter types that are not exported; callers skip
        those counters.
        """
        typeonly = stattype & self.PERFCOUNTER_TYPE_MASK
        if typeonly == 0:
            return 'gauge'
        if typeonly == self.PERFCOUNTER_LONGRUNAVG:
            # this lie matches the DaemonState decoding: only val, no counts
            return 'counter'
        if typeonly == self.PERFCOUNTER_COUNTER:
            return 'counter'
        if typeonly == self.PERFCOUNTER_HISTOGRAM:
            return 'histogram'

        return ''

    def _setup_static_metrics(self):
        """Create the fixed set of cluster-level metric families."""
        metrics = {}
        metrics['health_status'] = Metric(
            'untyped',
            'health_status',
            'Cluster health status'
        )
        metrics['mon_quorum_count'] = Metric(
            'gauge',
            'mon_quorum_count',
            'Monitors in quorum'
        )
        metrics['osd_metadata'] = Metric(
            'untyped',
            'osd_metadata',
            'OSD Metadata',
            OSD_METADATA
        )

        # The reason for having this separate to OSD_METADATA is
        # so that we can stably use the same tag names that
        # the Prometheus node_exporter does
        metrics['disk_occupation'] = Metric(
            'untyped',
            'disk_occupation',
            'Associate Ceph daemon with disk used',
            DISK_OCCUPATION
        )

        metrics['pool_metadata'] = Metric(
            'untyped',
            'pool_metadata',
            'POOL Metadata',
            POOL_METADATA
        )
        for state in OSD_STATUS:
            path = 'osd_{}'.format(state)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD status {}'.format(state),
                ('ceph_daemon',)
            )
        for stat in OSD_STATS:
            path = 'osd_{}'.format(stat)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'gauge',
                path,
                'OSD stat {}'.format(stat),
                ('ceph_daemon',)
            )
        for state in PG_STATES:
            path = 'pg_{}'.format(state)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'gauge',
                path,
                'PG {}'.format(state),
            )
        for state in DF_CLUSTER:
            path = 'cluster_{}'.format(state)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'gauge',
                path,
                'DF {}'.format(state),
            )
        for state in DF_POOL:
            path = 'pool_{}'.format(state)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'gauge',
                path,
                'DF pool {}'.format(state),
                ('pool_id',)
            )

        return metrics

    def shutdown(self):
        """Signal the serving loop that the module is stopping."""
        self.serving = False

    def get_health(self):
        """Export overall cluster health as a numeric status metric."""
        health = json.loads(self.get('health')['json'])
        self.metrics['health_status'].set(
            health_status_to_number(health['status'])
        )

    def get_df(self):
        """Export cluster-wide and per-pool 'df' statistics."""
        # maybe get the to-be-exported metrics from a config?
        df = self.get('df')
        for stat in DF_CLUSTER:
            path = 'cluster_{}'.format(stat)
            self.metrics[path].set(df['stats'][stat])

        for pool in df['pools']:
            for stat in DF_POOL:
                path = 'pool_{}'.format(stat)
                self.metrics[path].set(pool['stats'][stat], (pool['id'],))

    def get_quorum_status(self):
        """Export the number of monitors currently in quorum."""
        mon_status = json.loads(self.get('mon_status')['json'])
        self.metrics['mon_quorum_count'].set(len(mon_status['quorum']))

    def get_pg_status(self):
        """Export PG state counts; known states not reported are zeroed."""
        # TODO add per pool status?
        pg_s = self.get('pg_summary')['all']
        # Summary keys are compound states like 'active+clean'; attribute
        # the count to each component state.
        reported_pg_s = [(s, v) for key, v in pg_s.items()
                         for s in key.split('+')]
        for state, value in reported_pg_s:
            path = 'pg_{}'.format(state)
            try:
                self.metrics[path].set(value)
            except KeyError:
                self.log.warn("skipping pg in unknown state {}".format(state))
        # Zero out known states absent from this report so stale values do
        # not linger between scrapes.
        reported_states = [s[0] for s in reported_pg_s]
        for state in PG_STATES:
            path = 'pg_{}'.format(state)
            if state not in reported_states:
                try:
                    self.metrics[path].set(0)
                except KeyError:
                    self.log.warn("skipping pg in unknown state {}".format(state))

    def get_osd_stats(self):
        """Export per-OSD apply/commit latency statistics."""
        osd_stats = self.get('osd_stats')
        for osd in osd_stats['osd_stats']:
            id_ = osd['osd']
            for stat in OSD_STATS:
                status = osd['perf_stat'][stat]
                self.metrics['osd_{}'.format(stat)].set(
                    status,
                    ('osd.{}'.format(id_),))

    def get_metadata_and_osd_status(self):
        """Export OSD metadata/status, disk occupation and pool metadata."""
        osd_map = self.get('osd_map')
        osd_devices = self.get('osd_map_crush')['devices']
        for osd in osd_map['osds']:
            id_ = osd['osd']
            p_addr = osd['public_addr'].split(':')[0]
            c_addr = osd['cluster_addr'].split(':')[0]
            # Guard the CRUSH device lookup: an OSD without a device entry
            # previously raised StopIteration and aborted the whole scrape.
            device = next((d for d in osd_devices if d['id'] == id_), None)
            dev_class = device['class'] if device is not None else ''
            self.metrics['osd_metadata'].set(0, (
                c_addr,
                dev_class,
                id_,
                p_addr
            ))
            for state in OSD_STATUS:
                status = osd[state]
                self.metrics['osd_{}'.format(state)].set(
                    status,
                    ('osd.{}'.format(id_),))

            osd_metadata = self.get_metadata("osd", str(id_))
            dev_keys = ("backend_filestore_dev_node", "bluestore_bdev_dev_node")
            osd_dev_node = None
            for dev_key in dev_keys:
                val = osd_metadata.get(dev_key, None)
                if val and val != "unknown":
                    osd_dev_node = val
                    break
            osd_hostname = osd_metadata.get('hostname', None)
            if osd_dev_node and osd_hostname:
                self.log.debug("Got dev for osd {0}: {1}/{2}".format(
                    id_, osd_hostname, osd_dev_node))
                self.metrics['disk_occupation'].set(0, (
                    osd_hostname,
                    osd_dev_node,
                    "osd.{0}".format(id_)
                ))
            else:
                self.log.info("Missing dev node metadata for osd {0}, skipping "
                              "occupation record for this osd".format(id_))

        for pool in osd_map['pools']:
            id_ = pool['pool']
            name = pool['pool_name']
            self.metrics['pool_metadata'].set(0, (id_, name))

    def collect(self):
        """Refresh all metrics from mgr state and return the metrics dict."""
        self.get_health()
        self.get_df()
        self.get_osd_stats()
        self.get_quorum_status()
        self.get_metadata_and_osd_status()
        self.get_pg_status()

        # items() instead of the Python-2-only iteritems() keeps this
        # working on both Python 2 and Python 3.
        for daemon, counters in self.get_all_perf_counters().items():
            for path, counter_info in counters.items():
                stattype = self._stattype_to_str(counter_info['type'])
                # XXX simplify first effort: no histograms
                # averages are already collapsed to one value for us
                if not stattype or stattype == 'histogram':
                    self.log.debug('ignoring %s, type %s' % (path, stattype))
                    continue

                if path not in self.metrics:
                    self.metrics[path] = Metric(
                        stattype,
                        path,
                        counter_info['description'],
                        ("ceph_daemon",),
                    )

                self.metrics[path].set(
                    counter_info['value'],
                    (daemon,)
                )

        return self.metrics

    def handle_command(self, cmd):
        """Handle 'prometheus self-test'; returns (retval, stdout, stderr)."""
        if cmd['prefix'] == 'prometheus self-test':
            self.collect()
            return 0, '', 'Self-test OK'
        else:
            return (-errno.EINVAL, '',
                    "Command not found '{0}'".format(cmd['prefix']))

    def serve(self):
        """Run the cherrypy HTTP server; blocks until engine shutdown."""

        class Root(object):

            # collapse everything to '/'
            def _cp_dispatch(self, vpath):
                cherrypy.request.path = ''
                return self

            def format_metrics(self, metrics):
                # Concatenate every metric family's exposition text.
                formatted = ''
                for m in metrics.values():
                    formatted += m.str_expfmt()
                return formatted + '\n'

            @cherrypy.expose
            def index(self):
                return '''<!DOCTYPE html>
<html>
	<head><title>Ceph Exporter</title></head>
	<body>
		<h1>Ceph Exporter</h1>
		<p><a href='/metrics'>Metrics</a></p>
	</body>
</html>'''

            @cherrypy.expose
            def metrics(self):
                metrics = global_instance().collect()
                cherrypy.response.headers['Content-Type'] = 'text/plain'
                if metrics:
                    return self.format_metrics(metrics)

        server_addr = self.get_localized_config('server_addr', DEFAULT_ADDR)
        server_port = self.get_localized_config('server_port', DEFAULT_PORT)
        self.log.info(
            "server_addr: %s server_port: %s" %
            (server_addr, server_port)
        )

        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })
        cherrypy.tree.mount(Root(), "/")
        cherrypy.engine.start()
        cherrypy.engine.block()