1 """
2 Automatically scale pg_num based on how much data is stored in each pool.
3 """
4
5 import errno
6 import json
7 import mgr_util
8 import threading
9 import uuid
10 from six import itervalues, iteritems
11 from collections import defaultdict
12 from prettytable import PrettyTable, PLAIN_COLUMNS
13
14 from mgr_module import MgrModule
15
16 """
17 Some terminology is made up for the purposes of this module:
18
19 - "raw pgs": pg count after applying replication, i.e. the real resource
20 consumption of a pool.
21 - "grow/shrink" - increase/decrease the pg_num in a pool
22 - "crush subtree" - non-overlapping domains in crush hierarchy: used as
23 units of resource management.
24 """

INTERVAL = 5

PG_NUM_MIN = 32  # unless specified on a per-pool basis

def nearest_power_of_two(n):
    v = int(n)

    v -= 1
    v |= v >> 1
    v |= v >> 2
    v |= v >> 4
    v |= v >> 8
    v |= v >> 16

    # High bound power of two
    v += 1
    # Low bound power of two
    x = v >> 1

    return x if (v - n) > (n - x) else v
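
# Hand-checked examples of the rounding above:
#   nearest_power_of_two(200)  -> 256  (256 is 56 away, 128 is 72 away)
#   nearest_power_of_two(33.3) -> 32   (32 is 1.3 away, 64 is 30.7 away)
#   nearest_power_of_two(48)   -> 64   (exact ties round up)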


class PgAutoscaler(MgrModule):
    """
    PG autoscaler.
    """
    COMMANDS = [
        {
            "cmd": "osd pool autoscale-status",
            "desc": "report on pool pg_num sizing recommendation and intent",
            "perm": "r"
        },
    ]

    NATIVE_OPTIONS = [
        'mon_target_pg_per_osd',
        'mon_max_pg_per_osd',
    ]

    MODULE_OPTIONS = [
        {
            'name': 'sleep_interval',
            'default': str(60),
        },
    ]
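
    # sleep_interval (seconds, default 60) controls how often serve() below
    # wakes up and calls _maybe_adjust().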

    def __init__(self, *args, **kwargs):
        super(PgAutoscaler, self).__init__(*args, **kwargs)
        self._shutdown = threading.Event()

        # So much of what we do peeks at the osdmap that it's easiest
        # to just keep a copy of the pythonized version.
        self._osd_map = None

    def config_notify(self):
        for opt in self.NATIVE_OPTIONS:
            setattr(self,
                    opt,
                    self.get_ceph_option(opt))
            self.log.debug(' native option %s = %s', opt, getattr(self, opt))
        for opt in self.MODULE_OPTIONS:
            setattr(self,
                    opt['name'],
                    self.get_module_option(opt['name']) or opt['default'])
            self.log.debug(' mgr option %s = %s',
                           opt['name'], getattr(self, opt['name']))


    def handle_command(self, inbuf, cmd):
        if cmd['prefix'] == "osd pool autoscale-status":
            retval = self._command_autoscale_status(cmd)
        else:
            assert False  # ceph-mgr should never pass us unknown cmds
        return retval

    def _command_autoscale_status(self, cmd):
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools_by_name()
        ps, root_map, pool_root = self._get_pool_status(osdmap, pools)

        if cmd.get('format') == 'json' or cmd.get('format') == 'json-pretty':
            return 0, json.dumps(ps, indent=2), ''
        else:
            table = PrettyTable(['POOL', 'SIZE', 'TARGET SIZE',
                                 'RATE', 'RAW CAPACITY',
                                 'RATIO', 'TARGET RATIO',
                                 'BIAS',
                                 'PG_NUM',
                                 # 'IDEAL',
                                 'NEW PG_NUM', 'AUTOSCALE'],
                                border=False)
            table.left_padding_width = 0
            table.right_padding_width = 1
            table.align['POOL'] = 'l'
            table.align['SIZE'] = 'r'
            table.align['TARGET SIZE'] = 'r'
            table.align['RATE'] = 'r'
            table.align['RAW CAPACITY'] = 'r'
            table.align['RATIO'] = 'r'
            table.align['TARGET RATIO'] = 'r'
            table.align['BIAS'] = 'r'
            table.align['PG_NUM'] = 'r'
            # table.align['IDEAL'] = 'r'
            table.align['NEW PG_NUM'] = 'r'
            table.align['AUTOSCALE'] = 'l'
            for p in ps:
                if p['would_adjust']:
                    final = str(p['pg_num_final'])
                else:
                    final = ''
                if p['target_bytes'] > 0:
                    ts = mgr_util.format_bytes(p['target_bytes'], 6)
                else:
                    ts = ''
                if p['target_ratio'] > 0.0:
                    tr = '%.4f' % p['target_ratio']
                else:
                    tr = ''
                table.add_row([
                    p['pool_name'],
                    mgr_util.format_bytes(p['logical_used'], 6),
                    ts,
                    p['raw_used_rate'],
                    mgr_util.format_bytes(p['subtree_capacity'], 6),
                    '%.4f' % p['capacity_ratio'],
                    tr,
                    p['bias'],
                    p['pg_num_target'],
                    # p['pg_num_ideal'],
                    final,
                    p['pg_autoscale_mode'],
                ])
            return 0, table.get_string(), ''

    def serve(self):
        self.config_notify()
        while not self._shutdown.is_set():
            self._maybe_adjust()
            self._shutdown.wait(timeout=int(self.sleep_interval))

    def shutdown(self):
        self.log.info('Stopping pg_autoscaler')
        self._shutdown.set()

    def get_subtree_resource_status(self, osdmap, crush):
        """
        For each CRUSH subtree of interest (i.e. the roots under which
        we have pools), calculate the current resource usages and targets,
        such as how many PGs there are, vs. how many PGs we would
        like there to be.
        """
        result = {}
        pool_root = {}
        roots = []

        class CrushSubtreeResourceStatus(object):
            def __init__(self):
                self.root_ids = []
                self.osds = set()
                self.osd_count = None  # Number of OSDs
                self.pg_target = None  # Ideal full-capacity PG count?
                self.pg_current = 0  # How many PGs already?
                self.capacity = None  # Total capacity of OSDs in subtree
                self.pool_ids = []
                self.pool_names = []

        # identify subtrees (note that they may overlap!)
        for pool_id, pool in osdmap.get_pools().items():
            cr_name = crush.get_rule_by_id(pool['crush_rule'])['rule_name']
            root_id = int(crush.get_rule_root(cr_name))
            pool_root[pool_id] = root_id
            osds = set(crush.get_osds_under(root_id))

            # do we intersect an existing root?
            s = None
            for prev in itervalues(result):
                if osds & prev.osds:
                    s = prev
                    break
            if not s:
                s = CrushSubtreeResourceStatus()
                roots.append(s)
            result[root_id] = s
            s.root_ids.append(root_id)
            s.osds |= osds
            s.pool_ids.append(int(pool_id))
            s.pool_names.append(pool['pool_name'])
            s.pg_current += pool['pg_num_target'] * pool['size']
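
            # e.g. pools whose CRUSH rules map to disjoint 'ssd' and 'hdd'
            # roots are tracked as separate subtrees, while rules whose OSD
            # sets intersect are merged into one shared
            # CrushSubtreeResourceStatus above.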


        # finish subtrees
        all_stats = self.get('osd_stats')
        for s in roots:
            s.osd_count = len(s.osds)
            s.pg_target = s.osd_count * int(self.mon_target_pg_per_osd)

            capacity = 0.0
            for osd_stats in all_stats['osd_stats']:
                if osd_stats['osd'] in s.osds:
                    # Intentionally do not apply the OSD's reweight to
                    # this, because we want to calculate PG counts based
                    # on the physical storage available, not how it is
                    # reweighted right now.
                    capacity += osd_stats['kb'] * 1024

            s.capacity = capacity

            self.log.debug('root_ids %s pools %s with %d osds, pg_target %d',
                           s.root_ids,
                           s.pool_ids,
                           s.osd_count,
                           s.pg_target)

        return result, pool_root


    def _get_pool_status(
            self,
            osdmap,
            pools,
            threshold=3.0,
    ):
        assert threshold >= 2.0

        crush_map = osdmap.get_crush()

        root_map, pool_root = self.get_subtree_resource_status(osdmap, crush_map)

        df = self.get('df')
        pool_stats = dict([(p['id'], p['stats']) for p in df['pools']])

        ret = []

        # iterate over all pools to determine how they should be sized
        for pool_name, p in iteritems(pools):
            pool_id = p['pool']
            if pool_id not in pool_stats:
                # race with pool deletion; skip
                continue

            # FIXME: we assume there is only one take per pool, but that
            # may not be true.
            cr_name = crush_map.get_rule_by_id(p['crush_rule'])['rule_name']
            root_id = int(crush_map.get_rule_root(cr_name))
            pool_root[pool_name] = root_id

            capacity = root_map[root_id].capacity
            if capacity == 0:
                self.log.debug('skipping empty subtree %s', cr_name)
                continue

            raw_used_rate = osdmap.pool_raw_used_rate(pool_id)

            pool_logical_used = pool_stats[pool_id]['stored']
            bias = p['options'].get('pg_autoscale_bias', 1.0)
            target_bytes = p['options'].get('target_size_bytes', 0)

            # What proportion of space are we using?
            actual_raw_used = pool_logical_used * raw_used_rate
            actual_capacity_ratio = float(actual_raw_used) / capacity

            pool_raw_used = max(pool_logical_used, target_bytes) * raw_used_rate
            capacity_ratio = float(pool_raw_used) / capacity

            target_ratio = p['options'].get('target_size_ratio', 0.0)
            final_ratio = max(capacity_ratio, target_ratio)

            # So what proportion of pg allowance should we be using?
            pool_pg_target = (final_ratio * root_map[root_id].pg_target) / p['size'] * bias

            final_pg_target = max(p['options'].get('pg_num_min', PG_NUM_MIN),
                                  nearest_power_of_two(pool_pg_target))
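
            # Worked example with hypothetical numbers: a root of 10 OSDs and
            # mon_target_pg_per_osd=100 (the Ceph default) gives
            # pg_target=1000; a size=3 pool occupying 25% of that root
            # (final_ratio=0.25, bias=1.0) wants 0.25 * 1000 / 3 ~= 83.3 PGs,
            # which nearest_power_of_two() quantizes to 64, comfortably above
            # the PG_NUM_MIN floor of 32.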

            self.log.info("Pool '{0}' root_id {1} using {2} of space, bias {3}, "
                          "pg target {4} quantized to {5} (current {6})".format(
                              p['pool_name'],
                              root_id,
                              final_ratio,
                              bias,
                              pool_pg_target,
                              final_pg_target,
                              p['pg_num_target']
                          ))

            adjust = False
            if (final_pg_target > p['pg_num_target'] * threshold or \
                final_pg_target <= p['pg_num_target'] / threshold) and \
                final_ratio >= 0.0 and \
                final_ratio <= 1.0:
                adjust = True
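
            # With threshold=3.0 and final_ratio within [0, 1], a pool
            # currently at pg_num_target=128 only adjusts once the quantized
            # target reaches 512 (> 128 * 3) or falls to 32 (<= 128 / 3),
            # which damps oscillation around small changes in usage.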

            ret.append({
                'pool_id': pool_id,
                'pool_name': p['pool_name'],
                'crush_root_id': root_id,
                'pg_autoscale_mode': p['pg_autoscale_mode'],
                'pg_num_target': p['pg_num_target'],
                'logical_used': pool_logical_used,
                'target_bytes': target_bytes,
                'raw_used_rate': raw_used_rate,
                'subtree_capacity': capacity,
                'actual_raw_used': actual_raw_used,
                'raw_used': pool_raw_used,
                'actual_capacity_ratio': actual_capacity_ratio,
                'capacity_ratio': capacity_ratio,
                'target_ratio': target_ratio,
                'pg_num_ideal': int(pool_pg_target),
                'pg_num_final': final_pg_target,
                'would_adjust': adjust,
                'bias': p.get('options', {}).get('pg_autoscale_bias', 1.0),
            })

        return (ret, root_map, pool_root)


    def _maybe_adjust(self):
        self.log.info('_maybe_adjust')
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools_by_name()
        ps, root_map, pool_root = self._get_pool_status(osdmap, pools)

        # For pools in 'warn' mode, set a health message and then drop
        # them from consideration.
        too_few = []
        too_many = []
        health_checks = {}

        total_ratio = dict([(r, 0.0) for r in iter(root_map)])
        total_target_ratio = dict([(r, 0.0) for r in iter(root_map)])
        target_ratio_pools = dict([(r, []) for r in iter(root_map)])

        total_bytes = dict([(r, 0) for r in iter(root_map)])
        total_target_bytes = dict([(r, 0.0) for r in iter(root_map)])
        target_bytes_pools = dict([(r, []) for r in iter(root_map)])

        for p in ps:
            total_ratio[p['crush_root_id']] += max(p['actual_capacity_ratio'],
                                                   p['target_ratio'])
            if p['target_ratio'] > 0:
                total_target_ratio[p['crush_root_id']] += p['target_ratio']
                target_ratio_pools[p['crush_root_id']].append(p['pool_name'])
            total_bytes[p['crush_root_id']] += max(
                p['actual_raw_used'],
                p['target_bytes'] * p['raw_used_rate'])
            if p['target_bytes'] > 0:
                total_target_bytes[p['crush_root_id']] += p['target_bytes'] * p['raw_used_rate']
                target_bytes_pools[p['crush_root_id']].append(p['pool_name'])
            if not p['would_adjust']:
                continue
            if p['pg_autoscale_mode'] == 'warn':
                msg = 'Pool %s has %d placement groups, should have %d' % (
                    p['pool_name'],
                    p['pg_num_target'],
                    p['pg_num_final'])
                if p['pg_num_final'] > p['pg_num_target']:
                    too_few.append(msg)
                else:
                    too_many.append(msg)

            if p['pg_autoscale_mode'] == 'on':
                # Note that setting pg_num actually sets pg_num_target (see
                # OSDMonitor.cc)
                r = self.mon_command({
                    'prefix': 'osd pool set',
                    'pool': p['pool_name'],
                    'var': 'pg_num',
                    'val': str(p['pg_num_final'])
                })
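
                # The call above is the programmatic equivalent of running
                # `ceph osd pool set <pool> pg_num <n>` from the CLI.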

                if r[0] != 0:
                    # FIXME: this is a serious and unexpected thing,
                    # we should expose it as a cluster log error once
                    # the hook for doing that from ceph-mgr modules is
                    # in.
                    self.log.error("pg_num adjustment on {0} to {1} failed: {2}"
                                   .format(p['pool_name'],
                                           p['pg_num_final'], r))

        if too_few:
            summary = "{0} pools have too few placement groups".format(
                len(too_few))
            health_checks['POOL_TOO_FEW_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'detail': too_few
            }
        if too_many:
            summary = "{0} pools have too many placement groups".format(
                len(too_many))
            health_checks['POOL_TOO_MANY_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'detail': too_many
            }

        too_much_target_ratio = []
        for root_id, total in iteritems(total_ratio):
            total_target = total_target_ratio[root_id]
            if total_target > 0 and total > 1.0:
                too_much_target_ratio.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'target_size_ratio %.03f on pools %s' % (
                        root_map[root_id].pool_names,
                        total,
                        total_target,
                        target_ratio_pools[root_id]
                    )
                )
            elif total_target > 1.0:
                too_much_target_ratio.append(
                    'Pools %s have collective target_size_ratio %.03f > 1.0' % (
                        root_map[root_id].pool_names,
                        total_target
                    )
                )
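
        # e.g. two pools under the same root with target_size_ratio 0.7 each
        # sum to a collective ratio of at least 1.4, so the branch above
        # reports roughly a 1.4x overcommit even before much data is written.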
        if too_much_target_ratio:
            health_checks['POOL_TARGET_SIZE_RATIO_OVERCOMMITTED'] = {
                'severity': 'warning',
                'summary': "%d subtrees have overcommitted pool target_size_ratio" % len(too_much_target_ratio),
                'detail': too_much_target_ratio,
            }

        too_much_target_bytes = []
        for root_id, total in iteritems(total_bytes):
            total_target = total_target_bytes[root_id]
            if total_target > 0 and total > root_map[root_id].capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'target_size_bytes %s on pools %s' % (
                        root_map[root_id].pool_names,
                        total / root_map[root_id].capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                        target_bytes_pools[root_id]
                    )
                )
            elif total_target > root_map[root_id].capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'collective target_size_bytes of %s' % (
                        root_map[root_id].pool_names,
                        total / root_map[root_id].capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                    )
                )
        if too_much_target_bytes:
            health_checks['POOL_TARGET_SIZE_BYTES_OVERCOMMITTED'] = {
                'severity': 'warning',
                'summary': "%d subtrees have overcommitted pool target_size_bytes" % len(too_much_target_bytes),
                'detail': too_much_target_bytes,
            }


        self.set_health_checks(health_checks)