# ceph/src/pybind/mgr/pg_autoscaler/module.py (Ceph 15.2.0 "Octopus")
"""
Automatically scale pg_num based on how much data is stored in each pool.
"""

import json
import mgr_util
import threading
import uuid
from six import itervalues, iteritems
from prettytable import PrettyTable
from mgr_module import MgrModule

"""
Some terminology is made up for the purposes of this module:

 - "raw pgs": pg count after applying replication, i.e. the real resource
   consumption of a pool.
 - "grow/shrink" - increase/decrease the pg_num in a pool
 - "crush subtree" - non-overlapping domains in crush hierarchy: used as
   units of resource management.
"""

INTERVAL = 5

PG_NUM_MIN = 32  # unless specified on a per-pool basis

def nearest_power_of_two(n):
    v = int(n)

    v -= 1
    v |= v >> 1
    v |= v >> 2
    v |= v >> 4
    v |= v >> 8
    v |= v >> 16

    # High bound power of two
    v += 1
    # Low bound power of two
    x = v >> 1

    return x if (v - n) > (n - x) else v

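# For example: nearest_power_of_two(200) == 256 (200 is past the midpoint
# between 128 and 256), nearest_power_of_two(160) == 128, and exact powers of
# two such as 64 are returned unchanged.
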
def effective_target_ratio(target_ratio, total_target_ratio, total_target_bytes, capacity):
    """
    Returns the target ratio after normalizing for ratios across pools and
    adjusting for capacity reserved by pools that have target_size_bytes set.
    """
    target_ratio = float(target_ratio)
    if total_target_ratio:
        target_ratio = target_ratio / total_target_ratio

    if total_target_bytes and capacity:
        fraction_available = 1.0 - min(1.0, float(total_target_bytes) / capacity)
        target_ratio *= fraction_available

    return target_ratio

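# Worked example (hypothetical numbers): with two pools whose target_size_ratio
# values are 1.0 and 3.0, total_target_ratio is 4.0, so the first pool's
# normalized ratio is 1.0 / 4.0 = 0.25.  If other pools additionally reserve
# 20% of the raw capacity via target_size_bytes, fraction_available is 0.8 and
# the effective ratio becomes 0.25 * 0.8 = 0.2.
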

class PgAdjustmentProgress(object):
    """
    Keeps the initial and target pg_num values
    """
    def __init__(self, pool_id, pg_num, pg_num_target):
        self.ev_id = str(uuid.uuid4())
        self.pool_id = pool_id
        self.reset(pg_num, pg_num_target)

    def reset(self, pg_num, pg_num_target):
        self.pg_num = pg_num
        self.pg_num_target = pg_num_target

    def update(self, module, progress):
        desc = 'increasing' if self.pg_num < self.pg_num_target else 'decreasing'
        module.remote('progress', 'update', self.ev_id,
                      ev_msg="PG autoscaler %s pool %d PGs from %d to %d" %
                      (desc, self.pool_id, self.pg_num, self.pg_num_target),
                      ev_progress=progress,
                      refs=[("pool", self.pool_id)])


class PgAutoscaler(MgrModule):
    """
    PG autoscaler.
    """
    COMMANDS = [
        {
            "cmd": "osd pool autoscale-status",
            "desc": "report on pool pg_num sizing recommendation and intent",
            "perm": "r"
        },
    ]
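
    # This command is reached from the CLI as, e.g.,
    #   ceph osd pool autoscale-status
    # and, like other mgr commands, the optional --format json / json-pretty
    # output is handled in _command_autoscale_status() below.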

    NATIVE_OPTIONS = [
        'mon_target_pg_per_osd',
        'mon_max_pg_per_osd',
    ]

    MODULE_OPTIONS = [
        {
            'name': 'sleep_interval',
            'default': str(60),
        },
    ]

    def __init__(self, *args, **kwargs):
        super(PgAutoscaler, self).__init__(*args, **kwargs)
        self._shutdown = threading.Event()
        self._event = {}

        # So much of what we do peeks at the osdmap that it's easiest
        # to just keep a copy of the pythonized version.
        self._osd_map = None

    def config_notify(self):
        for opt in self.NATIVE_OPTIONS:
            setattr(self,
                    opt,
                    self.get_ceph_option(opt))
            self.log.debug(' native option %s = %s', opt, getattr(self, opt))
        for opt in self.MODULE_OPTIONS:
            setattr(self,
                    opt['name'],
                    self.get_module_option(opt['name']))
            self.log.debug(' mgr option %s = %s',
                           opt['name'], getattr(self, opt['name']))


    def handle_command(self, inbuf, cmd):
        if cmd['prefix'] == "osd pool autoscale-status":
            retval = self._command_autoscale_status(cmd)
        else:
            assert False  # ceph-mgr should never pass us unknown cmds
        return retval

    def _command_autoscale_status(self, cmd):
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools_by_name()
        ps, root_map, pool_root = self._get_pool_status(osdmap, pools)

        if cmd.get('format') == 'json' or cmd.get('format') == 'json-pretty':
            return 0, json.dumps(ps, indent=4, sort_keys=True), ''
        else:
            table = PrettyTable(['POOL', 'SIZE', 'TARGET SIZE',
                                 'RATE', 'RAW CAPACITY',
                                 'RATIO', 'TARGET RATIO',
                                 'EFFECTIVE RATIO',
                                 'BIAS',
                                 'PG_NUM',
                                 # 'IDEAL',
                                 'NEW PG_NUM', 'AUTOSCALE'],
                                border=False)
            table.left_padding_width = 0
            table.right_padding_width = 2
            table.align['POOL'] = 'l'
            table.align['SIZE'] = 'r'
            table.align['TARGET SIZE'] = 'r'
            table.align['RATE'] = 'r'
            table.align['RAW CAPACITY'] = 'r'
            table.align['RATIO'] = 'r'
            table.align['TARGET RATIO'] = 'r'
            table.align['EFFECTIVE RATIO'] = 'r'
            table.align['BIAS'] = 'r'
            table.align['PG_NUM'] = 'r'
            # table.align['IDEAL'] = 'r'
            table.align['NEW PG_NUM'] = 'r'
            table.align['AUTOSCALE'] = 'l'
            for p in ps:
                if p['would_adjust']:
                    final = str(p['pg_num_final'])
                else:
                    final = ''
                if p['target_bytes'] > 0:
                    ts = mgr_util.format_bytes(p['target_bytes'], 6)
                else:
                    ts = ''
                if p['target_ratio'] > 0.0:
                    tr = '%.4f' % p['target_ratio']
                else:
                    tr = ''
                if p['effective_target_ratio'] > 0.0:
                    etr = '%.4f' % p['effective_target_ratio']
                else:
                    etr = ''
                table.add_row([
                    p['pool_name'],
                    mgr_util.format_bytes(p['logical_used'], 6),
                    ts,
                    p['raw_used_rate'],
                    mgr_util.format_bytes(p['subtree_capacity'], 6),
                    '%.4f' % p['capacity_ratio'],
                    tr,
                    etr,
                    p['bias'],
                    p['pg_num_target'],
                    # p['pg_num_ideal'],
                    final,
                    p['pg_autoscale_mode'],
                ])
            return 0, table.get_string(), ''

    def serve(self):
        self.config_notify()
        while not self._shutdown.is_set():
            self._maybe_adjust()
            self._update_progress_events()
            self._shutdown.wait(timeout=int(self.sleep_interval))

    def shutdown(self):
        self.log.info('Stopping pg_autoscaler')
        self._shutdown.set()

    def get_subtree_resource_status(self, osdmap, crush):
        """
        For each CRUSH subtree of interest (i.e. the roots under which
        we have pools), calculate the current resource usages and targets,
        such as how many PGs there are, vs. how many PGs we would
        like there to be.
        """
        result = {}
        pool_root = {}
        roots = []

        class CrushSubtreeResourceStatus(object):
            def __init__(self):
                self.root_ids = []
                self.osds = set()
                self.osd_count = None  # Number of OSDs
                self.pg_target = None  # Ideal full-capacity PG count?
                self.pg_current = 0  # How many PGs already?
                self.capacity = None  # Total capacity of OSDs in subtree
                self.pool_ids = []
                self.pool_names = []
                self.total_target_ratio = 0.0
                self.total_target_bytes = 0  # including replication / EC overhead

        # identify subtrees (note that they may overlap!)
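        # For example (hypothetical layout): if pool A's rule takes root
        # 'default' (osd.0-9) and pool B's rule takes root 'ssd' (osd.4-7),
        # the two OSD sets intersect, so both pools are accounted against a
        # single shared CrushSubtreeResourceStatus, reachable in 'result'
        # under either root id.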
        for pool_id, pool in osdmap.get_pools().items():
            cr_name = crush.get_rule_by_id(pool['crush_rule'])['rule_name']
            root_id = int(crush.get_rule_root(cr_name))
            pool_root[pool_id] = root_id
            osds = set(crush.get_osds_under(root_id))

            # do we intersect an existing root?
            s = None
            for prev in itervalues(result):
                if osds & prev.osds:
                    s = prev
                    break
            if not s:
                s = CrushSubtreeResourceStatus()
                roots.append(s)
            result[root_id] = s
            s.root_ids.append(root_id)
            s.osds |= osds
            s.pool_ids.append(pool_id)
            s.pool_names.append(pool['pool_name'])
            s.pg_current += pool['pg_num_target'] * pool['size']
            target_ratio = pool['options'].get('target_size_ratio', 0.0)
            if target_ratio:
                s.total_target_ratio += target_ratio
            else:
                target_bytes = pool['options'].get('target_size_bytes', 0)
                if target_bytes:
                    s.total_target_bytes += target_bytes * osdmap.pool_raw_used_rate(pool_id)

        # finish subtrees
        all_stats = self.get('osd_stats')
        for s in roots:
            s.osd_count = len(s.osds)
            s.pg_target = s.osd_count * self.mon_target_pg_per_osd

            capacity = 0.0
            for osd_stats in all_stats['osd_stats']:
                if osd_stats['osd'] in s.osds:
                    # Intentionally do not apply the OSD's reweight to
                    # this, because we want to calculate PG counts based
                    # on the physical storage available, not how it is
                    # reweighted right now.
                    capacity += osd_stats['kb'] * 1024

            s.capacity = capacity

            self.log.debug('root_ids %s pools %s with %d osds, pg_target %d',
                           s.root_ids,
                           s.pool_ids,
                           s.osd_count,
                           s.pg_target)

        return result, pool_root

    def _get_pool_status(
            self,
            osdmap,
            pools,
            threshold=3.0,
    ):
        assert threshold >= 2.0

        crush_map = osdmap.get_crush()

        root_map, pool_root = self.get_subtree_resource_status(osdmap, crush_map)

        df = self.get('df')
        pool_stats = dict([(p['id'], p['stats']) for p in df['pools']])

        ret = []

        # iterate over all pools to determine how they should be sized
        for pool_name, p in iteritems(pools):
            pool_id = p['pool']
            if pool_id not in pool_stats:
                # race with pool deletion; skip
                continue

            # FIXME: we assume there is only one take per pool, but that
            # may not be true.
            cr_name = crush_map.get_rule_by_id(p['crush_rule'])['rule_name']
            root_id = int(crush_map.get_rule_root(cr_name))
            pool_root[pool_name] = root_id

            capacity = root_map[root_id].capacity
            if capacity == 0:
                self.log.debug('skipping empty subtree %s', cr_name)
                continue

            raw_used_rate = osdmap.pool_raw_used_rate(pool_id)

            pool_logical_used = pool_stats[pool_id]['stored']
            bias = p['options'].get('pg_autoscale_bias', 1.0)
            target_bytes = 0
            # ratio takes precedence if both are set
            if p['options'].get('target_size_ratio', 0.0) == 0.0:
                target_bytes = p['options'].get('target_size_bytes', 0)

            # What proportion of space are we using?
            actual_raw_used = pool_logical_used * raw_used_rate
            actual_capacity_ratio = float(actual_raw_used) / capacity

            pool_raw_used = max(pool_logical_used, target_bytes) * raw_used_rate
            capacity_ratio = float(pool_raw_used) / capacity

            self.log.info("effective_target_ratio {0} {1} {2} {3}".format(
                p['options'].get('target_size_ratio', 0.0),
                root_map[root_id].total_target_ratio,
                root_map[root_id].total_target_bytes,
                capacity))
            target_ratio = effective_target_ratio(p['options'].get('target_size_ratio', 0.0),
                                                  root_map[root_id].total_target_ratio,
                                                  root_map[root_id].total_target_bytes,
                                                  capacity)

            final_ratio = max(capacity_ratio, target_ratio)

            # So what proportion of pg allowance should we be using?
            pool_pg_target = (final_ratio * root_map[root_id].pg_target) / p['size'] * bias

            final_pg_target = max(p['options'].get('pg_num_min', PG_NUM_MIN),
                                  nearest_power_of_two(pool_pg_target))
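            # Worked example (hypothetical numbers): on a root with 30 OSDs and
            # mon_target_pg_per_osd = 100, pg_target is 3000.  A 3-replica pool
            # occupying 25% of that root's capacity with bias 1.0 gets
            # (0.25 * 3000) / 3 * 1.0 = 250 PGs, quantized by
            # nearest_power_of_two() to 256 (and never below its pg_num_min).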

            self.log.info("Pool '{0}' root_id {1} using {2} of space, bias {3}, "
                          "pg target {4} quantized to {5} (current {6})".format(
                              p['pool_name'],
                              root_id,
                              final_ratio,
                              bias,
                              pool_pg_target,
                              final_pg_target,
                              p['pg_num_target']
                          ))

            adjust = False
            if (final_pg_target > p['pg_num_target'] * threshold or \
                final_pg_target < p['pg_num_target'] / threshold) and \
                final_ratio >= 0.0 and \
                final_ratio <= 1.0:
                adjust = True

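            # With the default threshold of 3.0 this acts as hysteresis: a pool
            # currently at pg_num_target 32 is only flagged for adjustment once
            # the quantized ideal exceeds 96 or drops below ~11 (and the ratio
            # is sane), which avoids flapping on small changes in usage.
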
            ret.append({
                'pool_id': pool_id,
                'pool_name': p['pool_name'],
                'crush_root_id': root_id,
                'pg_autoscale_mode': p['pg_autoscale_mode'],
                'pg_num_target': p['pg_num_target'],
                'logical_used': pool_logical_used,
                'target_bytes': target_bytes,
                'raw_used_rate': raw_used_rate,
                'subtree_capacity': capacity,
                'actual_raw_used': actual_raw_used,
                'raw_used': pool_raw_used,
                'actual_capacity_ratio': actual_capacity_ratio,
                'capacity_ratio': capacity_ratio,
                'target_ratio': p['options'].get('target_size_ratio', 0.0),
                'effective_target_ratio': target_ratio,
                'pg_num_ideal': int(pool_pg_target),
                'pg_num_final': final_pg_target,
                'would_adjust': adjust,
                'bias': p.get('options', {}).get('pg_autoscale_bias', 1.0),
            })

        return (ret, root_map, pool_root)

    def _update_progress_events(self):
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools()
        for pool_id in list(self._event):
            ev = self._event[pool_id]
            pool_data = pools.get(pool_id)
            if pool_data is None or pool_data['pg_num'] == pool_data['pg_num_target']:
                # pool is gone or we've reached our target
                self.remote('progress', 'complete', ev.ev_id)
                del self._event[pool_id]
                continue
            ev.update(self, (ev.pg_num - pool_data['pg_num']) / (ev.pg_num - ev.pg_num_target))
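            # e.g. an event created at pg_num 32 with target 128 reports
            # (32 - 64) / (32 - 128) = 0.333 once the pool reaches pg_num 64.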

    def _maybe_adjust(self):
        self.log.info('_maybe_adjust')
        osdmap = self.get_osdmap()
        if osdmap.get_require_osd_release() < 'nautilus':
            return
        pools = osdmap.get_pools_by_name()
        ps, root_map, pool_root = self._get_pool_status(osdmap, pools)

        # Anyone in 'warn', set the health message for them and then
        # drop them from consideration.
        too_few = []
        too_many = []
        bytes_and_ratio = []
        health_checks = {}

        total_bytes = dict([(r, 0) for r in iter(root_map)])
        total_target_bytes = dict([(r, 0.0) for r in iter(root_map)])
        target_bytes_pools = dict([(r, []) for r in iter(root_map)])

        for p in ps:
            pool_id = p['pool_id']
            pool_opts = pools[p['pool_name']]['options']
            if pool_opts.get('target_size_ratio', 0) > 0 and pool_opts.get('target_size_bytes', 0) > 0:
                bytes_and_ratio.append('Pool %s has target_size_bytes and target_size_ratio set' % p['pool_name'])
            total_bytes[p['crush_root_id']] += max(
                p['actual_raw_used'],
                p['target_bytes'] * p['raw_used_rate'])
            if p['target_bytes'] > 0:
                total_target_bytes[p['crush_root_id']] += p['target_bytes'] * p['raw_used_rate']
                target_bytes_pools[p['crush_root_id']].append(p['pool_name'])
            if not p['would_adjust']:
                continue
            if p['pg_autoscale_mode'] == 'warn':
                msg = 'Pool %s has %d placement groups, should have %d' % (
                    p['pool_name'],
                    p['pg_num_target'],
                    p['pg_num_final'])
                if p['pg_num_final'] > p['pg_num_target']:
                    too_few.append(msg)
                else:
                    too_many.append(msg)

            if p['pg_autoscale_mode'] == 'on':
                # Note that setting pg_num actually sets pg_num_target (see
                # OSDMonitor.cc)
                r = self.mon_command({
                    'prefix': 'osd pool set',
                    'pool': p['pool_name'],
                    'var': 'pg_num',
                    'val': str(p['pg_num_final'])
                })

                # create new event or update existing one to reflect
                # progress from current state to the new pg_num_target
                pool_data = pools[p['pool_name']]
                pg_num = pool_data['pg_num']
                new_target = p['pg_num_final']
                if pool_id in self._event:
                    self._event[pool_id].reset(pg_num, new_target)
                else:
                    self._event[pool_id] = PgAdjustmentProgress(pool_id, pg_num, new_target)
                self._event[pool_id].update(self, 0.0)

                if r[0] != 0:
                    # FIXME: this is a serious and unexpected thing,
                    # we should expose it as a cluster log error once
                    # the hook for doing that from ceph-mgr modules is
                    # in.
                    self.log.error("pg_num adjustment on {0} to {1} failed: {2}"
                                   .format(p['pool_name'],
                                           p['pg_num_final'], r))

        if too_few:
            summary = "{0} pools have too few placement groups".format(
                len(too_few))
            health_checks['POOL_TOO_FEW_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'count': len(too_few),
                'detail': too_few
            }
        if too_many:
            summary = "{0} pools have too many placement groups".format(
                len(too_many))
            health_checks['POOL_TOO_MANY_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'count': len(too_many),
                'detail': too_many
            }

        too_much_target_bytes = []
        for root_id, total in iteritems(total_bytes):
            total_target = total_target_bytes[root_id]
            if total_target > 0 and total > root_map[root_id].capacity and root_map[root_id].capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'target_size_bytes %s on pools %s' % (
                        root_map[root_id].pool_names,
                        total / root_map[root_id].capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                        target_bytes_pools[root_id]
                    )
                )
            elif total_target > root_map[root_id].capacity and root_map[root_id].capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'collective target_size_bytes of %s' % (
                        root_map[root_id].pool_names,
                        total / root_map[root_id].capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                    )
                )
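        # Worked example (hypothetical numbers): two 3-replica pools on a 1 TiB
        # root, each with target_size_bytes = 600 GiB, reserve 2 * 600 GiB * 3
        # = 3600 GiB of raw space, so the check above reports roughly a 3.5x
        # overcommit for that root.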
        if too_much_target_bytes:
            health_checks['POOL_TARGET_SIZE_BYTES_OVERCOMMITTED'] = {
                'severity': 'warning',
                'summary': "%d subtrees have overcommitted pool target_size_bytes" % len(too_much_target_bytes),
                'count': len(too_much_target_bytes),
                'detail': too_much_target_bytes,
            }

        if bytes_and_ratio:
            health_checks['POOL_HAS_TARGET_SIZE_BYTES_AND_RATIO'] = {
                'severity': 'warning',
                'summary': "%d pools have both target_size_bytes and target_size_ratio set" % len(bytes_and_ratio),
                'count': len(bytes_and_ratio),
                'detail': bytes_and_ratio,
            }

        self.set_health_checks(health_checks)