# ceph/src/pybind/mgr/pg_autoscaler/module.py (15.2.0 Octopus)
1"""
2Automatically scale pg_num based on how much data is stored in each pool.
3"""
4
11fdf7f2
TL
5import json
6import mgr_util
7import threading
8import uuid
81eedcae 9from six import itervalues, iteritems
9f95a23c 10from prettytable import PrettyTable
11fdf7f2
TL
11from mgr_module import MgrModule
12
13"""
14Some terminology is made up for the purposes of this module:
15
16 - "raw pgs": pg count after applying replication, i.e. the real resource
17 consumption of a pool.
18 - "grow/shrink" - increase/decrease the pg_num in a pool
19 - "crush subtree" - non-overlapping domains in crush hierarchy: used as
20 units of resource management.
21"""
22
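# Illustrative example of "raw pgs" (hypothetical numbers): a pool with
# pg_num_target 32 and size (replica count) 3 accounts for 32 * 3 = 96
# raw pgs in its crush subtree.
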
INTERVAL = 5

PG_NUM_MIN = 32  # unless specified on a per-pool basis

def nearest_power_of_two(n):
    v = int(n)

    v -= 1
    v |= v >> 1
    v |= v >> 2
    v |= v >> 4
    v |= v >> 8
    v |= v >> 16

    # High bound power of two
    v += 1

    # Low bound power of two
    x = v >> 1

    return x if (v - n) > (n - x) else v

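# Rough behaviour sketch for nearest_power_of_two() (illustrative values):
#   nearest_power_of_two(200) -> 256   (256 - 200 = 56 < 200 - 128 = 72)
#   nearest_power_of_two(90)  -> 64    (128 - 90 = 38 > 90 - 64 = 26)
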
def effective_target_ratio(target_ratio, total_target_ratio, total_target_bytes, capacity):
    """
    Returns the target ratio after normalizing for ratios across pools and
    adjusting for capacity reserved by pools that have target_size_bytes set.
    """
    target_ratio = float(target_ratio)
    if total_target_ratio:
        target_ratio = target_ratio / total_target_ratio

    if total_target_bytes and capacity:
        fraction_available = 1.0 - min(1.0, float(total_target_bytes) / capacity)
        target_ratio *= fraction_available

    return target_ratio


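# Worked example for effective_target_ratio() (hypothetical numbers): a pool
# with target_size_ratio=0.2 in a subtree whose ratios sum to 0.5, where other
# pools' target_size_bytes reserve 25% of capacity, ends up with
#   (0.2 / 0.5) * (1.0 - 0.25) = 0.3
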
class PgAdjustmentProgress(object):
    """
    Keeps the initial and target pg_num values
    """
    def __init__(self, pool_id, pg_num, pg_num_target):
        self.ev_id = str(uuid.uuid4())
        self.pool_id = pool_id
        self.reset(pg_num, pg_num_target)

    def reset(self, pg_num, pg_num_target):
        self.pg_num = pg_num
        self.pg_num_target = pg_num_target

    def update(self, module, progress):
        desc = 'increasing' if self.pg_num < self.pg_num_target else 'decreasing'
        module.remote('progress', 'update', self.ev_id,
                      ev_msg="PG autoscaler %s pool %d PGs from %d to %d" %
                      (desc, self.pool_id, self.pg_num, self.pg_num_target),
                      ev_progress=progress,
                      refs=[("pool", self.pool_id)])

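# PgAdjustmentProgress.update() emits mgr 'progress' events with messages such
# as "PG autoscaler increasing pool 2 PGs from 32 to 128" (pool id and pg_num
# values here are hypothetical).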

class PgAutoscaler(MgrModule):
    """
    PG autoscaler.
    """
    COMMANDS = [
        {
            "cmd": "osd pool autoscale-status",
            "desc": "report on pool pg_num sizing recommendation and intent",
            "perm": "r"
        },
    ]

    NATIVE_OPTIONS = [
        'mon_target_pg_per_osd',
        'mon_max_pg_per_osd',
    ]

    MODULE_OPTIONS = [
        {
            'name': 'sleep_interval',
            'default': str(60),
        },
    ]

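    # sleep_interval is a module option, so it can typically be changed at
    # runtime with something like (illustrative invocation):
    #   ceph config set mgr mgr/pg_autoscaler/sleep_interval 60
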
    def __init__(self, *args, **kwargs):
        super(PgAutoscaler, self).__init__(*args, **kwargs)
        self._shutdown = threading.Event()
        self._event = {}

        # So much of what we do peeks at the osdmap that it's easiest
        # to just keep a copy of the pythonized version.
        self._osd_map = None

    def config_notify(self):
        for opt in self.NATIVE_OPTIONS:
            setattr(self,
                    opt,
                    self.get_ceph_option(opt))
            self.log.debug(' native option %s = %s', opt, getattr(self, opt))
        for opt in self.MODULE_OPTIONS:
            setattr(self,
                    opt['name'],
                    self.get_module_option(opt['name']))
            self.log.debug(' mgr option %s = %s',
                           opt['name'], getattr(self, opt['name']))

    def handle_command(self, inbuf, cmd):
        if cmd['prefix'] == "osd pool autoscale-status":
            retval = self._command_autoscale_status(cmd)
        else:
            assert False  # ceph-mgr should never pass us unknown cmds
        return retval

    def _command_autoscale_status(self, cmd):
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools_by_name()
        ps, root_map, pool_root = self._get_pool_status(osdmap, pools)

        if cmd.get('format') == 'json' or cmd.get('format') == 'json-pretty':
            return 0, json.dumps(ps, indent=4, sort_keys=True), ''
        else:
            table = PrettyTable(['POOL', 'SIZE', 'TARGET SIZE',
                                 'RATE', 'RAW CAPACITY',
                                 'RATIO', 'TARGET RATIO',
                                 'EFFECTIVE RATIO',
                                 'BIAS',
                                 'PG_NUM',
#                                'IDEAL',
                                 'NEW PG_NUM', 'AUTOSCALE'],
                                border=False)
            table.left_padding_width = 0
            table.right_padding_width = 2
            table.align['POOL'] = 'l'
            table.align['SIZE'] = 'r'
            table.align['TARGET SIZE'] = 'r'
            table.align['RATE'] = 'r'
            table.align['RAW CAPACITY'] = 'r'
            table.align['RATIO'] = 'r'
            table.align['TARGET RATIO'] = 'r'
            table.align['EFFECTIVE RATIO'] = 'r'
            table.align['BIAS'] = 'r'
            table.align['PG_NUM'] = 'r'
#           table.align['IDEAL'] = 'r'
            table.align['NEW PG_NUM'] = 'r'
            table.align['AUTOSCALE'] = 'l'
            for p in ps:
                if p['would_adjust']:
                    final = str(p['pg_num_final'])
                else:
                    final = ''
                if p['target_bytes'] > 0:
                    ts = mgr_util.format_bytes(p['target_bytes'], 6)
                else:
                    ts = ''
                if p['target_ratio'] > 0.0:
                    tr = '%.4f' % p['target_ratio']
                else:
                    tr = ''
                if p['effective_target_ratio'] > 0.0:
                    etr = '%.4f' % p['effective_target_ratio']
                else:
                    etr = ''
                table.add_row([
                    p['pool_name'],
                    mgr_util.format_bytes(p['logical_used'], 6),
                    ts,
                    p['raw_used_rate'],
                    mgr_util.format_bytes(p['subtree_capacity'], 6),
                    '%.4f' % p['capacity_ratio'],
                    tr,
                    etr,
                    p['bias'],
                    p['pg_num_target'],
#                   p['pg_num_ideal'],
                    final,
                    p['pg_autoscale_mode'],
                ])
            return 0, table.get_string(), ''

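    # This handler backs the CLI command registered in COMMANDS above, e.g.
    # "ceph osd pool autoscale-status" for the table, or with "--format json"
    # for the raw per-pool records (illustrative invocations).
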
    def serve(self):
        self.config_notify()
        while not self._shutdown.is_set():
            self._maybe_adjust()
            self._update_progress_events()
            self._shutdown.wait(timeout=int(self.sleep_interval))

    def shutdown(self):
        self.log.info('Stopping pg_autoscaler')
        self._shutdown.set()

    def get_subtree_resource_status(self, osdmap, crush):
        """
        For each CRUSH subtree of interest (i.e. the roots under which
        we have pools), calculate the current resource usages and targets,
        such as how many PGs there are, vs. how many PGs we would
        like there to be.
        """
        result = {}
        pool_root = {}
        roots = []

        class CrushSubtreeResourceStatus(object):
            def __init__(self):
                self.root_ids = []
                self.osds = set()
                self.osd_count = None  # Number of OSDs
                self.pg_target = None  # Ideal full-capacity PG count?
                self.pg_current = 0  # How many PGs already?
                self.capacity = None  # Total capacity of OSDs in subtree
                self.pool_ids = []
                self.pool_names = []
                self.total_target_ratio = 0.0
                self.total_target_bytes = 0  # including replication / EC overhead

        # identify subtrees (note that they may overlap!)
        for pool_id, pool in osdmap.get_pools().items():
            cr_name = crush.get_rule_by_id(pool['crush_rule'])['rule_name']
            root_id = int(crush.get_rule_root(cr_name))
            pool_root[pool_id] = root_id
            osds = set(crush.get_osds_under(root_id))

            # do we intersect an existing root?
            s = None
            for prev in itervalues(result):
                if osds & prev.osds:
                    s = prev
                    break
            if not s:
                s = CrushSubtreeResourceStatus()
                roots.append(s)
            result[root_id] = s
            s.root_ids.append(root_id)
            s.osds |= osds
            s.pool_ids.append(pool_id)
            s.pool_names.append(pool['pool_name'])
            s.pg_current += pool['pg_num_target'] * pool['size']
            target_ratio = pool['options'].get('target_size_ratio', 0.0)
            if target_ratio:
                s.total_target_ratio += target_ratio
            else:
                target_bytes = pool['options'].get('target_size_bytes', 0)
                if target_bytes:
                    s.total_target_bytes += target_bytes * osdmap.pool_raw_used_rate(pool_id)

        # finish subtrees
        all_stats = self.get('osd_stats')
        for s in roots:
            s.osd_count = len(s.osds)
            s.pg_target = s.osd_count * self.mon_target_pg_per_osd

            capacity = 0.0
            for osd_stats in all_stats['osd_stats']:
                if osd_stats['osd'] in s.osds:
                    # Intentionally do not apply the OSD's reweight to
                    # this, because we want to calculate PG counts based
                    # on the physical storage available, not how it is
                    # reweighted right now.
                    capacity += osd_stats['kb'] * 1024

            s.capacity = capacity

            self.log.debug('root_ids %s pools %s with %d osds, pg_target %d',
                           s.root_ids,
                           s.pool_ids,
                           s.osd_count,
                           s.pg_target)

        return result, pool_root

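    # Illustrative example (hypothetical cluster): a subtree containing 10 OSDs
    # with mon_target_pg_per_osd = 100 gets pg_target = 10 * 100 = 1000 raw PGs
    # to spread across its pools; capacity is the sum of the raw 'kb' stats of
    # those OSDs, deliberately ignoring reweight.
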
    def _get_pool_status(
            self,
            osdmap,
            pools,
            threshold=3.0,
    ):
        assert threshold >= 2.0

        crush_map = osdmap.get_crush()

        root_map, pool_root = self.get_subtree_resource_status(osdmap, crush_map)

        df = self.get('df')
        pool_stats = dict([(p['id'], p['stats']) for p in df['pools']])

        ret = []

        # iterate over all pools to determine how they should be sized
        for pool_name, p in iteritems(pools):
            pool_id = p['pool']
            if pool_id not in pool_stats:
                # race with pool deletion; skip
                continue

            # FIXME: we assume there is only one take per pool, but that
            # may not be true.
            cr_name = crush_map.get_rule_by_id(p['crush_rule'])['rule_name']
            root_id = int(crush_map.get_rule_root(cr_name))
            pool_root[pool_name] = root_id

            capacity = root_map[root_id].capacity
            if capacity == 0:
                self.log.debug('skipping empty subtree %s', cr_name)
                continue

            raw_used_rate = osdmap.pool_raw_used_rate(pool_id)

            pool_logical_used = pool_stats[pool_id]['stored']
            bias = p['options'].get('pg_autoscale_bias', 1.0)
            target_bytes = 0
            # ratio takes precedence if both are set
            if p['options'].get('target_size_ratio', 0.0) == 0.0:
                target_bytes = p['options'].get('target_size_bytes', 0)

            # What proportion of space are we using?
            actual_raw_used = pool_logical_used * raw_used_rate
            actual_capacity_ratio = float(actual_raw_used) / capacity

            pool_raw_used = max(pool_logical_used, target_bytes) * raw_used_rate
            capacity_ratio = float(pool_raw_used) / capacity

            self.log.info("effective_target_ratio {0} {1} {2} {3}".format(
                p['options'].get('target_size_ratio', 0.0),
                root_map[root_id].total_target_ratio,
                root_map[root_id].total_target_bytes,
                capacity))
            target_ratio = effective_target_ratio(p['options'].get('target_size_ratio', 0.0),
                                                  root_map[root_id].total_target_ratio,
                                                  root_map[root_id].total_target_bytes,
                                                  capacity)

            final_ratio = max(capacity_ratio, target_ratio)

            # So what proportion of pg allowance should we be using?
            pool_pg_target = (final_ratio * root_map[root_id].pg_target) / p['size'] * bias

            final_pg_target = max(p['options'].get('pg_num_min', PG_NUM_MIN),
                                  nearest_power_of_two(pool_pg_target))

            self.log.info("Pool '{0}' root_id {1} using {2} of space, bias {3}, "
                          "pg target {4} quantized to {5} (current {6})".format(
                              p['pool_name'],
                              root_id,
                              final_ratio,
                              bias,
                              pool_pg_target,
                              final_pg_target,
                              p['pg_num_target']
                          ))

            adjust = False
            if (final_pg_target > p['pg_num_target'] * threshold or
                    final_pg_target < p['pg_num_target'] / threshold) and \
                    final_ratio >= 0.0 and \
                    final_ratio <= 1.0:
                adjust = True

            ret.append({
                'pool_id': pool_id,
                'pool_name': p['pool_name'],
                'crush_root_id': root_id,
                'pg_autoscale_mode': p['pg_autoscale_mode'],
                'pg_num_target': p['pg_num_target'],
                'logical_used': pool_logical_used,
                'target_bytes': target_bytes,
                'raw_used_rate': raw_used_rate,
                'subtree_capacity': capacity,
                'actual_raw_used': actual_raw_used,
                'raw_used': pool_raw_used,
                'actual_capacity_ratio': actual_capacity_ratio,
                'capacity_ratio': capacity_ratio,
                'target_ratio': p['options'].get('target_size_ratio', 0.0),
                'effective_target_ratio': target_ratio,
                'pg_num_ideal': int(pool_pg_target),
                'pg_num_final': final_pg_target,
                'would_adjust': adjust,
                'bias': p.get('options', {}).get('pg_autoscale_bias', 1.0),
            })

        return (ret, root_map, pool_root)

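    # Worked example of the sizing math in _get_pool_status() (hypothetical
    # pool): final_ratio=0.25, subtree pg_target=1000, size=3, bias=1.0 gives
    # pool_pg_target = 0.25 * 1000 / 3 ~= 83.3, quantized to 64 by
    # nearest_power_of_two() and floored at pg_num_min; would_adjust is only
    # set if that result differs from the current pg_num_target by more than
    # the 3x threshold.
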
    def _update_progress_events(self):
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools()
        for pool_id in list(self._event):
            ev = self._event[pool_id]
            pool_data = pools.get(pool_id)
            if pool_data is None or pool_data['pg_num'] == pool_data['pg_num_target']:
                # pool is gone or we've reached our target
                self.remote('progress', 'complete', ev.ev_id)
                del self._event[pool_id]
                continue
            ev.update(self, (ev.pg_num - pool_data['pg_num']) / (ev.pg_num - ev.pg_num_target))

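    # Progress fraction example (hypothetical values): an event created at
    # pg_num=32 with target 128 reports (32 - 80) / (32 - 128) = 0.5 once the
    # pool's pg_num reaches 80.
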
    def _maybe_adjust(self):
        self.log.info('_maybe_adjust')
        osdmap = self.get_osdmap()
        if osdmap.get_require_osd_release() < 'nautilus':
            return
        pools = osdmap.get_pools_by_name()
        ps, root_map, pool_root = self._get_pool_status(osdmap, pools)

        # Anyone in 'warn', set the health message for them and then
        # drop them from consideration.
        too_few = []
        too_many = []
        bytes_and_ratio = []
        health_checks = {}

        total_bytes = dict([(r, 0) for r in iter(root_map)])
        total_target_bytes = dict([(r, 0.0) for r in iter(root_map)])
        target_bytes_pools = dict([(r, []) for r in iter(root_map)])

        for p in ps:
            pool_id = p['pool_id']
            pool_opts = pools[p['pool_name']]['options']
            if pool_opts.get('target_size_ratio', 0) > 0 and pool_opts.get('target_size_bytes', 0) > 0:
                bytes_and_ratio.append('Pool %s has target_size_bytes and target_size_ratio set' % p['pool_name'])
            total_bytes[p['crush_root_id']] += max(
                p['actual_raw_used'],
                p['target_bytes'] * p['raw_used_rate'])
            if p['target_bytes'] > 0:
                total_target_bytes[p['crush_root_id']] += p['target_bytes'] * p['raw_used_rate']
                target_bytes_pools[p['crush_root_id']].append(p['pool_name'])
            if not p['would_adjust']:
                continue
            if p['pg_autoscale_mode'] == 'warn':
                msg = 'Pool %s has %d placement groups, should have %d' % (
                    p['pool_name'],
                    p['pg_num_target'],
                    p['pg_num_final'])
                if p['pg_num_final'] > p['pg_num_target']:
                    too_few.append(msg)
                else:
                    too_many.append(msg)

            if p['pg_autoscale_mode'] == 'on':
                # Note that setting pg_num actually sets pg_num_target (see
                # OSDMonitor.cc)
                r = self.mon_command({
                    'prefix': 'osd pool set',
                    'pool': p['pool_name'],
                    'var': 'pg_num',
                    'val': str(p['pg_num_final'])
                })

                # create new event or update existing one to reflect
                # progress from current state to the new pg_num_target
                pool_data = pools[p['pool_name']]
                pg_num = pool_data['pg_num']
                new_target = p['pg_num_final']
                if pool_id in self._event:
                    self._event[pool_id].reset(pg_num, new_target)
                else:
                    self._event[pool_id] = PgAdjustmentProgress(pool_id, pg_num, new_target)
                self._event[pool_id].update(self, 0.0)

                if r[0] != 0:
                    # FIXME: this is a serious and unexpected thing,
                    # we should expose it as a cluster log error once
                    # the hook for doing that from ceph-mgr modules is
                    # in.
                    self.log.error("pg_num adjustment on {0} to {1} failed: {2}"
                                   .format(p['pool_name'],
                                           p['pg_num_final'], r))

        if too_few:
            summary = "{0} pools have too few placement groups".format(
                len(too_few))
            health_checks['POOL_TOO_FEW_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'count': len(too_few),
                'detail': too_few
            }
        if too_many:
            summary = "{0} pools have too many placement groups".format(
                len(too_many))
            health_checks['POOL_TOO_MANY_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'count': len(too_many),
                'detail': too_many
            }

        too_much_target_bytes = []
        for root_id, total in iteritems(total_bytes):
            total_target = total_target_bytes[root_id]
            if total_target > 0 and total > root_map[root_id].capacity and root_map[root_id].capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'target_size_bytes %s on pools %s' % (
                        root_map[root_id].pool_names,
                        total / root_map[root_id].capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                        target_bytes_pools[root_id]
                    )
                )
            elif total_target > root_map[root_id].capacity and root_map[root_id].capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'collective target_size_bytes of %s' % (
                        root_map[root_id].pool_names,
                        total / root_map[root_id].capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                    )
                )
        if too_much_target_bytes:
            health_checks['POOL_TARGET_SIZE_BYTES_OVERCOMMITTED'] = {
                'severity': 'warning',
                'summary': "%d subtrees have overcommitted pool target_size_bytes" % len(too_much_target_bytes),
                'count': len(too_much_target_bytes),
                'detail': too_much_target_bytes,
            }

        if bytes_and_ratio:
            health_checks['POOL_HAS_TARGET_SIZE_BYTES_AND_RATIO'] = {
                'severity': 'warning',
                'summary': "%d pools have both target_size_bytes and target_size_ratio set" % len(bytes_and_ratio),
                'count': len(bytes_and_ratio),
                'detail': bytes_and_ratio,
            }

        self.set_health_checks(health_checks)