1 | """ |
2 | Automatically scale pg_num based on how much data is stored in each pool. | |
3 | """ | |
4 | ||
5 | import errno | |
6 | import json | |
7 | import mgr_util | |
8 | import threading | |
9 | import uuid | |
81eedcae | 10 | from six import itervalues, iteritems |
11fdf7f2 | 11 | from collections import defaultdict |
eafe8130 | 12 | from prettytable import PrettyTable, PLAIN_COLUMNS |

from mgr_module import MgrModule

"""
Some terminology is made up for the purposes of this module:

 - "raw pgs": pg count after applying replication, i.e. the real resource
   consumption of a pool.
 - "grow/shrink" - increase/decrease the pg_num in a pool
 - "crush subtree" - non-overlapping domains in crush hierarchy: used as
   units of resource management.
"""

INTERVAL = 5

PG_NUM_MIN = 4  # unless specified on a per-pool basis

def nearest_power_of_two(n):
    v = int(n)

    v -= 1
    v |= v >> 1
    v |= v >> 2
    v |= v >> 4
    v |= v >> 8
    v |= v >> 16

    # High bound power of two
    v += 1
    # Low bound power of two
    x = v >> 1

    return x if (v - n) > (n - x) else v
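
# A quick sanity check of the rounding behaviour (values worked out from the
# bit-twiddling above, not taken from the original source); note that exact
# midpoints round up:
#
#   nearest_power_of_two(5)  == 4    # 5 is closer to 4 than to 8
#   nearest_power_of_two(3)  == 4    # midpoint between 2 and 4 -> rounds up
#   nearest_power_of_two(12) == 16   # midpoint between 8 and 16 -> rounds up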


class PgAutoscaler(MgrModule):
    """
    PG autoscaler.
    """
    COMMANDS = [
        {
            "cmd": "osd pool autoscale-status",
            "desc": "report on pool pg_num sizing recommendation and intent",
            "perm": "r"
        },
    ]

    NATIVE_OPTIONS = [
        'mon_target_pg_per_osd',
        'mon_max_pg_per_osd',
    ]

    MODULE_OPTIONS = [
        {
            'name': 'sleep_interval',
            'default': str(60),
        },
    ]
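
    # Both option sets are (re)loaded by config_notify() below. An operator
    # would typically tune the module option with something like this
    # (illustrative invocation, not from the original source):
    #   ceph config set mgr mgr/pg_autoscaler/sleep_interval 60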

    def __init__(self, *args, **kwargs):
        super(PgAutoscaler, self).__init__(*args, **kwargs)
        self._shutdown = threading.Event()

        # So much of what we do peeks at the osdmap that it's easiest
        # to just keep a copy of the pythonized version.
        self._osd_map = None

    def config_notify(self):
        for opt in self.NATIVE_OPTIONS:
            setattr(self,
                    opt,
                    self.get_ceph_option(opt))
            self.log.debug(' native option %s = %s', opt, getattr(self, opt))
        for opt in self.MODULE_OPTIONS:
            setattr(self,
                    opt['name'],
                    self.get_module_option(opt['name']) or opt['default'])
            self.log.debug(' mgr option %s = %s',
                           opt['name'], getattr(self, opt['name']))

    def handle_command(self, inbuf, cmd):
        if cmd['prefix'] == "osd pool autoscale-status":
            retval = self._command_autoscale_status(cmd)
        else:
            assert False  # ceph-mgr should never pass us unknown cmds
        return retval

    def _command_autoscale_status(self, cmd):
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools_by_name()
        ps, root_map, pool_root = self._get_pool_status(osdmap, pools)

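        # Machine-readable output can be requested from the CLI, e.g.
        # (illustrative): ceph osd pool autoscale-status --format json-pretty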
        if cmd.get('format') == 'json' or cmd.get('format') == 'json-pretty':
            return 0, json.dumps(ps, indent=2), ''
        else:
            table = PrettyTable(['POOL', 'SIZE', 'TARGET SIZE',
                                 'RATE', 'RAW CAPACITY',
                                 'RATIO', 'TARGET RATIO',
                                 'BIAS',
                                 'PG_NUM',
                                 # 'IDEAL',
                                 'NEW PG_NUM', 'AUTOSCALE'],
                                border=False)
            table.left_padding_width = 0
            table.right_padding_width = 1
            table.align['POOL'] = 'l'
            table.align['SIZE'] = 'r'
            table.align['TARGET SIZE'] = 'r'
            table.align['RATE'] = 'r'
            table.align['RAW CAPACITY'] = 'r'
            table.align['RATIO'] = 'r'
            table.align['TARGET RATIO'] = 'r'
            table.align['BIAS'] = 'r'
            table.align['PG_NUM'] = 'r'
            # table.align['IDEAL'] = 'r'
            table.align['NEW PG_NUM'] = 'r'
            table.align['AUTOSCALE'] = 'l'
            for p in ps:
                if p['would_adjust']:
                    final = str(p['pg_num_final'])
                else:
                    final = ''
                if p['target_bytes'] > 0:
                    ts = mgr_util.format_bytes(p['target_bytes'], 6)
                else:
                    ts = ''
                if p['target_ratio'] > 0.0:
                    tr = '%.4f' % p['target_ratio']
                else:
                    tr = ''
                table.add_row([
                    p['pool_name'],
                    mgr_util.format_bytes(p['logical_used'], 6),
                    ts,
                    p['raw_used_rate'],
                    mgr_util.format_bytes(p['subtree_capacity'], 6),
                    '%.4f' % p['capacity_ratio'],
                    tr,
                    p['bias'],
                    p['pg_num_target'],
                    # p['pg_num_ideal'],
                    final,
                    p['pg_autoscale_mode'],
                ])
            return 0, table.get_string(), ''

    def serve(self):
        self.config_notify()
        while not self._shutdown.is_set():
            self._maybe_adjust()
            self._shutdown.wait(timeout=int(self.sleep_interval))

    def get_subtree_resource_status(self, osdmap, crush):
        """
        For each CRUSH subtree of interest (i.e. the roots under which
        we have pools), calculate the current resource usages and targets,
        such as how many PGs there are, vs. how many PGs we would
        like there to be.
        """
        result = {}
        pool_root = {}
        roots = []

        class CrushSubtreeResourceStatus(object):
            def __init__(self):
                self.root_ids = []
                self.osds = set()
                self.osd_count = None  # Number of OSDs
                self.pg_target = None  # Ideal full-capacity PG count?
                self.pg_current = 0  # How many PGs already?
                self.capacity = None  # Total capacity of OSDs in subtree
                self.pool_ids = []
                self.pool_names = []

        # identify subtrees (note that they may overlap!)
        for pool_id, pool in osdmap.get_pools().items():
            cr_name = crush.get_rule_by_id(pool['crush_rule'])['rule_name']
            root_id = int(crush.get_rule_root(cr_name))
            pool_root[pool_id] = root_id
            osds = set(crush.get_osds_under(root_id))

            # do we intersect an existing root?
            s = None
            for prev in itervalues(result):
                if osds & prev.osds:
                    s = prev
                    break
            if not s:
                s = CrushSubtreeResourceStatus()
                roots.append(s)
            result[root_id] = s
            s.root_ids.append(root_id)
            s.osds |= osds
            s.pool_ids.append(int(pool_id))
            s.pool_names.append(pool['pool_name'])
            s.pg_current += pool['pg_num_target'] * pool['size']

        # finish subtrees
        all_stats = self.get('osd_stats')
        for s in roots:
            s.osd_count = len(s.osds)
            s.pg_target = s.osd_count * int(self.mon_target_pg_per_osd)
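            # e.g. a subtree of 10 OSDs with the (Nautilus-era) default
            # mon_target_pg_per_osd of 100 gets a budget of ~1000 raw PGs --
            # illustrative numbers; the real value is whatever
            # config_notify() loaded.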

            capacity = 0.0
            for osd_stats in all_stats['osd_stats']:
                if osd_stats['osd'] in s.osds:
                    # Intentionally do not apply the OSD's reweight to
                    # this, because we want to calculate PG counts based
                    # on the physical storage available, not how it is
                    # reweighted right now.
                    capacity += osd_stats['kb'] * 1024

            s.capacity = capacity

            self.log.debug('root_ids %s pools %s with %d osds, pg_target %d',
                           s.root_ids,
                           s.pool_ids,
                           s.osd_count,
                           s.pg_target)

        return result, pool_root

    def _get_pool_status(
            self,
            osdmap,
            pools,
            threshold=3.0,
    ):
        assert threshold >= 2.0

        crush_map = osdmap.get_crush()

        root_map, pool_root = self.get_subtree_resource_status(osdmap, crush_map)

        df = self.get('df')
        pool_stats = dict([(p['id'], p['stats']) for p in df['pools']])

        ret = []

        # iterate over all pools to determine how they should be sized
        for pool_name, p in iteritems(pools):
            pool_id = p['pool']
            if pool_id not in pool_stats:
                # race with pool deletion; skip
                continue

            # FIXME: we assume there is only one take per pool, but that
            # may not be true.
            cr_name = crush_map.get_rule_by_id(p['crush_rule'])['rule_name']
            root_id = int(crush_map.get_rule_root(cr_name))
            pool_root[pool_name] = root_id

            capacity = root_map[root_id].capacity
            if capacity == 0:
                self.log.debug('skipping empty subtree %s', cr_name)
                continue

            raw_used_rate = osdmap.pool_raw_used_rate(pool_id)

            pool_logical_used = pool_stats[pool_id]['stored']
            bias = p['options'].get('pg_autoscale_bias', 1.0)
            target_bytes = p['options'].get('target_size_bytes', 0)

            # What proportion of space are we using?
            actual_raw_used = pool_logical_used * raw_used_rate
            actual_capacity_ratio = float(actual_raw_used) / capacity

            pool_raw_used = max(pool_logical_used, target_bytes) * raw_used_rate
            capacity_ratio = float(pool_raw_used) / capacity

            target_ratio = p['options'].get('target_size_ratio', 0.0)
            final_ratio = max(capacity_ratio, target_ratio)

            # So what proportion of pg allowance should we be using?
            pool_pg_target = (final_ratio * root_map[root_id].pg_target) / raw_used_rate * bias

            final_pg_target = max(p['options'].get('pg_num_min', PG_NUM_MIN),
                                  nearest_power_of_two(pool_pg_target))
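            # Worked example with illustrative numbers (not from the original
            # source): a pool occupying 25% of its subtree (final_ratio 0.25)
            # under a 1000-PG budget, with 3x replication (raw_used_rate 3)
            # and bias 1.0, wants 0.25 * 1000 / 3 ~= 83 PGs, which
            # nearest_power_of_two() quantizes down to 64.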

            self.log.info("Pool '{0}' root_id {1} using {2} of space, bias {3}, "
                          "pg target {4} quantized to {5} (current {6})".format(
                              p['pool_name'],
                              root_id,
                              final_ratio,
                              bias,
                              pool_pg_target,
                              final_pg_target,
                              p['pg_num_target']
                          ))

            adjust = False
            if (final_pg_target > p['pg_num_target'] * threshold or
                    final_pg_target <= p['pg_num_target'] / threshold) and \
                    final_ratio >= 0.0 and \
                    final_ratio <= 1.0:
                adjust = True
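            # i.e. only propose a change when the ideal count is off by at
            # least `threshold` (3x by default); presumably this hysteresis
            # avoids churning pg_num (and the resulting data movement) over
            # small estimation changes.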

            ret.append({
                'pool_id': pool_id,
                'pool_name': p['pool_name'],
                'crush_root_id': root_id,
                'pg_autoscale_mode': p['pg_autoscale_mode'],
                'pg_num_target': p['pg_num_target'],
                'logical_used': pool_logical_used,
                'target_bytes': target_bytes,
                'raw_used_rate': raw_used_rate,
                'subtree_capacity': capacity,
                'actual_raw_used': actual_raw_used,
                'raw_used': pool_raw_used,
                'actual_capacity_ratio': actual_capacity_ratio,
                'capacity_ratio': capacity_ratio,
                'target_ratio': target_ratio,
                'pg_num_ideal': int(pool_pg_target),
                'pg_num_final': final_pg_target,
                'would_adjust': adjust,
                'bias': p.get('options', {}).get('pg_autoscale_bias', 1.0),
            })

        return (ret, root_map, pool_root)

    def _maybe_adjust(self):
        self.log.info('_maybe_adjust')
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools_by_name()
        ps, root_map, pool_root = self._get_pool_status(osdmap, pools)

        # For any pool in 'warn' mode, set a health message and then drop
        # the pool from consideration.
        too_few = []
        too_many = []
        health_checks = {}

        total_ratio = dict([(r, 0.0) for r in iter(root_map)])
        total_target_ratio = dict([(r, 0.0) for r in iter(root_map)])
        target_ratio_pools = dict([(r, []) for r in iter(root_map)])

        total_bytes = dict([(r, 0) for r in iter(root_map)])
        total_target_bytes = dict([(r, 0.0) for r in iter(root_map)])
        target_bytes_pools = dict([(r, []) for r in iter(root_map)])

        for p in ps:
            total_ratio[p['crush_root_id']] += max(p['actual_capacity_ratio'],
                                                   p['target_ratio'])
            if p['target_ratio'] > 0:
                total_target_ratio[p['crush_root_id']] += p['target_ratio']
                target_ratio_pools[p['crush_root_id']].append(p['pool_name'])
            total_bytes[p['crush_root_id']] += max(
                p['actual_raw_used'],
                p['target_bytes'] * p['raw_used_rate'])
            if p['target_bytes'] > 0:
                total_target_bytes[p['crush_root_id']] += p['target_bytes'] * p['raw_used_rate']
                target_bytes_pools[p['crush_root_id']].append(p['pool_name'])
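            # Note both byte sums are in raw terms: e.g. target_size_bytes of
            # 100 TiB on a 3x-replicated pool counts as 300 TiB against the
            # subtree's raw capacity (illustrative figures).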
            if not p['would_adjust']:
                continue
            if p['pg_autoscale_mode'] == 'warn':
                msg = 'Pool %s has %d placement groups, should have %d' % (
                    p['pool_name'],
                    p['pg_num_target'],
                    p['pg_num_final'])
                if p['pg_num_final'] > p['pg_num_target']:
                    too_few.append(msg)
                else:
                    too_many.append(msg)

            if p['pg_autoscale_mode'] == 'on':
                # Note that setting pg_num actually sets pg_num_target (see
                # OSDMonitor.cc)
                r = self.mon_command({
                    'prefix': 'osd pool set',
                    'pool': p['pool_name'],
                    'var': 'pg_num',
                    'val': str(p['pg_num_final'])
                })
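                # Equivalent to an operator running, e.g. (illustrative):
                #   ceph osd pool set <pool> pg_num <n>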

                if r[0] != 0:
                    # FIXME: this is a serious and unexpected thing,
                    # we should expose it as a cluster log error once
                    # the hook for doing that from ceph-mgr modules is
                    # in.
                    self.log.error("pg_num adjustment on {0} to {1} failed: {2}"
                                   .format(p['pool_name'],
                                           p['pg_num_final'], r))

        if too_few:
            summary = "{0} pools have too few placement groups".format(
                len(too_few))
            health_checks['POOL_TOO_FEW_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'detail': too_few
            }
        if too_many:
            summary = "{0} pools have too many placement groups".format(
                len(too_many))
            health_checks['POOL_TOO_MANY_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'detail': too_many
            }

        too_much_target_ratio = []
        for root_id, total in iteritems(total_ratio):
            total_target = total_target_ratio[root_id]
            if total > 1.0:
                too_much_target_ratio.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'target_size_ratio %.03f on pools %s' % (
                        root_map[root_id].pool_names,
                        total,
                        total_target,
                        target_ratio_pools[root_id]
                    )
                )
            elif total_target > 1.0:
                too_much_target_ratio.append(
                    'Pools %s have collective target_size_ratio %.03f > 1.0' % (
                        root_map[root_id].pool_names,
                        total_target
                    )
                )
        if too_much_target_ratio:
            health_checks['POOL_TARGET_SIZE_RATIO_OVERCOMMITTED'] = {
                'severity': 'warning',
                'summary': "%d subtrees have overcommitted pool target_size_ratio" % len(too_much_target_ratio),
                'detail': too_much_target_ratio,
            }

        too_much_target_bytes = []
        for root_id, total in iteritems(total_bytes):
            total_target = total_target_bytes[root_id]
            if total > root_map[root_id].capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'target_size_bytes %s on pools %s' % (
                        root_map[root_id].pool_names,
                        total / root_map[root_id].capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                        target_bytes_pools[root_id]
                    )
                )
            elif total_target > root_map[root_id].capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'collective target_size_bytes of %s' % (
                        root_map[root_id].pool_names,
                        # in this branch only total_target exceeds capacity,
                        # so report the overcommit factor it implies
                        total_target / root_map[root_id].capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                    )
                )
        if too_much_target_bytes:
            health_checks['POOL_TARGET_SIZE_BYTES_OVERCOMMITTED'] = {
                'severity': 'warning',
                'summary': "%d subtrees have overcommitted pool target_size_bytes" % len(too_much_target_bytes),
                'detail': too_much_target_bytes,
            }

        self.set_health_checks(health_checks)