# ceph/src/pybind/mgr/pg_autoscaler/module.py
"""
Automatically scale pg_num based on how much data is stored in each pool.
"""
import json
import mgr_util
import threading
from six import itervalues, iteritems
from collections import defaultdict
from prettytable import PrettyTable, PLAIN_COLUMNS

from mgr_module import MgrModule

"""
Some terminology is made up for the purposes of this module:

 - "raw pgs": pg count after applying replication, i.e. the real resource
   consumption of a pool.
 - "grow/shrink": increase or decrease the pg_num in a pool.
 - "crush subtree": non-overlapping domains in the crush hierarchy, used as
   units of resource management.
"""
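
# Illustrative example (not from the original source): a replicated pool
# with size=3 and pg_num=128 consumes 128 * 3 = 384 "raw pgs", since each
# PG is placed on three OSDs; this matches the pg_current accounting below
# (pg_num_target * size).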

PG_NUM_MIN = 32  # unless specified on a per-pool basis


def nearest_power_of_two(n):
    v = int(n)

    # Smear the highest set bit into all lower bits, so that v becomes
    # one less than the next power of two >= n.
    v -= 1
    v |= v >> 1
    v |= v >> 2
    v |= v >> 4
    v |= v >> 8
    v |= v >> 16

    # High bound power of two
    v += 1

    # Low bound power of two
    x = v >> 1

    return x if (v - n) > (n - x) else v
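
# A quick sanity check of the rounding behaviour (illustrative, not part
# of the original module):
#
#   nearest_power_of_two(80)   -> 64   (80 - 64 = 16 < 128 - 80 = 48)
#   nearest_power_of_two(100)  -> 128  (128 - 100 = 28 < 100 - 64 = 36)
#
# An exact power of two is returned unchanged, and ties round up.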


class PgAutoscaler(MgrModule):
    """
    PG autoscaler.
    """
    COMMANDS = [
        {
            "cmd": "osd pool autoscale-status",
            "desc": "report on pool pg_num sizing recommendation and intent",
            "perm": "r",
        },
    ]

    NATIVE_OPTIONS = [
        'mon_target_pg_per_osd',
    ]

    MODULE_OPTIONS = [
        {
            'name': 'sleep_interval',
            'default': str(60),
        },
    ]

    def __init__(self, *args, **kwargs):
        super(PgAutoscaler, self).__init__(*args, **kwargs)
        self._shutdown = threading.Event()

        # So much of what we do peeks at the osdmap that it's easiest
        # to just keep a copy of the pythonized version.
        self._osd_map = None

    def config_notify(self):
        for opt in self.NATIVE_OPTIONS:
            setattr(self,
                    opt,
                    self.get_ceph_option(opt))
            self.log.debug(' native option %s = %s', opt, getattr(self, opt))
        for opt in self.MODULE_OPTIONS:
            setattr(self,
                    opt['name'],
                    self.get_module_option(opt['name']) or opt['default'])
            self.log.debug(' mgr option %s = %s',
                           opt['name'], getattr(self, opt['name']))

    def handle_command(self, inbuf, cmd):
        if cmd['prefix'] == "osd pool autoscale-status":
            retval = self._command_autoscale_status(cmd)
        else:
            assert False  # ceph-mgr should never pass us unknown cmds
        return retval

    def _command_autoscale_status(self, cmd):
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools_by_name()
        ps, root_map, pool_root = self._get_pool_status(osdmap, pools)

        if cmd.get('format') == 'json' or cmd.get('format') == 'json-pretty':
            return 0, json.dumps(ps, indent=2), ''
        else:
            table = PrettyTable(['POOL', 'SIZE', 'TARGET SIZE',
                                 'RATE', 'RAW CAPACITY',
                                 'RATIO', 'TARGET RATIO',
                                 'BIAS',
                                 'PG_NUM',
#                                 'IDEAL',
                                 'NEW PG_NUM', 'AUTOSCALE'],
                                border=False)
            table.left_padding_width = 0
            table.right_padding_width = 1
            table.align['POOL'] = 'l'
            table.align['SIZE'] = 'r'
            table.align['TARGET SIZE'] = 'r'
            table.align['RATE'] = 'r'
            table.align['RAW CAPACITY'] = 'r'
            table.align['RATIO'] = 'r'
            table.align['TARGET RATIO'] = 'r'
            table.align['BIAS'] = 'r'
            table.align['PG_NUM'] = 'r'
            # table.align['IDEAL'] = 'r'
            table.align['NEW PG_NUM'] = 'r'
            table.align['AUTOSCALE'] = 'l'
            for p in ps:
                if p['would_adjust']:
                    final = str(p['pg_num_final'])
                else:
                    final = ''
                if p['target_bytes'] > 0:
                    ts = mgr_util.format_bytes(p['target_bytes'], 6)
                else:
                    ts = ''
                if p['target_ratio'] > 0.0:
                    tr = '%.4f' % p['target_ratio']
                else:
                    tr = ''
                table.add_row([
                    p['pool_name'],
                    mgr_util.format_bytes(p['logical_used'], 6),
                    ts,
                    p['raw_used_rate'],
                    mgr_util.format_bytes(p['subtree_capacity'], 6),
                    '%.4f' % p['capacity_ratio'],
                    tr,
                    p['bias'],
                    p['pg_num_target'],
                    final,
                    p['pg_autoscale_mode'],
                ])
            return 0, table.get_string(), ''
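
    # The handler above backs the CLI command registered in COMMANDS; an
    # illustrative invocation (not shown in the original source) would be:
    #
    #   ceph osd pool autoscale-status
    #   ceph osd pool autoscale-status --format json
    #
    # The json/json-pretty branch returns the raw per-pool dicts that the
    # PrettyTable branch renders as a summary table.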

    def serve(self):
        self.config_notify()
        while not self._shutdown.is_set():
            self._maybe_adjust()
            self._shutdown.wait(timeout=int(self.sleep_interval))

    def shutdown(self):
        self.log.info('Stopping pg_autoscaler')
        self._shutdown.set()

    def get_subtree_resource_status(self, osdmap, crush):
        """
        For each CRUSH subtree of interest (i.e. the roots under which
        we have pools), calculate the current resource usages and targets,
        such as how many PGs there are vs. how many PGs we would like
        there to be.
        """
        result = {}
        pool_root = {}
        roots = []

        class CrushSubtreeResourceStatus(object):
            def __init__(self):
                self.root_ids = []
                self.osds = set()
                self.osd_count = None  # Number of OSDs
                self.pg_target = None  # Ideal full-capacity PG count?
                self.pg_current = 0  # How many PGs already?
                self.capacity = None  # Total capacity of OSDs in subtree
                self.pool_ids = []
                self.pool_names = []

        # identify subtrees (note that they may overlap!)
        for pool_id, pool in osdmap.get_pools().items():
            cr_name = crush.get_rule_by_id(pool['crush_rule'])['rule_name']
            root_id = int(crush.get_rule_root(cr_name))
            pool_root[pool_id] = root_id
            osds = set(crush.get_osds_under(root_id))

            # do we intersect an existing root?
            s = None
            for prev in itervalues(result):
                if osds & prev.osds:
                    s = prev
                    break
            if s is None:
                s = CrushSubtreeResourceStatus()
                roots.append(s)
            result[root_id] = s
            s.root_ids.append(root_id)
            s.osds |= osds
            s.pool_ids.append(int(pool_id))
            s.pool_names.append(pool['pool_name'])
            s.pg_current += pool['pg_num_target'] * pool['size']

        # finish subtrees
        all_stats = self.get('osd_stats')
        for s in roots:
            s.osd_count = len(s.osds)
            s.pg_target = s.osd_count * int(self.mon_target_pg_per_osd)
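
            # For example (illustrative numbers, assuming the usual
            # mon_target_pg_per_osd default of 100): a subtree with 12 OSDs
            # gets pg_target = 12 * 100 = 1200 raw PGs to spread across its
            # pools.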

            capacity = 0.0
            for osd_stats in all_stats['osd_stats']:
                if osd_stats['osd'] in s.osds:
                    # Intentionally do not apply the OSD's reweight to
                    # this, because we want to calculate PG counts based
                    # on the physical storage available, not how it is
                    # reweighted right now.
                    capacity += osd_stats['kb'] * 1024

            s.capacity = capacity

            self.log.debug('root_ids %s pools %s with %d osds, pg_target %d',
                           s.root_ids,
                           s.pool_ids,
                           s.osd_count,
                           s.pg_target)

        return result, pool_root

    def _get_pool_status(
            self,
            osdmap,
            pools,
            threshold=3.0,
    ):
        assert threshold >= 2.0

        crush_map = osdmap.get_crush()

        root_map, pool_root = self.get_subtree_resource_status(osdmap, crush_map)

        df = self.get('df')
        pool_stats = dict([(p['id'], p['stats']) for p in df['pools']])

        ret = []

        # iterate over all pools to determine how they should be sized
        for pool_name, p in iteritems(pools):
            pool_id = p['pool']
            if pool_id not in pool_stats:
                # race with pool deletion; skip
                continue

            # FIXME: we assume there is only one take per pool, but that
            # may not be true.
            cr_name = crush_map.get_rule_by_id(p['crush_rule'])['rule_name']
            root_id = int(crush_map.get_rule_root(cr_name))
            pool_root[pool_name] = root_id

            capacity = root_map[root_id].capacity
            if capacity == 0:
                self.log.debug('skipping empty subtree %s', cr_name)
                continue

            raw_used_rate = osdmap.pool_raw_used_rate(pool_id)

            pool_logical_used = pool_stats[pool_id]['stored']
            bias = p['options'].get('pg_autoscale_bias', 1.0)
            target_bytes = p['options'].get('target_size_bytes', 0)

            # What proportion of space are we using?
            actual_raw_used = pool_logical_used * raw_used_rate
            actual_capacity_ratio = float(actual_raw_used) / capacity

            pool_raw_used = max(pool_logical_used, target_bytes) * raw_used_rate
            capacity_ratio = float(pool_raw_used) / capacity

            target_ratio = p['options'].get('target_size_ratio', 0.0)
            final_ratio = max(capacity_ratio, target_ratio)

            # So what proportion of pg allowance should we be using?
            pool_pg_target = (final_ratio * root_map[root_id].pg_target) / p['size'] * bias

            final_pg_target = max(p['options'].get('pg_num_min', PG_NUM_MIN),
                                  nearest_power_of_two(pool_pg_target))
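
            # Worked example (illustrative numbers only): with a subtree
            # pg_target of 1000, final_ratio 0.25, size 3, and bias 1.0:
            #   pool_pg_target  = 0.25 * 1000 / 3 * 1.0 ~= 83.3
            #   nearest_power_of_two(83.3) -> 64   (64 is closer than 128)
            #   final_pg_target = max(PG_NUM_MIN, 64) = 64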

            self.log.info("Pool '{0}' root_id {1} using {2} of space, bias {3}, "
                          "pg target {4} quantized to {5} (current {6})".format(
                              p['pool_name'],
                              root_id,
                              final_ratio,
                              bias,
                              pool_pg_target,
                              final_pg_target,
                              p['pg_num_target']
                          ))

            adjust = False
            if (final_pg_target > p['pg_num_target'] * threshold or \
                final_pg_target <= p['pg_num_target'] / threshold) and \
                final_ratio >= 0.0 and \
                final_ratio <= 1.0:
                adjust = True
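
            # e.g. (illustrative): with the threshold of 3.0 and a current
            # pg_num_target of 64, adjust only flips to True once the ideal
            # count rises above 64 * 3 = 192 or falls to 64 / 3 ~= 21 or
            # below, which keeps the autoscaler from flapping on small
            # changes in usage.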

            ret.append({
                'pool_id': pool_id,
                'pool_name': p['pool_name'],
                'crush_root_id': root_id,
                'pg_autoscale_mode': p['pg_autoscale_mode'],
                'pg_num_target': p['pg_num_target'],
                'logical_used': pool_logical_used,
                'target_bytes': target_bytes,
                'raw_used_rate': raw_used_rate,
                'subtree_capacity': capacity,
                'actual_raw_used': actual_raw_used,
                'raw_used': pool_raw_used,
                'actual_capacity_ratio': actual_capacity_ratio,
                'capacity_ratio': capacity_ratio,
                'target_ratio': target_ratio,
                'pg_num_ideal': int(pool_pg_target),
                'pg_num_final': final_pg_target,
                'would_adjust': adjust,
                'bias': p.get('options', {}).get('pg_autoscale_bias', 1.0),
            })

        return (ret, root_map, pool_root)

    def _maybe_adjust(self):
        self.log.info('_maybe_adjust')
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools_by_name()
        ps, root_map, pool_root = self._get_pool_status(osdmap, pools)

        # Anyone in 'warn', set the health message for them and then
        # drop them from consideration.
        too_few = []
        too_many = []
        health_checks = {}

        total_ratio = dict([(r, 0.0) for r in iter(root_map)])
        total_target_ratio = dict([(r, 0.0) for r in iter(root_map)])
        target_ratio_pools = dict([(r, []) for r in iter(root_map)])

        total_bytes = dict([(r, 0) for r in iter(root_map)])
        total_target_bytes = dict([(r, 0.0) for r in iter(root_map)])
        target_bytes_pools = dict([(r, []) for r in iter(root_map)])

        for p in ps:
            total_ratio[p['crush_root_id']] += max(p['actual_capacity_ratio'],
                                                   p['target_ratio'])
            if p['target_ratio'] > 0:
                total_target_ratio[p['crush_root_id']] += p['target_ratio']
                target_ratio_pools[p['crush_root_id']].append(p['pool_name'])
            total_bytes[p['crush_root_id']] += max(
                p['actual_raw_used'],
                p['target_bytes'] * p['raw_used_rate'])
            if p['target_bytes'] > 0:
                total_target_bytes[p['crush_root_id']] += p['target_bytes'] * p['raw_used_rate']
                target_bytes_pools[p['crush_root_id']].append(p['pool_name'])
            if not p['would_adjust']:
                continue
            if p['pg_autoscale_mode'] == 'warn':
                msg = 'Pool %s has %d placement groups, should have %d' % (
                    p['pool_name'],
                    p['pg_num_target'],
                    p['pg_num_final'])
                if p['pg_num_final'] > p['pg_num_target']:
                    too_few.append(msg)
                else:
                    too_many.append(msg)

            if p['pg_autoscale_mode'] == 'on':
                # Note that setting pg_num actually sets pg_num_target (see
                # OSDMonitor.cc)
                r = self.mon_command({
                    'prefix': 'osd pool set',
                    'pool': p['pool_name'],
                    'var': 'pg_num',
                    'val': str(p['pg_num_final'])
                })

                if r[0] != 0:
                    # FIXME: this is a serious and unexpected thing,
                    # we should expose it as a cluster log error once
                    # the hook for doing that from ceph-mgr modules is
                    # in.
                    self.log.error("pg_num adjustment on {0} to {1} failed: {2}"
                                   .format(p['pool_name'],
                                           p['pg_num_final'], r))

        if too_few:
            summary = "{0} pools have too few placement groups".format(
                len(too_few))
            health_checks['POOL_TOO_FEW_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'detail': too_few
            }
        if too_many:
            summary = "{0} pools have too many placement groups".format(
                len(too_many))
            health_checks['POOL_TOO_MANY_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'detail': too_many
            }

        too_much_target_ratio = []
        for root_id, total in iteritems(total_ratio):
            total_target = total_target_ratio[root_id]
            if total_target > 0 and total > 1.0:
                too_much_target_ratio.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'target_size_ratio %.03f on pools %s' % (
                        root_map[root_id].pool_names,
                        total,
                        total_target,
                        target_ratio_pools[root_id]
                    )
                )
            elif total_target > 1.0:
                too_much_target_ratio.append(
                    'Pools %s have collective target_size_ratio %.03f > 1.0' % (
                        root_map[root_id].pool_names,
                        total_target
                    )
                )
        if too_much_target_ratio:
            health_checks['POOL_TARGET_SIZE_RATIO_OVERCOMMITTED'] = {
                'severity': 'warning',
                'summary': "%d subtrees have overcommitted pool target_size_ratio" % len(too_much_target_ratio),
                'detail': too_much_target_ratio,
            }
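
        # For instance (illustrative): two pools in the same subtree, each
        # with target_size_ratio 0.7, give a collective ratio of 1.4 > 1.0,
        # which raises POOL_TARGET_SIZE_RATIO_OVERCOMMITTED even before any
        # data is written.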

        too_much_target_bytes = []
        for root_id, total in iteritems(total_bytes):
            total_target = total_target_bytes[root_id]
            if total_target > 0 and total > root_map[root_id].capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'target_size_bytes %s on pools %s' % (
                        root_map[root_id].pool_names,
                        total / root_map[root_id].capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                        target_bytes_pools[root_id]
                    )
                )
            elif total_target > root_map[root_id].capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'collective target_size_bytes of %s' % (
                        root_map[root_id].pool_names,
                        total / root_map[root_id].capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                    )
                )
        if too_much_target_bytes:
            health_checks['POOL_TARGET_SIZE_BYTES_OVERCOMMITTED'] = {
                'severity': 'warning',
                'summary': "%d subtrees have overcommitted pool target_size_bytes" % len(too_much_target_bytes),
                'detail': too_much_target_bytes,
            }

        self.set_health_checks(health_checks)