# Source: ceph/src/pybind/mgr/pg_autoscaler/module.py (extracted from a gitweb blob view;
# extraction has fused the original line numbers into the text and dropped many lines)
2 Automatically scale pg_num based on how much data is stored in each pool.
9 from six
import itervalues
, iteritems
10 from prettytable
import PrettyTable
11 from mgr_module
import MgrModule
14 Some terminology is made up for the purposes of this module:
16 - "raw pgs": pg count after applying replication, i.e. the real resource
17 consumption of a pool.
18 - "grow/shrink" - increase/decrease the pg_num in a pool
19 - "crush subtree" - non-overlapping domains in crush hierarchy: used as
20 units of resource management.
25 PG_NUM_MIN
= 32 # unless specified on a per-pool basis
def nearest_power_of_two(n):
    """
    Return the power of two nearest to ``n``.

    The low bound (largest power of two <= int(n)) is chosen only when it
    is strictly nearer to ``n`` than the high bound; ties go to the high
    bound.  Callers clamp the result with ``max(pg_num_min, ...)``, so
    values of ``n`` below 1 are not significant.
    """
    # High bound: smallest power of two >= int(n).
    v = 1 << max(int(n) - 1, 0).bit_length()

    # Low bound: the next power of two down.
    x = v >> 1

    # Pick whichever bound is nearer to n (the high bound wins ties).
    return x if (v - n) > (n - x) else v
def effective_target_ratio(target_ratio, total_target_ratio,
                           total_target_bytes, capacity):
    """
    Returns the target ratio after normalizing for ratios across pools and
    adjusting for capacity reserved by pools that have target_size_bytes set.

    :param target_ratio: this pool's target_size_ratio option
    :param total_target_ratio: sum of target_size_ratio across all pools
                               in the same crush subtree
    :param total_target_bytes: raw bytes reserved by pools that set
                               target_size_bytes (replication included)
    :param capacity: raw capacity of the crush subtree
    """
    target_ratio = float(target_ratio)
    if total_target_ratio:
        # Normalize so the per-pool ratios are taken relative to the
        # subtree-wide total.
        target_ratio = target_ratio / total_target_ratio

    if total_target_bytes and capacity:
        # Scale down by whatever fraction of raw capacity is already
        # reserved by pools that set target_size_bytes.
        fraction_available = 1.0 - min(1.0, float(total_target_bytes) / capacity)
        target_ratio *= fraction_available

    return target_ratio
class PgAdjustmentProgress(object):
    """
    Keeps the initial and target pg_num values for one in-flight pg_num
    adjustment and mirrors its progress into the mgr 'progress' module.
    """
    def __init__(self, pool_id, pg_num, pg_num_target):
        # Stable event id so repeated updates address the same progress event.
        self.ev_id = str(uuid.uuid4())
        self.pool_id = pool_id
        self.reset(pg_num, pg_num_target)

    def reset(self, pg_num, pg_num_target):
        # Re-arm the event with a new starting point and target.
        self.pg_num = pg_num
        self.pg_num_target = pg_num_target

    def update(self, module, progress):
        # 'progress' is the completed fraction supplied by the caller
        # (see _update_progress_events).
        desc = 'increasing' if self.pg_num < self.pg_num_target else 'decreasing'
        module.remote('progress', 'update', self.ev_id,
                      ev_msg="PG autoscaler %s pool %d PGs from %d to %d" %
                      (desc, self.pool_id, self.pg_num, self.pg_num_target),
                      ev_progress=progress,
                      refs=[("pool", self.pool_id)])
83 class PgAutoscaler(MgrModule
):
89 "cmd": "osd pool autoscale-status",
90 "desc": "report on pool pg_num sizing recommendation and intent",
96 'mon_target_pg_per_osd',
102 'name': 'sleep_interval',
107 def __init__(self
, *args
, **kwargs
):
108 super(PgAutoscaler
, self
).__init
__(*args
, **kwargs
)
109 self
._shutdown
= threading
.Event()
112 # So much of what we do peeks at the osdmap that it's easiest
113 # to just keep a copy of the pythonized version.
def config_notify(self):
    """Refresh cached option values when the configuration changes."""
    # Cache native (ceph) options as attributes on the module instance.
    for opt in self.NATIVE_OPTIONS:
        setattr(self,
                opt,
                self.get_ceph_option(opt))
        self.log.debug(' native option %s = %s', opt, getattr(self, opt))
    # Cache module options the same way.
    for opt in self.MODULE_OPTIONS:
        setattr(self,
                opt['name'],
                self.get_module_option(opt['name']))
        self.log.debug(' mgr option %s = %s',
                       opt['name'], getattr(self, opt['name']))
def handle_command(self, inbuf, cmd):
    """Dispatch a mgr command to its handler and return its result tuple."""
    if cmd['prefix'] == "osd pool autoscale-status":
        retval = self._command_autoscale_status(cmd)
    else:
        assert False  # ceph-mgr should never pass us unknown cmds
    return retval
137 def _command_autoscale_status(self
, cmd
):
138 osdmap
= self
.get_osdmap()
139 pools
= osdmap
.get_pools_by_name()
140 ps
, root_map
, pool_root
= self
._get
_pool
_status
(osdmap
, pools
)
142 if cmd
.get('format') == 'json' or cmd
.get('format') == 'json-pretty':
143 return 0, json
.dumps(ps
, indent
=4, sort_keys
=True), ''
145 table
= PrettyTable(['POOL', 'SIZE', 'TARGET SIZE',
146 'RATE', 'RAW CAPACITY',
147 'RATIO', 'TARGET RATIO',
152 'NEW PG_NUM', 'AUTOSCALE'],
154 table
.left_padding_width
= 0
155 table
.right_padding_width
= 2
156 table
.align
['POOL'] = 'l'
157 table
.align
['SIZE'] = 'r'
158 table
.align
['TARGET SIZE'] = 'r'
159 table
.align
['RATE'] = 'r'
160 table
.align
['RAW CAPACITY'] = 'r'
161 table
.align
['RATIO'] = 'r'
162 table
.align
['TARGET RATIO'] = 'r'
163 table
.align
['EFFECTIVE RATIO'] = 'r'
164 table
.align
['BIAS'] = 'r'
165 table
.align
['PG_NUM'] = 'r'
166 # table.align['IDEAL'] = 'r'
167 table
.align
['NEW PG_NUM'] = 'r'
168 table
.align
['AUTOSCALE'] = 'l'
170 if p
['would_adjust']:
171 final
= str(p
['pg_num_final'])
174 if p
['target_bytes'] > 0:
175 ts
= mgr_util
.format_bytes(p
['target_bytes'], 6)
178 if p
['target_ratio'] > 0.0:
179 tr
= '%.4f' % p
['target_ratio']
182 if p
['effective_target_ratio'] > 0.0:
183 etr
= '%.4f' % p
['effective_target_ratio']
188 mgr_util
.format_bytes(p
['logical_used'], 6),
191 mgr_util
.format_bytes(p
['subtree_capacity'], 6),
192 '%.4f' % p
['capacity_ratio'],
199 p
['pg_autoscale_mode'],
201 return 0, table
.get_string(), ''
205 while not self
._shutdown
.is_set():
207 self
._update
_progress
_events
()
208 self
._shutdown
.wait(timeout
=int(self
.sleep_interval
))
211 self
.log
.info('Stopping pg_autoscaler')
214 def get_subtree_resource_status(self
, osdmap
, crush
):
216 For each CRUSH subtree of interest (i.e. the roots under which
217 we have pools), calculate the current resource usages and targets,
218 such as how many PGs there are, vs. how many PGs we would
225 class CrushSubtreeResourceStatus(object):
229 self
.osd_count
= None # Number of OSDs
230 self
.pg_target
= None # Ideal full-capacity PG count?
231 self
.pg_current
= 0 # How many PGs already?
232 self
.capacity
= None # Total capacity of OSDs in subtree
235 self
.total_target_ratio
= 0.0
236 self
.total_target_bytes
= 0 # including replication / EC overhead
238 # identify subtrees (note that they may overlap!)
239 for pool_id
, pool
in osdmap
.get_pools().items():
240 cr_name
= crush
.get_rule_by_id(pool
['crush_rule'])['rule_name']
241 root_id
= int(crush
.get_rule_root(cr_name
))
242 pool_root
[pool_id
] = root_id
243 osds
= set(crush
.get_osds_under(root_id
))
245 # do we intersect an existing root?
247 for prev
in itervalues(result
):
252 s
= CrushSubtreeResourceStatus()
255 s
.root_ids
.append(root_id
)
257 s
.pool_ids
.append(pool_id
)
258 s
.pool_names
.append(pool
['pool_name'])
259 s
.pg_current
+= pool
['pg_num_target'] * pool
['size']
260 target_ratio
= pool
['options'].get('target_size_ratio', 0.0)
262 s
.total_target_ratio
+= target_ratio
264 target_bytes
= pool
['options'].get('target_size_bytes', 0)
266 s
.total_target_bytes
+= target_bytes
* osdmap
.pool_raw_used_rate(pool_id
)
269 all_stats
= self
.get('osd_stats')
271 s
.osd_count
= len(s
.osds
)
272 s
.pg_target
= s
.osd_count
* self
.mon_target_pg_per_osd
275 for osd_stats
in all_stats
['osd_stats']:
276 if osd_stats
['osd'] in s
.osds
:
277 # Intentionally do not apply the OSD's reweight to
278 # this, because we want to calculate PG counts based
279 # on the physical storage available, not how it is
280 # reweighted right now.
281 capacity
+= osd_stats
['kb'] * 1024
283 s
.capacity
= capacity
285 self
.log
.debug('root_ids %s pools %s with %d osds, pg_target %d',
291 return result
, pool_root
293 def _get_pool_status(
299 assert threshold
>= 2.0
301 crush_map
= osdmap
.get_crush()
303 root_map
, pool_root
= self
.get_subtree_resource_status(osdmap
, crush_map
)
306 pool_stats
= dict([(p
['id'], p
['stats']) for p
in df
['pools']])
310 # iterate over all pools to determine how they should be sized
311 for pool_name
, p
in iteritems(pools
):
313 if pool_id
not in pool_stats
:
314 # race with pool deletion; skip
317 # FIXME: we assume there is only one take per pool, but that
319 cr_name
= crush_map
.get_rule_by_id(p
['crush_rule'])['rule_name']
320 root_id
= int(crush_map
.get_rule_root(cr_name
))
321 pool_root
[pool_name
] = root_id
323 capacity
= root_map
[root_id
].capacity
325 self
.log
.debug('skipping empty subtree %s', cr_name
)
328 raw_used_rate
= osdmap
.pool_raw_used_rate(pool_id
)
330 pool_logical_used
= pool_stats
[pool_id
]['stored']
331 bias
= p
['options'].get('pg_autoscale_bias', 1.0)
333 # ratio takes precedence if both are set
334 if p
['options'].get('target_size_ratio', 0.0) == 0.0:
335 target_bytes
= p
['options'].get('target_size_bytes', 0)
337 # What proportion of space are we using?
338 actual_raw_used
= pool_logical_used
* raw_used_rate
339 actual_capacity_ratio
= float(actual_raw_used
) / capacity
341 pool_raw_used
= max(pool_logical_used
, target_bytes
) * raw_used_rate
342 capacity_ratio
= float(pool_raw_used
) / capacity
344 self
.log
.info("effective_target_ratio {0} {1} {2} {3}".format(
345 p
['options'].get('target_size_ratio', 0.0),
346 root_map
[root_id
].total_target_ratio
,
347 root_map
[root_id
].total_target_bytes
,
349 target_ratio
= effective_target_ratio(p
['options'].get('target_size_ratio', 0.0),
350 root_map
[root_id
].total_target_ratio
,
351 root_map
[root_id
].total_target_bytes
,
354 final_ratio
= max(capacity_ratio
, target_ratio
)
356 # So what proportion of pg allowance should we be using?
357 pool_pg_target
= (final_ratio
* root_map
[root_id
].pg_target
) / p
['size'] * bias
359 final_pg_target
= max(p
['options'].get('pg_num_min', PG_NUM_MIN
),
360 nearest_power_of_two(pool_pg_target
))
362 self
.log
.info("Pool '{0}' root_id {1} using {2} of space, bias {3}, "
363 "pg target {4} quantized to {5} (current {6})".format(
374 if (final_pg_target
> p
['pg_num_target'] * threshold
or \
375 final_pg_target
< p
['pg_num_target'] / threshold
) and \
376 final_ratio
>= 0.0 and \
382 'pool_name': p
['pool_name'],
383 'crush_root_id': root_id
,
384 'pg_autoscale_mode': p
['pg_autoscale_mode'],
385 'pg_num_target': p
['pg_num_target'],
386 'logical_used': pool_logical_used
,
387 'target_bytes': target_bytes
,
388 'raw_used_rate': raw_used_rate
,
389 'subtree_capacity': capacity
,
390 'actual_raw_used': actual_raw_used
,
391 'raw_used': pool_raw_used
,
392 'actual_capacity_ratio': actual_capacity_ratio
,
393 'capacity_ratio': capacity_ratio
,
394 'target_ratio': p
['options'].get('target_size_ratio', 0.0),
395 'effective_target_ratio': target_ratio
,
396 'pg_num_ideal': int(pool_pg_target
),
397 'pg_num_final': final_pg_target
,
398 'would_adjust': adjust
,
399 'bias': p
.get('options', {}).get('pg_autoscale_bias', 1.0),
402 return (ret
, root_map
, pool_root
)
def _update_progress_events(self):
    """Reconcile in-flight pg_num progress events against the osdmap."""
    osdmap = self.get_osdmap()
    pools = osdmap.get_pools()
    # Iterate over a snapshot of the keys since we may delete entries.
    for pool_id in list(self._event):
        ev = self._event[pool_id]
        pool_data = pools.get(pool_id)
        if pool_data is None or pool_data['pg_num'] == pool_data['pg_num_target']:
            # pool is gone or we've reached our target
            self.remote('progress', 'complete', ev.ev_id)
            del self._event[pool_id]
            continue
        # Report the fraction of the pg_num distance already covered.
        ev.update(self, (ev.pg_num - pool_data['pg_num']) /
                  (ev.pg_num - ev.pg_num_target))
417 def _maybe_adjust(self
):
418 self
.log
.info('_maybe_adjust')
419 osdmap
= self
.get_osdmap()
420 if osdmap
.get_require_osd_release() < 'nautilus':
422 pools
= osdmap
.get_pools_by_name()
423 ps
, root_map
, pool_root
= self
._get
_pool
_status
(osdmap
, pools
)
425 # Anyone in 'warn', set the health message for them and then
426 # drop them from consideration.
432 total_bytes
= dict([(r
, 0) for r
in iter(root_map
)])
433 total_target_bytes
= dict([(r
, 0.0) for r
in iter(root_map
)])
434 target_bytes_pools
= dict([(r
, []) for r
in iter(root_map
)])
437 pool_id
= p
['pool_id']
438 pool_opts
= pools
[p
['pool_name']]['options']
439 if pool_opts
.get('target_size_ratio', 0) > 0 and pool_opts
.get('target_size_bytes', 0) > 0:
440 bytes_and_ratio
.append('Pool %s has target_size_bytes and target_size_ratio set' % p
['pool_name'])
441 total_bytes
[p
['crush_root_id']] += max(
442 p
['actual_raw_used'],
443 p
['target_bytes'] * p
['raw_used_rate'])
444 if p
['target_bytes'] > 0:
445 total_target_bytes
[p
['crush_root_id']] += p
['target_bytes'] * p
['raw_used_rate']
446 target_bytes_pools
[p
['crush_root_id']].append(p
['pool_name'])
447 if not p
['would_adjust']:
449 if p
['pg_autoscale_mode'] == 'warn':
450 msg
= 'Pool %s has %d placement groups, should have %d' % (
454 if p
['pg_num_final'] > p
['pg_num_target']:
459 if p
['pg_autoscale_mode'] == 'on':
460 # Note that setting pg_num actually sets pg_num_target (see
462 r
= self
.mon_command({
463 'prefix': 'osd pool set',
464 'pool': p
['pool_name'],
466 'val': str(p
['pg_num_final'])
469 # create new event or update existing one to reflect
470 # progress from current state to the new pg_num_target
471 pool_data
= pools
[p
['pool_name']]
472 pg_num
= pool_data
['pg_num']
473 new_target
= p
['pg_num_final']
474 if pool_id
in self
._event
:
475 self
._event
[pool_id
].reset(pg_num
, new_target
)
477 self
._event
[pool_id
] = PgAdjustmentProgress(pool_id
, pg_num
, new_target
)
478 self
._event
[pool_id
].update(self
, 0.0)
481 # FIXME: this is a serious and unexpected thing,
482 # we should expose it as a cluster log error once
483 # the hook for doing that from ceph-mgr modules is
485 self
.log
.error("pg_num adjustment on {0} to {1} failed: {2}"
486 .format(p
['pool_name'],
487 p
['pg_num_final'], r
))
490 summary
= "{0} pools have too few placement groups".format(
492 health_checks
['POOL_TOO_FEW_PGS'] = {
493 'severity': 'warning',
495 'count': len(too_few
),
499 summary
= "{0} pools have too many placement groups".format(
501 health_checks
['POOL_TOO_MANY_PGS'] = {
502 'severity': 'warning',
504 'count': len(too_many
),
508 too_much_target_bytes
= []
509 for root_id
, total
in iteritems(total_bytes
):
510 total_target
= total_target_bytes
[root_id
]
511 if total_target
> 0 and total
> root_map
[root_id
].capacity
and root_map
[root_id
].capacity
:
512 too_much_target_bytes
.append(
513 'Pools %s overcommit available storage by %.03fx due to '
514 'target_size_bytes %s on pools %s' % (
515 root_map
[root_id
].pool_names
,
516 total
/ root_map
[root_id
].capacity
,
517 mgr_util
.format_bytes(total_target
, 5, colored
=False),
518 target_bytes_pools
[root_id
]
521 elif total_target
> root_map
[root_id
].capacity
and root_map
[root_id
].capacity
:
522 too_much_target_bytes
.append(
523 'Pools %s overcommit available storage by %.03fx due to '
524 'collective target_size_bytes of %s' % (
525 root_map
[root_id
].pool_names
,
526 total
/ root_map
[root_id
].capacity
,
527 mgr_util
.format_bytes(total_target
, 5, colored
=False),
530 if too_much_target_bytes
:
531 health_checks
['POOL_TARGET_SIZE_BYTES_OVERCOMMITTED'] = {
532 'severity': 'warning',
533 'summary': "%d subtrees have overcommitted pool target_size_bytes" % len(too_much_target_bytes
),
534 'count': len(too_much_target_bytes
),
535 'detail': too_much_target_bytes
,
539 health_checks
['POOL_HAS_TARGET_SIZE_BYTES_AND_RATIO'] = {
540 'severity': 'warning',
541 'summary': "%d pools have both target_size_bytes and target_size_ratio set" % len(bytes_and_ratio
),
542 'count': len(bytes_and_ratio
),
543 'detail': bytes_and_ratio
,
546 self
.set_health_checks(health_checks
)