# ceph/src/pybind/mgr/pg_autoscaler/module.py (15.2.0 Octopus)
1"""
2Automatically scale pg_num based on how much data is stored in each pool.
3"""
4
11fdf7f2
TL
5import json
6import mgr_util
7import threading
8import uuid
81eedcae 9from six import itervalues, iteritems
9f95a23c 10from prettytable import PrettyTable
11fdf7f2
TL
11from mgr_module import MgrModule
12
13"""
14Some terminology is made up for the purposes of this module:
15
16 - "raw pgs": pg count after applying replication, i.e. the real resource
17 consumption of a pool.
18 - "grow/shrink" - increase/decrease the pg_num in a pool
19 - "crush subtree" - non-overlapping domains in crush hierarchy: used as
20 units of resource management.
21"""
22
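# Illustrative example of "raw pgs" (hypothetical numbers): a pool with
# pg_num_target 32 and size (replica count) 3 accounts for 32 * 3 = 96
# raw pgs in its crush subtree.
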
INTERVAL = 5

PG_NUM_MIN = 32  # unless specified on a per-pool basis

def nearest_power_of_two(n):
    v = int(n)

    v -= 1
    v |= v >> 1
    v |= v >> 2
    v |= v >> 4
    v |= v >> 8
    v |= v >> 16

    # High bound power of two
    v += 1

    # Low bound power of two
    x = v >> 1

    return x if (v - n) > (n - x) else v

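# Rough behaviour sketch for nearest_power_of_two() (illustrative values):
#   nearest_power_of_two(200) -> 256   (256 - 200 = 56 < 200 - 128 = 72)
#   nearest_power_of_two(90)  -> 64    (128 - 90 = 38 > 90 - 64 = 26)
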
def effective_target_ratio(target_ratio, total_target_ratio, total_target_bytes, capacity):
    """
    Returns the target ratio after normalizing for ratios across pools and
    adjusting for capacity reserved by pools that have target_size_bytes set.
    """
    target_ratio = float(target_ratio)
    if total_target_ratio:
        target_ratio = target_ratio / total_target_ratio

    if total_target_bytes and capacity:
        fraction_available = 1.0 - min(1.0, float(total_target_bytes) / capacity)
        target_ratio *= fraction_available

    return target_ratio


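# Worked example for effective_target_ratio() (hypothetical numbers): a pool
# with target_size_ratio=0.2 in a subtree whose ratios sum to 0.5, where other
# pools' target_size_bytes reserve 25% of capacity, ends up with
#   (0.2 / 0.5) * (1.0 - 0.25) = 0.3
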
class PgAdjustmentProgress(object):
    """
    Keeps the initial and target pg_num values
    """
    def __init__(self, pool_id, pg_num, pg_num_target):
        self.ev_id = str(uuid.uuid4())
        self.pool_id = pool_id
        self.reset(pg_num, pg_num_target)

    def reset(self, pg_num, pg_num_target):
        self.pg_num = pg_num
        self.pg_num_target = pg_num_target

    def update(self, module, progress):
        desc = 'increasing' if self.pg_num < self.pg_num_target else 'decreasing'
        module.remote('progress', 'update', self.ev_id,
                      ev_msg="PG autoscaler %s pool %d PGs from %d to %d" %
                      (desc, self.pool_id, self.pg_num, self.pg_num_target),
                      ev_progress=progress,
                      refs=[("pool", self.pool_id)])

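# PgAdjustmentProgress.update() emits mgr 'progress' events with messages such
# as "PG autoscaler increasing pool 2 PGs from 32 to 128" (pool id and pg_num
# values here are hypothetical).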

class PgAutoscaler(MgrModule):
    """
    PG autoscaler.
    """
    COMMANDS = [
        {
            "cmd": "osd pool autoscale-status",
            "desc": "report on pool pg_num sizing recommendation and intent",
            "perm": "r"
        },
    ]

    NATIVE_OPTIONS = [
        'mon_target_pg_per_osd',
        'mon_max_pg_per_osd',
    ]

    MODULE_OPTIONS = [
        {
            'name': 'sleep_interval',
            'default': str(60),
        },
    ]

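    # sleep_interval is a module option, so it can typically be changed at
    # runtime with something like (illustrative invocation):
    #   ceph config set mgr mgr/pg_autoscaler/sleep_interval 60
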
    def __init__(self, *args, **kwargs):
        super(PgAutoscaler, self).__init__(*args, **kwargs)
        self._shutdown = threading.Event()
        self._event = {}

        # So much of what we do peeks at the osdmap that it's easiest
        # to just keep a copy of the pythonized version.
        self._osd_map = None

    def config_notify(self):
        for opt in self.NATIVE_OPTIONS:
            setattr(self,
                    opt,
                    self.get_ceph_option(opt))
            self.log.debug(' native option %s = %s', opt, getattr(self, opt))
        for opt in self.MODULE_OPTIONS:
            setattr(self,
                    opt['name'],
                    self.get_module_option(opt['name']))
            self.log.debug(' mgr option %s = %s',
                           opt['name'], getattr(self, opt['name']))

    def handle_command(self, inbuf, cmd):
        if cmd['prefix'] == "osd pool autoscale-status":
            retval = self._command_autoscale_status(cmd)
        else:
            assert False  # ceph-mgr should never pass us unknown cmds
        return retval

    def _command_autoscale_status(self, cmd):
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools_by_name()
        ps, root_map, pool_root = self._get_pool_status(osdmap, pools)

        if cmd.get('format') == 'json' or cmd.get('format') == 'json-pretty':
            return 0, json.dumps(ps, indent=4, sort_keys=True), ''
        else:
            table = PrettyTable(['POOL', 'SIZE', 'TARGET SIZE',
                                 'RATE', 'RAW CAPACITY',
                                 'RATIO', 'TARGET RATIO',
                                 'EFFECTIVE RATIO',
                                 'BIAS',
                                 'PG_NUM',
#                                'IDEAL',
                                 'NEW PG_NUM', 'AUTOSCALE'],
                                border=False)
            table.left_padding_width = 0
            table.right_padding_width = 2
            table.align['POOL'] = 'l'
            table.align['SIZE'] = 'r'
            table.align['TARGET SIZE'] = 'r'
            table.align['RATE'] = 'r'
            table.align['RAW CAPACITY'] = 'r'
            table.align['RATIO'] = 'r'
            table.align['TARGET RATIO'] = 'r'
            table.align['EFFECTIVE RATIO'] = 'r'
            table.align['BIAS'] = 'r'
            table.align['PG_NUM'] = 'r'
#           table.align['IDEAL'] = 'r'
            table.align['NEW PG_NUM'] = 'r'
            table.align['AUTOSCALE'] = 'l'
            for p in ps:
                if p['would_adjust']:
                    final = str(p['pg_num_final'])
                else:
                    final = ''
                if p['target_bytes'] > 0:
                    ts = mgr_util.format_bytes(p['target_bytes'], 6)
                else:
                    ts = ''
                if p['target_ratio'] > 0.0:
                    tr = '%.4f' % p['target_ratio']
                else:
                    tr = ''
                if p['effective_target_ratio'] > 0.0:
                    etr = '%.4f' % p['effective_target_ratio']
                else:
                    etr = ''
                table.add_row([
                    p['pool_name'],
                    mgr_util.format_bytes(p['logical_used'], 6),
                    ts,
                    p['raw_used_rate'],
                    mgr_util.format_bytes(p['subtree_capacity'], 6),
                    '%.4f' % p['capacity_ratio'],
                    tr,
                    etr,
                    p['bias'],
                    p['pg_num_target'],
#                   p['pg_num_ideal'],
                    final,
                    p['pg_autoscale_mode'],
                ])
            return 0, table.get_string(), ''

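    # This handler backs the CLI command registered in COMMANDS above, e.g.
    # "ceph osd pool autoscale-status" for the table, or with "--format json"
    # for the raw per-pool records (illustrative invocations).
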
    def serve(self):
        self.config_notify()
        while not self._shutdown.is_set():
            self._maybe_adjust()
            self._update_progress_events()
            self._shutdown.wait(timeout=int(self.sleep_interval))

    def shutdown(self):
        self.log.info('Stopping pg_autoscaler')
        self._shutdown.set()

    def get_subtree_resource_status(self, osdmap, crush):
        """
        For each CRUSH subtree of interest (i.e. the roots under which
        we have pools), calculate the current resource usages and targets,
        such as how many PGs there are, vs. how many PGs we would
        like there to be.
        """
        result = {}
        pool_root = {}
        roots = []

        class CrushSubtreeResourceStatus(object):
            def __init__(self):
                self.root_ids = []
                self.osds = set()
                self.osd_count = None  # Number of OSDs
                self.pg_target = None  # Ideal full-capacity PG count?
                self.pg_current = 0  # How many PGs already?
                self.capacity = None  # Total capacity of OSDs in subtree
                self.pool_ids = []
                self.pool_names = []
                self.total_target_ratio = 0.0
                self.total_target_bytes = 0  # including replication / EC overhead

        # identify subtrees (note that they may overlap!)
        for pool_id, pool in osdmap.get_pools().items():
            cr_name = crush.get_rule_by_id(pool['crush_rule'])['rule_name']
            root_id = int(crush.get_rule_root(cr_name))
            pool_root[pool_id] = root_id
            osds = set(crush.get_osds_under(root_id))

            # do we intersect an existing root?
            s = None
            for prev in itervalues(result):
                if osds & prev.osds:
                    s = prev
                    break
            if not s:
                s = CrushSubtreeResourceStatus()
                roots.append(s)
            result[root_id] = s
            s.root_ids.append(root_id)
            s.osds |= osds
            s.pool_ids.append(pool_id)
            s.pool_names.append(pool['pool_name'])
            s.pg_current += pool['pg_num_target'] * pool['size']
            target_ratio = pool['options'].get('target_size_ratio', 0.0)
            if target_ratio:
                s.total_target_ratio += target_ratio
            else:
                target_bytes = pool['options'].get('target_size_bytes', 0)
                if target_bytes:
                    s.total_target_bytes += target_bytes * osdmap.pool_raw_used_rate(pool_id)

        # finish subtrees
        all_stats = self.get('osd_stats')
        for s in roots:
            s.osd_count = len(s.osds)
            s.pg_target = s.osd_count * self.mon_target_pg_per_osd

            capacity = 0.0
            for osd_stats in all_stats['osd_stats']:
                if osd_stats['osd'] in s.osds:
                    # Intentionally do not apply the OSD's reweight to
                    # this, because we want to calculate PG counts based
                    # on the physical storage available, not how it is
                    # reweighted right now.
                    capacity += osd_stats['kb'] * 1024

            s.capacity = capacity

            self.log.debug('root_ids %s pools %s with %d osds, pg_target %d',
                           s.root_ids,
                           s.pool_ids,
                           s.osd_count,
                           s.pg_target)

        return result, pool_root

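    # Illustrative example (hypothetical cluster): a subtree containing 10 OSDs
    # with mon_target_pg_per_osd = 100 gets pg_target = 10 * 100 = 1000 raw PGs
    # to spread across its pools; capacity is the sum of the raw 'kb' stats of
    # those OSDs, deliberately ignoring reweight.
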
    def _get_pool_status(
            self,
            osdmap,
            pools,
            threshold=3.0,
    ):
        assert threshold >= 2.0

        crush_map = osdmap.get_crush()

        root_map, pool_root = self.get_subtree_resource_status(osdmap, crush_map)

        df = self.get('df')
        pool_stats = dict([(p['id'], p['stats']) for p in df['pools']])

        ret = []

        # iterate over all pools to determine how they should be sized
        for pool_name, p in iteritems(pools):
            pool_id = p['pool']
            if pool_id not in pool_stats:
                # race with pool deletion; skip
                continue

            # FIXME: we assume there is only one take per pool, but that
            # may not be true.
            cr_name = crush_map.get_rule_by_id(p['crush_rule'])['rule_name']
            root_id = int(crush_map.get_rule_root(cr_name))
            pool_root[pool_name] = root_id

            capacity = root_map[root_id].capacity
            if capacity == 0:
                self.log.debug('skipping empty subtree %s', cr_name)
                continue

            raw_used_rate = osdmap.pool_raw_used_rate(pool_id)

            pool_logical_used = pool_stats[pool_id]['stored']
            bias = p['options'].get('pg_autoscale_bias', 1.0)
            target_bytes = 0
            # ratio takes precedence if both are set
            if p['options'].get('target_size_ratio', 0.0) == 0.0:
                target_bytes = p['options'].get('target_size_bytes', 0)

            # What proportion of space are we using?
            actual_raw_used = pool_logical_used * raw_used_rate
            actual_capacity_ratio = float(actual_raw_used) / capacity

            pool_raw_used = max(pool_logical_used, target_bytes) * raw_used_rate
            capacity_ratio = float(pool_raw_used) / capacity

            self.log.info("effective_target_ratio {0} {1} {2} {3}".format(
                p['options'].get('target_size_ratio', 0.0),
                root_map[root_id].total_target_ratio,
                root_map[root_id].total_target_bytes,
                capacity))
            target_ratio = effective_target_ratio(p['options'].get('target_size_ratio', 0.0),
                                                  root_map[root_id].total_target_ratio,
                                                  root_map[root_id].total_target_bytes,
                                                  capacity)

            final_ratio = max(capacity_ratio, target_ratio)

            # So what proportion of pg allowance should we be using?
            pool_pg_target = (final_ratio * root_map[root_id].pg_target) / p['size'] * bias

            final_pg_target = max(p['options'].get('pg_num_min', PG_NUM_MIN),
                                  nearest_power_of_two(pool_pg_target))

            self.log.info("Pool '{0}' root_id {1} using {2} of space, bias {3}, "
                          "pg target {4} quantized to {5} (current {6})".format(
                              p['pool_name'],
                              root_id,
                              final_ratio,
                              bias,
                              pool_pg_target,
                              final_pg_target,
                              p['pg_num_target']
                          ))

            adjust = False
            if (final_pg_target > p['pg_num_target'] * threshold or
                    final_pg_target < p['pg_num_target'] / threshold) and \
                    final_ratio >= 0.0 and \
                    final_ratio <= 1.0:
                adjust = True

            ret.append({
                'pool_id': pool_id,
                'pool_name': p['pool_name'],
                'crush_root_id': root_id,
                'pg_autoscale_mode': p['pg_autoscale_mode'],
                'pg_num_target': p['pg_num_target'],
                'logical_used': pool_logical_used,
                'target_bytes': target_bytes,
                'raw_used_rate': raw_used_rate,
                'subtree_capacity': capacity,
                'actual_raw_used': actual_raw_used,
                'raw_used': pool_raw_used,
                'actual_capacity_ratio': actual_capacity_ratio,
                'capacity_ratio': capacity_ratio,
                'target_ratio': p['options'].get('target_size_ratio', 0.0),
                'effective_target_ratio': target_ratio,
                'pg_num_ideal': int(pool_pg_target),
                'pg_num_final': final_pg_target,
                'would_adjust': adjust,
                'bias': p.get('options', {}).get('pg_autoscale_bias', 1.0),
            })

        return (ret, root_map, pool_root)

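    # Worked example of the sizing math in _get_pool_status() (hypothetical
    # pool): final_ratio=0.25, subtree pg_target=1000, size=3, bias=1.0 gives
    # pool_pg_target = 0.25 * 1000 / 3 ~= 83.3, quantized to 64 by
    # nearest_power_of_two() and floored at pg_num_min; would_adjust is only
    # set if that result differs from the current pg_num_target by more than
    # the 3x threshold.
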
    def _update_progress_events(self):
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools()
        for pool_id in list(self._event):
            ev = self._event[pool_id]
            pool_data = pools.get(pool_id)
            if pool_data is None or pool_data['pg_num'] == pool_data['pg_num_target']:
                # pool is gone or we've reached our target
                self.remote('progress', 'complete', ev.ev_id)
                del self._event[pool_id]
                continue
            ev.update(self, (ev.pg_num - pool_data['pg_num']) / (ev.pg_num - ev.pg_num_target))

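    # Progress fraction example (hypothetical values): an event created at
    # pg_num=32 with target 128 reports (32 - 80) / (32 - 128) = 0.5 once the
    # pool's pg_num reaches 80.
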
    def _maybe_adjust(self):
        self.log.info('_maybe_adjust')
        osdmap = self.get_osdmap()
        if osdmap.get_require_osd_release() < 'nautilus':
            return
        pools = osdmap.get_pools_by_name()
        ps, root_map, pool_root = self._get_pool_status(osdmap, pools)

        # Anyone in 'warn', set the health message for them and then
        # drop them from consideration.
        too_few = []
        too_many = []
        bytes_and_ratio = []
        health_checks = {}

        total_bytes = dict([(r, 0) for r in iter(root_map)])
        total_target_bytes = dict([(r, 0.0) for r in iter(root_map)])
        target_bytes_pools = dict([(r, []) for r in iter(root_map)])

        for p in ps:
            pool_id = p['pool_id']
            pool_opts = pools[p['pool_name']]['options']
            if pool_opts.get('target_size_ratio', 0) > 0 and pool_opts.get('target_size_bytes', 0) > 0:
                bytes_and_ratio.append('Pool %s has target_size_bytes and target_size_ratio set' % p['pool_name'])
            total_bytes[p['crush_root_id']] += max(
                p['actual_raw_used'],
                p['target_bytes'] * p['raw_used_rate'])
            if p['target_bytes'] > 0:
                total_target_bytes[p['crush_root_id']] += p['target_bytes'] * p['raw_used_rate']
                target_bytes_pools[p['crush_root_id']].append(p['pool_name'])
            if not p['would_adjust']:
                continue
            if p['pg_autoscale_mode'] == 'warn':
                msg = 'Pool %s has %d placement groups, should have %d' % (
                    p['pool_name'],
                    p['pg_num_target'],
                    p['pg_num_final'])
                if p['pg_num_final'] > p['pg_num_target']:
                    too_few.append(msg)
                else:
                    too_many.append(msg)

            if p['pg_autoscale_mode'] == 'on':
                # Note that setting pg_num actually sets pg_num_target (see
                # OSDMonitor.cc)
                r = self.mon_command({
                    'prefix': 'osd pool set',
                    'pool': p['pool_name'],
                    'var': 'pg_num',
                    'val': str(p['pg_num_final'])
                })

                # create new event or update existing one to reflect
                # progress from current state to the new pg_num_target
                pool_data = pools[p['pool_name']]
                pg_num = pool_data['pg_num']
                new_target = p['pg_num_final']
                if pool_id in self._event:
                    self._event[pool_id].reset(pg_num, new_target)
                else:
                    self._event[pool_id] = PgAdjustmentProgress(pool_id, pg_num, new_target)
                self._event[pool_id].update(self, 0.0)

                if r[0] != 0:
                    # FIXME: this is a serious and unexpected thing,
                    # we should expose it as a cluster log error once
                    # the hook for doing that from ceph-mgr modules is
                    # in.
                    self.log.error("pg_num adjustment on {0} to {1} failed: {2}"
                                   .format(p['pool_name'],
                                           p['pg_num_final'], r))

        if too_few:
            summary = "{0} pools have too few placement groups".format(
                len(too_few))
            health_checks['POOL_TOO_FEW_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'count': len(too_few),
                'detail': too_few
            }
        if too_many:
            summary = "{0} pools have too many placement groups".format(
                len(too_many))
            health_checks['POOL_TOO_MANY_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'count': len(too_many),
                'detail': too_many
            }

        too_much_target_bytes = []
        for root_id, total in iteritems(total_bytes):
            total_target = total_target_bytes[root_id]
            if total_target > 0 and total > root_map[root_id].capacity and root_map[root_id].capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'target_size_bytes %s on pools %s' % (
                        root_map[root_id].pool_names,
                        total / root_map[root_id].capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                        target_bytes_pools[root_id]
                    )
                )
            elif total_target > root_map[root_id].capacity and root_map[root_id].capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'collective target_size_bytes of %s' % (
                        root_map[root_id].pool_names,
                        total / root_map[root_id].capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                    )
                )
        if too_much_target_bytes:
            health_checks['POOL_TARGET_SIZE_BYTES_OVERCOMMITTED'] = {
                'severity': 'warning',
                'summary': "%d subtrees have overcommitted pool target_size_bytes" % len(too_much_target_bytes),
                'count': len(too_much_target_bytes),
                'detail': too_much_target_bytes,
            }

        if bytes_and_ratio:
            health_checks['POOL_HAS_TARGET_SIZE_BYTES_AND_RATIO'] = {
                'severity': 'warning',
                'summary': "%d pools have both target_size_bytes and target_size_ratio set" % len(bytes_and_ratio),
                'count': len(bytes_and_ratio),
                'detail': bytes_and_ratio,
            }

        self.set_health_checks(health_checks)