# ceph/src/pybind/mgr/pg_autoscaler/module.py (Ceph 15.2.0 "Octopus")
"""
Automatically scale pg_num based on how much data is stored in each pool.
"""

import json
import mgr_util
import threading
import uuid
from six import itervalues, iteritems
from prettytable import PrettyTable
from mgr_module import MgrModule

"""
Some terminology is made up for the purposes of this module:

 - "raw pgs": pg count after applying replication, i.e. the real resource
   consumption of a pool.
 - "grow/shrink" - increase/decrease the pg_num in a pool
 - "crush subtree" - non-overlapping domains in crush hierarchy: used as
   units of resource management.
"""

INTERVAL = 5

PG_NUM_MIN = 32  # unless specified on a per-pool basis

def nearest_power_of_two(n):
    v = int(n)

    v -= 1
    v |= v >> 1
    v |= v >> 2
    v |= v >> 4
    v |= v >> 8
    v |= v >> 16

    # High bound power of two
    v += 1
    # Low bound power of two
    x = v >> 1

    return x if (v - n) > (n - x) else v

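# For example: nearest_power_of_two(200) == 256 (200 is past the midpoint
# between 128 and 256), nearest_power_of_two(160) == 128, and exact powers of
# two such as 64 are returned unchanged.
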
def effective_target_ratio(target_ratio, total_target_ratio, total_target_bytes, capacity):
    """
    Returns the target ratio after normalizing for ratios across pools and
    adjusting for capacity reserved by pools that have target_size_bytes set.
    """
    target_ratio = float(target_ratio)
    if total_target_ratio:
        target_ratio = target_ratio / total_target_ratio

    if total_target_bytes and capacity:
        fraction_available = 1.0 - min(1.0, float(total_target_bytes) / capacity)
        target_ratio *= fraction_available

    return target_ratio

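# Worked example (hypothetical numbers): with two pools whose target_size_ratio
# values are 1.0 and 3.0, total_target_ratio is 4.0, so the first pool's
# normalized ratio is 1.0 / 4.0 = 0.25.  If other pools additionally reserve
# 20% of the raw capacity via target_size_bytes, fraction_available is 0.8 and
# the effective ratio becomes 0.25 * 0.8 = 0.2.
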

class PgAdjustmentProgress(object):
    """
    Keeps the initial and target pg_num values
    """
    def __init__(self, pool_id, pg_num, pg_num_target):
        self.ev_id = str(uuid.uuid4())
        self.pool_id = pool_id
        self.reset(pg_num, pg_num_target)

    def reset(self, pg_num, pg_num_target):
        self.pg_num = pg_num
        self.pg_num_target = pg_num_target

    def update(self, module, progress):
        desc = 'increasing' if self.pg_num < self.pg_num_target else 'decreasing'
        module.remote('progress', 'update', self.ev_id,
                      ev_msg="PG autoscaler %s pool %d PGs from %d to %d" %
                      (desc, self.pool_id, self.pg_num, self.pg_num_target),
                      ev_progress=progress,
                      refs=[("pool", self.pool_id)])


class PgAutoscaler(MgrModule):
    """
    PG autoscaler.
    """
    COMMANDS = [
        {
            "cmd": "osd pool autoscale-status",
            "desc": "report on pool pg_num sizing recommendation and intent",
            "perm": "r"
        },
    ]
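
    # This command is reached from the CLI as, e.g.,
    #   ceph osd pool autoscale-status
    # and, like other mgr commands, the optional --format json / json-pretty
    # output is handled in _command_autoscale_status() below.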

    NATIVE_OPTIONS = [
        'mon_target_pg_per_osd',
        'mon_max_pg_per_osd',
    ]

    MODULE_OPTIONS = [
        {
            'name': 'sleep_interval',
            'default': str(60),
        },
    ]

    def __init__(self, *args, **kwargs):
        super(PgAutoscaler, self).__init__(*args, **kwargs)
        self._shutdown = threading.Event()
        self._event = {}

        # So much of what we do peeks at the osdmap that it's easiest
        # to just keep a copy of the pythonized version.
        self._osd_map = None

    def config_notify(self):
        for opt in self.NATIVE_OPTIONS:
            setattr(self,
                    opt,
                    self.get_ceph_option(opt))
            self.log.debug(' native option %s = %s', opt, getattr(self, opt))
        for opt in self.MODULE_OPTIONS:
            setattr(self,
                    opt['name'],
                    self.get_module_option(opt['name']))
            self.log.debug(' mgr option %s = %s',
                           opt['name'], getattr(self, opt['name']))


    def handle_command(self, inbuf, cmd):
        if cmd['prefix'] == "osd pool autoscale-status":
            retval = self._command_autoscale_status(cmd)
        else:
            assert False  # ceph-mgr should never pass us unknown cmds
        return retval

    def _command_autoscale_status(self, cmd):
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools_by_name()
        ps, root_map, pool_root = self._get_pool_status(osdmap, pools)

        if cmd.get('format') == 'json' or cmd.get('format') == 'json-pretty':
            return 0, json.dumps(ps, indent=4, sort_keys=True), ''
        else:
            table = PrettyTable(['POOL', 'SIZE', 'TARGET SIZE',
                                 'RATE', 'RAW CAPACITY',
                                 'RATIO', 'TARGET RATIO',
                                 'EFFECTIVE RATIO',
                                 'BIAS',
                                 'PG_NUM',
                                 # 'IDEAL',
                                 'NEW PG_NUM', 'AUTOSCALE'],
                                border=False)
            table.left_padding_width = 0
            table.right_padding_width = 2
            table.align['POOL'] = 'l'
            table.align['SIZE'] = 'r'
            table.align['TARGET SIZE'] = 'r'
            table.align['RATE'] = 'r'
            table.align['RAW CAPACITY'] = 'r'
            table.align['RATIO'] = 'r'
            table.align['TARGET RATIO'] = 'r'
            table.align['EFFECTIVE RATIO'] = 'r'
            table.align['BIAS'] = 'r'
            table.align['PG_NUM'] = 'r'
            # table.align['IDEAL'] = 'r'
            table.align['NEW PG_NUM'] = 'r'
            table.align['AUTOSCALE'] = 'l'
            for p in ps:
                if p['would_adjust']:
                    final = str(p['pg_num_final'])
                else:
                    final = ''
                if p['target_bytes'] > 0:
                    ts = mgr_util.format_bytes(p['target_bytes'], 6)
                else:
                    ts = ''
                if p['target_ratio'] > 0.0:
                    tr = '%.4f' % p['target_ratio']
                else:
                    tr = ''
                if p['effective_target_ratio'] > 0.0:
                    etr = '%.4f' % p['effective_target_ratio']
                else:
                    etr = ''
                table.add_row([
                    p['pool_name'],
                    mgr_util.format_bytes(p['logical_used'], 6),
                    ts,
                    p['raw_used_rate'],
                    mgr_util.format_bytes(p['subtree_capacity'], 6),
                    '%.4f' % p['capacity_ratio'],
                    tr,
                    etr,
                    p['bias'],
                    p['pg_num_target'],
                    # p['pg_num_ideal'],
                    final,
                    p['pg_autoscale_mode'],
                ])
            return 0, table.get_string(), ''

    def serve(self):
        self.config_notify()
        while not self._shutdown.is_set():
            self._maybe_adjust()
            self._update_progress_events()
            self._shutdown.wait(timeout=int(self.sleep_interval))

    def shutdown(self):
        self.log.info('Stopping pg_autoscaler')
        self._shutdown.set()

    def get_subtree_resource_status(self, osdmap, crush):
        """
        For each CRUSH subtree of interest (i.e. the roots under which
        we have pools), calculate the current resource usages and targets,
        such as how many PGs there are, vs. how many PGs we would
        like there to be.
        """
        result = {}
        pool_root = {}
        roots = []

        class CrushSubtreeResourceStatus(object):
            def __init__(self):
                self.root_ids = []
                self.osds = set()
                self.osd_count = None  # Number of OSDs
                self.pg_target = None  # Ideal full-capacity PG count?
                self.pg_current = 0  # How many PGs already?
                self.capacity = None  # Total capacity of OSDs in subtree
                self.pool_ids = []
                self.pool_names = []
                self.total_target_ratio = 0.0
                self.total_target_bytes = 0  # including replication / EC overhead

        # identify subtrees (note that they may overlap!)
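        # For example (hypothetical layout): if pool A's rule takes root
        # 'default' (osd.0-9) and pool B's rule takes root 'ssd' (osd.4-7),
        # the two OSD sets intersect, so both pools are accounted against a
        # single shared CrushSubtreeResourceStatus, reachable in 'result'
        # under either root id.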
        for pool_id, pool in osdmap.get_pools().items():
            cr_name = crush.get_rule_by_id(pool['crush_rule'])['rule_name']
            root_id = int(crush.get_rule_root(cr_name))
            pool_root[pool_id] = root_id
            osds = set(crush.get_osds_under(root_id))

            # do we intersect an existing root?
            s = None
            for prev in itervalues(result):
                if osds & prev.osds:
                    s = prev
                    break
            if not s:
                s = CrushSubtreeResourceStatus()
                roots.append(s)
            result[root_id] = s
            s.root_ids.append(root_id)
            s.osds |= osds
            s.pool_ids.append(pool_id)
            s.pool_names.append(pool['pool_name'])
            s.pg_current += pool['pg_num_target'] * pool['size']
            target_ratio = pool['options'].get('target_size_ratio', 0.0)
            if target_ratio:
                s.total_target_ratio += target_ratio
            else:
                target_bytes = pool['options'].get('target_size_bytes', 0)
                if target_bytes:
                    s.total_target_bytes += target_bytes * osdmap.pool_raw_used_rate(pool_id)

        # finish subtrees
        all_stats = self.get('osd_stats')
        for s in roots:
            s.osd_count = len(s.osds)
            s.pg_target = s.osd_count * self.mon_target_pg_per_osd

            capacity = 0.0
            for osd_stats in all_stats['osd_stats']:
                if osd_stats['osd'] in s.osds:
                    # Intentionally do not apply the OSD's reweight to
                    # this, because we want to calculate PG counts based
                    # on the physical storage available, not how it is
                    # reweighted right now.
                    capacity += osd_stats['kb'] * 1024

            s.capacity = capacity

            self.log.debug('root_ids %s pools %s with %d osds, pg_target %d',
                           s.root_ids,
                           s.pool_ids,
                           s.osd_count,
                           s.pg_target)

        return result, pool_root

    def _get_pool_status(
            self,
            osdmap,
            pools,
            threshold=3.0,
    ):
        assert threshold >= 2.0

        crush_map = osdmap.get_crush()

        root_map, pool_root = self.get_subtree_resource_status(osdmap, crush_map)

        df = self.get('df')
        pool_stats = dict([(p['id'], p['stats']) for p in df['pools']])

        ret = []

        # iterate over all pools to determine how they should be sized
        for pool_name, p in iteritems(pools):
            pool_id = p['pool']
            if pool_id not in pool_stats:
                # race with pool deletion; skip
                continue

            # FIXME: we assume there is only one take per pool, but that
            # may not be true.
            cr_name = crush_map.get_rule_by_id(p['crush_rule'])['rule_name']
            root_id = int(crush_map.get_rule_root(cr_name))
            pool_root[pool_name] = root_id

            capacity = root_map[root_id].capacity
            if capacity == 0:
                self.log.debug('skipping empty subtree %s', cr_name)
                continue

            raw_used_rate = osdmap.pool_raw_used_rate(pool_id)

            pool_logical_used = pool_stats[pool_id]['stored']
            bias = p['options'].get('pg_autoscale_bias', 1.0)
            target_bytes = 0
            # ratio takes precedence if both are set
            if p['options'].get('target_size_ratio', 0.0) == 0.0:
                target_bytes = p['options'].get('target_size_bytes', 0)

            # What proportion of space are we using?
            actual_raw_used = pool_logical_used * raw_used_rate
            actual_capacity_ratio = float(actual_raw_used) / capacity

            pool_raw_used = max(pool_logical_used, target_bytes) * raw_used_rate
            capacity_ratio = float(pool_raw_used) / capacity

            self.log.info("effective_target_ratio {0} {1} {2} {3}".format(
                p['options'].get('target_size_ratio', 0.0),
                root_map[root_id].total_target_ratio,
                root_map[root_id].total_target_bytes,
                capacity))
            target_ratio = effective_target_ratio(p['options'].get('target_size_ratio', 0.0),
                                                  root_map[root_id].total_target_ratio,
                                                  root_map[root_id].total_target_bytes,
                                                  capacity)

            final_ratio = max(capacity_ratio, target_ratio)

            # So what proportion of pg allowance should we be using?
            pool_pg_target = (final_ratio * root_map[root_id].pg_target) / p['size'] * bias

            final_pg_target = max(p['options'].get('pg_num_min', PG_NUM_MIN),
                                  nearest_power_of_two(pool_pg_target))
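            # Worked example (hypothetical numbers): on a root with 30 OSDs and
            # mon_target_pg_per_osd = 100, pg_target is 3000.  A 3-replica pool
            # occupying 25% of that root's capacity with bias 1.0 gets
            # (0.25 * 3000) / 3 * 1.0 = 250 PGs, quantized by
            # nearest_power_of_two() to 256 (and never below its pg_num_min).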

            self.log.info("Pool '{0}' root_id {1} using {2} of space, bias {3}, "
                          "pg target {4} quantized to {5} (current {6})".format(
                              p['pool_name'],
                              root_id,
                              final_ratio,
                              bias,
                              pool_pg_target,
                              final_pg_target,
                              p['pg_num_target']
                          ))

            adjust = False
            if (final_pg_target > p['pg_num_target'] * threshold or \
                final_pg_target < p['pg_num_target'] / threshold) and \
                final_ratio >= 0.0 and \
                final_ratio <= 1.0:
                adjust = True

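            # With the default threshold of 3.0 this acts as hysteresis: a pool
            # currently at pg_num_target 32 is only flagged for adjustment once
            # the quantized ideal exceeds 96 or drops below ~11 (and the ratio
            # is sane), which avoids flapping on small changes in usage.
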
            ret.append({
                'pool_id': pool_id,
                'pool_name': p['pool_name'],
                'crush_root_id': root_id,
                'pg_autoscale_mode': p['pg_autoscale_mode'],
                'pg_num_target': p['pg_num_target'],
                'logical_used': pool_logical_used,
                'target_bytes': target_bytes,
                'raw_used_rate': raw_used_rate,
                'subtree_capacity': capacity,
                'actual_raw_used': actual_raw_used,
                'raw_used': pool_raw_used,
                'actual_capacity_ratio': actual_capacity_ratio,
                'capacity_ratio': capacity_ratio,
                'target_ratio': p['options'].get('target_size_ratio', 0.0),
                'effective_target_ratio': target_ratio,
                'pg_num_ideal': int(pool_pg_target),
                'pg_num_final': final_pg_target,
                'would_adjust': adjust,
                'bias': p.get('options', {}).get('pg_autoscale_bias', 1.0),
            })

        return (ret, root_map, pool_root)

    def _update_progress_events(self):
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools()
        for pool_id in list(self._event):
            ev = self._event[pool_id]
            pool_data = pools.get(pool_id)
            if pool_data is None or pool_data['pg_num'] == pool_data['pg_num_target']:
                # pool is gone or we've reached our target
                self.remote('progress', 'complete', ev.ev_id)
                del self._event[pool_id]
                continue
            ev.update(self, (ev.pg_num - pool_data['pg_num']) / (ev.pg_num - ev.pg_num_target))
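            # e.g. an event created at pg_num 32 with target 128 reports
            # (32 - 64) / (32 - 128) = 0.333 once the pool reaches pg_num 64.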

    def _maybe_adjust(self):
        self.log.info('_maybe_adjust')
        osdmap = self.get_osdmap()
        if osdmap.get_require_osd_release() < 'nautilus':
            return
        pools = osdmap.get_pools_by_name()
        ps, root_map, pool_root = self._get_pool_status(osdmap, pools)

        # Anyone in 'warn', set the health message for them and then
        # drop them from consideration.
        too_few = []
        too_many = []
        bytes_and_ratio = []
        health_checks = {}

        total_bytes = dict([(r, 0) for r in iter(root_map)])
        total_target_bytes = dict([(r, 0.0) for r in iter(root_map)])
        target_bytes_pools = dict([(r, []) for r in iter(root_map)])

        for p in ps:
            pool_id = p['pool_id']
            pool_opts = pools[p['pool_name']]['options']
            if pool_opts.get('target_size_ratio', 0) > 0 and pool_opts.get('target_size_bytes', 0) > 0:
                bytes_and_ratio.append('Pool %s has target_size_bytes and target_size_ratio set' % p['pool_name'])
            total_bytes[p['crush_root_id']] += max(
                p['actual_raw_used'],
                p['target_bytes'] * p['raw_used_rate'])
            if p['target_bytes'] > 0:
                total_target_bytes[p['crush_root_id']] += p['target_bytes'] * p['raw_used_rate']
                target_bytes_pools[p['crush_root_id']].append(p['pool_name'])
            if not p['would_adjust']:
                continue
            if p['pg_autoscale_mode'] == 'warn':
                msg = 'Pool %s has %d placement groups, should have %d' % (
                    p['pool_name'],
                    p['pg_num_target'],
                    p['pg_num_final'])
                if p['pg_num_final'] > p['pg_num_target']:
                    too_few.append(msg)
                else:
                    too_many.append(msg)

            if p['pg_autoscale_mode'] == 'on':
                # Note that setting pg_num actually sets pg_num_target (see
                # OSDMonitor.cc)
                r = self.mon_command({
                    'prefix': 'osd pool set',
                    'pool': p['pool_name'],
                    'var': 'pg_num',
                    'val': str(p['pg_num_final'])
                })

                # create new event or update existing one to reflect
                # progress from current state to the new pg_num_target
                pool_data = pools[p['pool_name']]
                pg_num = pool_data['pg_num']
                new_target = p['pg_num_final']
                if pool_id in self._event:
                    self._event[pool_id].reset(pg_num, new_target)
                else:
                    self._event[pool_id] = PgAdjustmentProgress(pool_id, pg_num, new_target)
                self._event[pool_id].update(self, 0.0)

                if r[0] != 0:
                    # FIXME: this is a serious and unexpected thing,
                    # we should expose it as a cluster log error once
                    # the hook for doing that from ceph-mgr modules is
                    # in.
                    self.log.error("pg_num adjustment on {0} to {1} failed: {2}"
                                   .format(p['pool_name'],
                                           p['pg_num_final'], r))

        if too_few:
            summary = "{0} pools have too few placement groups".format(
                len(too_few))
            health_checks['POOL_TOO_FEW_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'count': len(too_few),
                'detail': too_few
            }
        if too_many:
            summary = "{0} pools have too many placement groups".format(
                len(too_many))
            health_checks['POOL_TOO_MANY_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'count': len(too_many),
                'detail': too_many
            }

        too_much_target_bytes = []
        for root_id, total in iteritems(total_bytes):
            total_target = total_target_bytes[root_id]
            if total_target > 0 and total > root_map[root_id].capacity and root_map[root_id].capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'target_size_bytes %s on pools %s' % (
                        root_map[root_id].pool_names,
                        total / root_map[root_id].capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                        target_bytes_pools[root_id]
                    )
                )
            elif total_target > root_map[root_id].capacity and root_map[root_id].capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'collective target_size_bytes of %s' % (
                        root_map[root_id].pool_names,
                        total / root_map[root_id].capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                    )
                )
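        # Worked example (hypothetical numbers): two 3-replica pools on a 1 TiB
        # root, each with target_size_bytes = 600 GiB, reserve 2 * 600 GiB * 3
        # = 3600 GiB of raw space, so the check above reports roughly a 3.5x
        # overcommit for that root.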
        if too_much_target_bytes:
            health_checks['POOL_TARGET_SIZE_BYTES_OVERCOMMITTED'] = {
                'severity': 'warning',
                'summary': "%d subtrees have overcommitted pool target_size_bytes" % len(too_much_target_bytes),
                'count': len(too_much_target_bytes),
                'detail': too_much_target_bytes,
            }

        if bytes_and_ratio:
            health_checks['POOL_HAS_TARGET_SIZE_BYTES_AND_RATIO'] = {
                'severity': 'warning',
                'summary': "%d pools have both target_size_bytes and target_size_ratio set" % len(bytes_and_ratio),
                'count': len(bytes_and_ratio),
                'detail': bytes_and_ratio,
            }

        self.set_health_checks(health_checks)