# ceph/src/pybind/mgr/pg_autoscaler/module.py (ceph 14.2.5)
1"""
2Automatically scale pg_num based on how much data is stored in each pool.
3"""
4
5import errno
6import json
7import mgr_util
8import threading
9import uuid
81eedcae 10from six import itervalues, iteritems
11fdf7f2 11from collections import defaultdict
eafe8130 12from prettytable import PrettyTable, PLAIN_COLUMNS
11fdf7f2
TL
13
14from mgr_module import MgrModule
15
16"""
17Some terminology is made up for the purposes of this module:
18
19 - "raw pgs": pg count after applying replication, i.e. the real resource
20 consumption of a pool.
21 - "grow/shrink" - increase/decrease the pg_num in a pool
22 - "crush subtree" - non-overlapping domains in crush hierarchy: used as
23 units of resource management.
24"""
25
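# Illustration of "raw pgs" (hypothetical numbers, not from the source): a
# replicated pool with pg_num 128 and size 3 consumes 128 * 3 = 384 raw PGs.
# Each CRUSH subtree's raw-PG budget is osd_count * mon_target_pg_per_osd
# (see get_subtree_resource_status below).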
INTERVAL = 5

PG_NUM_MIN = 4  # unless specified on a per-pool basis

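# Round a PG count to the nearest power of two; exact ties round up.
# Illustrative values: 200 -> 256, 1500 -> 1024, 6 -> 8.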
def nearest_power_of_two(n):
    v = int(n)

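    # Smear the highest set bit of v - 1 into every lower bit position;
    # v + 1 is then the smallest power of two >= n (covers 32-bit values,
    # which is plenty for a PG count).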
    v -= 1
    v |= v >> 1
    v |= v >> 2
    v |= v >> 4
    v |= v >> 8
    v |= v >> 16

    # High bound power of two
    v += 1

    # Low bound power of two
    x = v >> 1

    return x if (v - n) > (n - x) else v


class PgAutoscaler(MgrModule):
    """
    PG autoscaler.
    """
    COMMANDS = [
        {
            "cmd": "osd pool autoscale-status",
            "desc": "report on pool pg_num sizing recommendation and intent",
            "perm": "r"
        },
    ]
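    # Exposed through the normal Ceph CLI, e.g. `ceph osd pool
    # autoscale-status` (optionally with `--format json`); dispatched to
    # handle_command() below.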

    NATIVE_OPTIONS = [
        'mon_target_pg_per_osd',
        'mon_max_pg_per_osd',
    ]

    MODULE_OPTIONS = [
        {
            'name': 'sleep_interval',
            'default': str(60),
        },
    ]

    def __init__(self, *args, **kwargs):
        super(PgAutoscaler, self).__init__(*args, **kwargs)
        self._shutdown = threading.Event()

        # So much of what we do peeks at the osdmap that it's easiest
        # to just keep a copy of the pythonized version.
        self._osd_map = None

    def config_notify(self):
        for opt in self.NATIVE_OPTIONS:
            setattr(self,
                    opt,
                    self.get_ceph_option(opt))
            self.log.debug(' native option %s = %s', opt, getattr(self, opt))
        for opt in self.MODULE_OPTIONS:
            setattr(self,
                    opt['name'],
                    self.get_module_option(opt['name']) or opt['default'])
            self.log.debug(' mgr option %s = %s',
                           opt['name'], getattr(self, opt['name']))


    def handle_command(self, inbuf, cmd):
        if cmd['prefix'] == "osd pool autoscale-status":
            retval = self._command_autoscale_status(cmd)
        else:
            assert False  # ceph-mgr should never pass us unknown cmds
        return retval

    def _command_autoscale_status(self, cmd):
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools_by_name()
        ps, root_map, pool_root = self._get_pool_status(osdmap, pools)

        if cmd.get('format') == 'json' or cmd.get('format') == 'json-pretty':
            return 0, json.dumps(ps, indent=2), ''
        else:
            table = PrettyTable(['POOL', 'SIZE', 'TARGET SIZE',
                                 'RATE', 'RAW CAPACITY',
                                 'RATIO', 'TARGET RATIO',
                                 'BIAS',
                                 'PG_NUM',
#                                 'IDEAL',
                                 'NEW PG_NUM', 'AUTOSCALE'],
                                border=False)
            table.left_padding_width = 0
            table.right_padding_width = 1
            table.align['POOL'] = 'l'
            table.align['SIZE'] = 'r'
            table.align['TARGET SIZE'] = 'r'
            table.align['RATE'] = 'r'
            table.align['RAW CAPACITY'] = 'r'
            table.align['RATIO'] = 'r'
            table.align['TARGET RATIO'] = 'r'
            table.align['BIAS'] = 'r'
            table.align['PG_NUM'] = 'r'
#            table.align['IDEAL'] = 'r'
            table.align['NEW PG_NUM'] = 'r'
            table.align['AUTOSCALE'] = 'l'
            for p in ps:
                if p['would_adjust']:
                    final = str(p['pg_num_final'])
                else:
                    final = ''
                if p['target_bytes'] > 0:
                    ts = mgr_util.format_bytes(p['target_bytes'], 6)
                else:
                    ts = ''
                if p['target_ratio'] > 0.0:
                    tr = '%.4f' % p['target_ratio']
                else:
                    tr = ''
                table.add_row([
                    p['pool_name'],
                    mgr_util.format_bytes(p['logical_used'], 6),
                    ts,
                    p['raw_used_rate'],
                    mgr_util.format_bytes(p['subtree_capacity'], 6),
                    '%.4f' % p['capacity_ratio'],
                    tr,
                    p['bias'],
                    p['pg_num_target'],
#                    p['pg_num_ideal'],
                    final,
                    p['pg_autoscale_mode'],
                ])
            return 0, table.get_string(), ''

    def serve(self):
        self.config_notify()
        while not self._shutdown.is_set():
            self._maybe_adjust()
            self._shutdown.wait(timeout=int(self.sleep_interval))

    def get_subtree_resource_status(self, osdmap, crush):
        """
        For each CRUSH subtree of interest (i.e. the roots under which
        we have pools), calculate the current resource usages and targets,
        such as how many PGs there are, vs. how many PGs we would
        like there to be.
        """
        result = {}
        pool_root = {}
        roots = []

        class CrushSubtreeResourceStatus(object):
            def __init__(self):
                self.root_ids = []
                self.osds = set()
                self.osd_count = None  # Number of OSDs
                self.pg_target = None  # Ideal full-capacity PG count?
                self.pg_current = 0  # How many PGs already?
                self.capacity = None  # Total capacity of OSDs in subtree
                self.pool_ids = []
                self.pool_names = []

        # identify subtrees (note that they may overlap!)
        for pool_id, pool in osdmap.get_pools().items():
            cr_name = crush.get_rule_by_id(pool['crush_rule'])['rule_name']
            root_id = int(crush.get_rule_root(cr_name))
            pool_root[pool_id] = root_id
            osds = set(crush.get_osds_under(root_id))

            # do we intersect an existing root?
            s = None
            for prev in itervalues(result):
                if osds & prev.osds:
                    s = prev
                    break
            if not s:
                s = CrushSubtreeResourceStatus()
                roots.append(s)
            result[root_id] = s
            s.root_ids.append(root_id)
            s.osds |= osds
            s.pool_ids.append(int(pool_id))
            s.pool_names.append(pool['pool_name'])
            s.pg_current += pool['pg_num_target'] * pool['size']


        # finish subtrees
        all_stats = self.get('osd_stats')
        for s in roots:
            s.osd_count = len(s.osds)
            s.pg_target = s.osd_count * int(self.mon_target_pg_per_osd)
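            # e.g. (hypothetical) a subtree with 10 OSDs and the default
            # mon_target_pg_per_osd of 100 gets a budget of ~1000 raw PGs
            # shared by all pools mapped to it.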

            capacity = 0.0
            for osd_stats in all_stats['osd_stats']:
                if osd_stats['osd'] in s.osds:
                    # Intentionally do not apply the OSD's reweight to
                    # this, because we want to calculate PG counts based
                    # on the physical storage available, not how it is
                    # reweighted right now.
                    capacity += osd_stats['kb'] * 1024

            s.capacity = capacity

            self.log.debug('root_ids %s pools %s with %d osds, pg_target %d',
                           s.root_ids,
                           s.pool_ids,
                           s.osd_count,
                           s.pg_target)

        return result, pool_root


    def _get_pool_status(
            self,
            osdmap,
            pools,
            threshold=3.0,
    ):
        assert threshold >= 2.0

        crush_map = osdmap.get_crush()

        root_map, pool_root = self.get_subtree_resource_status(osdmap, crush_map)

        df = self.get('df')
        pool_stats = dict([(p['id'], p['stats']) for p in df['pools']])

        ret = []

        # iterate over all pools to determine how they should be sized
        for pool_name, p in iteritems(pools):
            pool_id = p['pool']
            if pool_id not in pool_stats:
                # race with pool deletion; skip
                continue

            # FIXME: we assume there is only one take per pool, but that
            # may not be true.
            cr_name = crush_map.get_rule_by_id(p['crush_rule'])['rule_name']
            root_id = int(crush_map.get_rule_root(cr_name))
            pool_root[pool_name] = root_id

            capacity = root_map[root_id].capacity
            if capacity == 0:
                self.log.debug('skipping empty subtree %s', cr_name)
                continue

            raw_used_rate = osdmap.pool_raw_used_rate(pool_id)

            pool_logical_used = pool_stats[pool_id]['stored']
            bias = p['options'].get('pg_autoscale_bias', 1.0)
            target_bytes = p['options'].get('target_size_bytes', 0)
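            # target_size_bytes and target_size_ratio let the administrator
            # declare how much data a pool is expected to hold, so PGs can
            # be created up front rather than only after data arrives.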

            # What proportion of space are we using?
            actual_raw_used = pool_logical_used * raw_used_rate
            actual_capacity_ratio = float(actual_raw_used) / capacity

            pool_raw_used = max(pool_logical_used, target_bytes) * raw_used_rate
            capacity_ratio = float(pool_raw_used) / capacity

            target_ratio = p['options'].get('target_size_ratio', 0.0)
            final_ratio = max(capacity_ratio, target_ratio)

            # So what proportion of pg allowance should we be using?
            pool_pg_target = (final_ratio * root_map[root_id].pg_target) / raw_used_rate * bias

            final_pg_target = max(p['options'].get('pg_num_min', PG_NUM_MIN),
                                  nearest_power_of_two(pool_pg_target))
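            # Worked example (hypothetical numbers): a 3x replicated pool
            # filling 25% of its subtree, with a subtree budget of 3000 raw
            # PGs and bias 1.0, yields 0.25 * 3000 / 3 * 1.0 = 250, which
            # quantizes to 256 PGs.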

            self.log.info("Pool '{0}' root_id {1} using {2} of space, bias {3}, "
                          "pg target {4} quantized to {5} (current {6})".format(
                              p['pool_name'],
                              root_id,
                              final_ratio,
                              bias,
                              pool_pg_target,
                              final_pg_target,
                              p['pg_num_target']
                          ))

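            # Only propose a change when the ideal count is off by more than
            # `threshold` (default 3x) in either direction, e.g. a pool at
            # pg_num 128 is left alone until its ideal target exceeds 384 or
            # falls to 42 or below; smaller drifts are not worth the data
            # movement.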
            adjust = False
            if (final_pg_target > p['pg_num_target'] * threshold or \
                final_pg_target <= p['pg_num_target'] / threshold) and \
                final_ratio >= 0.0 and \
                final_ratio <= 1.0:
                adjust = True

            ret.append({
                'pool_id': pool_id,
                'pool_name': p['pool_name'],
                'crush_root_id': root_id,
                'pg_autoscale_mode': p['pg_autoscale_mode'],
                'pg_num_target': p['pg_num_target'],
                'logical_used': pool_logical_used,
                'target_bytes': target_bytes,
                'raw_used_rate': raw_used_rate,
                'subtree_capacity': capacity,
                'actual_raw_used': actual_raw_used,
                'raw_used': pool_raw_used,
                'actual_capacity_ratio': actual_capacity_ratio,
                'capacity_ratio': capacity_ratio,
                'target_ratio': target_ratio,
                'pg_num_ideal': int(pool_pg_target),
                'pg_num_final': final_pg_target,
                'would_adjust': adjust,
                'bias': p.get('options', {}).get('pg_autoscale_bias', 1.0),
            });

        return (ret, root_map, pool_root)


    def _maybe_adjust(self):
        self.log.info('_maybe_adjust')
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools_by_name()
        ps, root_map, pool_root = self._get_pool_status(osdmap, pools)

        # For any pool in 'warn' mode, set a health message and then drop it
        # from consideration.
        too_few = []
        too_many = []
        health_checks = {}

        total_ratio = dict([(r, 0.0) for r in iter(root_map)])
        total_target_ratio = dict([(r, 0.0) for r in iter(root_map)])
        target_ratio_pools = dict([(r, []) for r in iter(root_map)])

        total_bytes = dict([(r, 0) for r in iter(root_map)])
        total_target_bytes = dict([(r, 0.0) for r in iter(root_map)])
        target_bytes_pools = dict([(r, []) for r in iter(root_map)])

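        # Tally each CRUSH root's committed usage so overcommit can be
        # flagged, e.g. (hypothetical) two pools under one root with
        # target_size_ratio 0.6 and 0.5 commit 1.1x of that subtree and
        # raise POOL_TARGET_SIZE_RATIO_OVERCOMMITTED below.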
        for p in ps:
            total_ratio[p['crush_root_id']] += max(p['actual_capacity_ratio'],
                                                   p['target_ratio'])
            if p['target_ratio'] > 0:
                total_target_ratio[p['crush_root_id']] += p['target_ratio']
                target_ratio_pools[p['crush_root_id']].append(p['pool_name'])
            total_bytes[p['crush_root_id']] += max(
                p['actual_raw_used'],
                p['target_bytes'] * p['raw_used_rate'])
            if p['target_bytes'] > 0:
                total_target_bytes[p['crush_root_id']] += p['target_bytes'] * p['raw_used_rate']
                target_bytes_pools[p['crush_root_id']].append(p['pool_name'])
            if not p['would_adjust']:
                continue
            if p['pg_autoscale_mode'] == 'warn':
                msg = 'Pool %s has %d placement groups, should have %d' % (
                    p['pool_name'],
                    p['pg_num_target'],
                    p['pg_num_final'])
                if p['pg_num_final'] > p['pg_num_target']:
                    too_few.append(msg)
                else:
                    too_many.append(msg)

            if p['pg_autoscale_mode'] == 'on':
                # Note that setting pg_num actually sets pg_num_target (see
                # OSDMonitor.cc)
                r = self.mon_command({
                    'prefix': 'osd pool set',
                    'pool': p['pool_name'],
                    'var': 'pg_num',
                    'val': str(p['pg_num_final'])
                })

                if r[0] != 0:
                    # FIXME: this is a serious and unexpected thing,
                    # we should expose it as a cluster log error once
                    # the hook for doing that from ceph-mgr modules is
                    # in.
                    self.log.error("pg_num adjustment on {0} to {1} failed: {2}"
                                   .format(p['pool_name'],
                                           p['pg_num_final'], r))

        if too_few:
            summary = "{0} pools have too few placement groups".format(
                len(too_few))
            health_checks['POOL_TOO_FEW_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'detail': too_few
            }
        if too_many:
            summary = "{0} pools have too many placement groups".format(
                len(too_many))
            health_checks['POOL_TOO_MANY_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'detail': too_many
            }

        too_much_target_ratio = []
        for root_id, total in iteritems(total_ratio):
            total_target = total_target_ratio[root_id]
            if total > 1.0:
                too_much_target_ratio.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'target_size_ratio %.03f on pools %s' % (
                        root_map[root_id].pool_names,
                        total,
                        total_target,
                        target_ratio_pools[root_id]
                    )
                )
            elif total_target > 1.0:
                too_much_target_ratio.append(
                    'Pools %s have collective target_size_ratio %.03f > 1.0' % (
                        root_map[root_id].pool_names,
                        total_target
                    )
                )
        if too_much_target_ratio:
            health_checks['POOL_TARGET_SIZE_RATIO_OVERCOMMITTED'] = {
                'severity': 'warning',
                'summary': "%d subtrees have overcommitted pool target_size_ratio" % len(too_much_target_ratio),
                'detail': too_much_target_ratio,
            }

        too_much_target_bytes = []
        for root_id, total in iteritems(total_bytes):
            total_target = total_target_bytes[root_id]
            if total > root_map[root_id].capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'target_size_bytes %s on pools %s' % (
                        root_map[root_id].pool_names,
                        total / root_map[root_id].capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                        target_bytes_pools[root_id]
                    )
                )
            elif total_target > root_map[root_id].capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'collective target_size_bytes of %s' % (
                        root_map[root_id].pool_names,
                        total / root_map[root_id].capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                    )
                )
        if too_much_target_bytes:
            health_checks['POOL_TARGET_SIZE_BYTES_OVERCOMMITTED'] = {
                'severity': 'warning',
                'summary': "%d subtrees have overcommitted pool target_size_bytes" % len(too_much_target_bytes),
                'detail': too_much_target_bytes,
            }


        self.set_health_checks(health_checks)