1 """
2 Automatically scale pg_num based on how much data is stored in each pool.
3 """
4
5 import errno
6 import json
7 import mgr_util
8 import threading
9 import uuid
10 from six import itervalues, iteritems
11 from collections import defaultdict
12 from prettytable import PrettyTable, PLAIN_COLUMNS
13
14 from mgr_module import MgrModule
15
16 """
17 Some terminology is made up for the purposes of this module:
18
19 - "raw pgs": pg count after applying replication, i.e. the real resource
20 consumption of a pool.
21 - "grow/shrink" - increase/decrease the pg_num in a pool
22 - "crush subtree" - non-overlapping domains in crush hierarchy: used as
23 units of resource management.
24 """

INTERVAL = 5

PG_NUM_MIN = 32  # unless specified on a per-pool basis

def nearest_power_of_two(n):
    v = int(n)

    v -= 1
    v |= v >> 1
    v |= v >> 2
    v |= v >> 4
    v |= v >> 8
    v |= v >> 16

    # High bound power of two
    v += 1
    # Low bound power of two
    x = v >> 1

    return x if (v - n) > (n - x) else v
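
# Hand-checked examples of the rounding above:
#   nearest_power_of_two(200)  -> 256  (256 is 56 away, 128 is 72 away)
#   nearest_power_of_two(33.3) -> 32   (32 is 1.3 away, 64 is 30.7 away)
#   nearest_power_of_two(48)   -> 64   (exact ties round up)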


class PgAutoscaler(MgrModule):
    """
    PG autoscaler.
    """
    COMMANDS = [
        {
            "cmd": "osd pool autoscale-status",
            "desc": "report on pool pg_num sizing recommendation and intent",
            "perm": "r"
        },
    ]

    NATIVE_OPTIONS = [
        'mon_target_pg_per_osd',
        'mon_max_pg_per_osd',
    ]

    MODULE_OPTIONS = [
        {
            'name': 'sleep_interval',
            'default': str(60),
        },
    ]
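
    # sleep_interval (seconds, default 60) controls how often serve() below
    # wakes up and calls _maybe_adjust().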

    def __init__(self, *args, **kwargs):
        super(PgAutoscaler, self).__init__(*args, **kwargs)
        self._shutdown = threading.Event()

        # So much of what we do peeks at the osdmap that it's easiest
        # to just keep a copy of the pythonized version.
        self._osd_map = None

    def config_notify(self):
        for opt in self.NATIVE_OPTIONS:
            setattr(self,
                    opt,
                    self.get_ceph_option(opt))
            self.log.debug(' native option %s = %s', opt, getattr(self, opt))
        for opt in self.MODULE_OPTIONS:
            setattr(self,
                    opt['name'],
                    self.get_module_option(opt['name']) or opt['default'])
            self.log.debug(' mgr option %s = %s',
                           opt['name'], getattr(self, opt['name']))


    def handle_command(self, inbuf, cmd):
        if cmd['prefix'] == "osd pool autoscale-status":
            retval = self._command_autoscale_status(cmd)
        else:
            assert False  # ceph-mgr should never pass us unknown cmds
        return retval

    def _command_autoscale_status(self, cmd):
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools_by_name()
        ps, root_map, pool_root = self._get_pool_status(osdmap, pools)

        if cmd.get('format') == 'json' or cmd.get('format') == 'json-pretty':
            return 0, json.dumps(ps, indent=2), ''
        else:
            table = PrettyTable(['POOL', 'SIZE', 'TARGET SIZE',
                                 'RATE', 'RAW CAPACITY',
                                 'RATIO', 'TARGET RATIO',
                                 'BIAS',
                                 'PG_NUM',
                                 # 'IDEAL',
                                 'NEW PG_NUM', 'AUTOSCALE'],
                                border=False)
            table.left_padding_width = 0
            table.right_padding_width = 1
            table.align['POOL'] = 'l'
            table.align['SIZE'] = 'r'
            table.align['TARGET SIZE'] = 'r'
            table.align['RATE'] = 'r'
            table.align['RAW CAPACITY'] = 'r'
            table.align['RATIO'] = 'r'
            table.align['TARGET RATIO'] = 'r'
            table.align['BIAS'] = 'r'
            table.align['PG_NUM'] = 'r'
            # table.align['IDEAL'] = 'r'
            table.align['NEW PG_NUM'] = 'r'
            table.align['AUTOSCALE'] = 'l'
            for p in ps:
                if p['would_adjust']:
                    final = str(p['pg_num_final'])
                else:
                    final = ''
                if p['target_bytes'] > 0:
                    ts = mgr_util.format_bytes(p['target_bytes'], 6)
                else:
                    ts = ''
                if p['target_ratio'] > 0.0:
                    tr = '%.4f' % p['target_ratio']
                else:
                    tr = ''
                table.add_row([
                    p['pool_name'],
                    mgr_util.format_bytes(p['logical_used'], 6),
                    ts,
                    p['raw_used_rate'],
                    mgr_util.format_bytes(p['subtree_capacity'], 6),
                    '%.4f' % p['capacity_ratio'],
                    tr,
                    p['bias'],
                    p['pg_num_target'],
                    # p['pg_num_ideal'],
                    final,
                    p['pg_autoscale_mode'],
                ])
            return 0, table.get_string(), ''

    def serve(self):
        self.config_notify()
        while not self._shutdown.is_set():
            self._maybe_adjust()
            self._shutdown.wait(timeout=int(self.sleep_interval))

    def shutdown(self):
        self.log.info('Stopping pg_autoscaler')
        self._shutdown.set()

    def get_subtree_resource_status(self, osdmap, crush):
        """
        For each CRUSH subtree of interest (i.e. the roots under which
        we have pools), calculate the current resource usages and targets,
        such as how many PGs there are, vs. how many PGs we would
        like there to be.
        """
        result = {}
        pool_root = {}
        roots = []

        class CrushSubtreeResourceStatus(object):
            def __init__(self):
                self.root_ids = []
                self.osds = set()
                self.osd_count = None  # Number of OSDs
                self.pg_target = None  # Ideal full-capacity PG count?
                self.pg_current = 0  # How many PGs already?
                self.capacity = None  # Total capacity of OSDs in subtree
                self.pool_ids = []
                self.pool_names = []

        # identify subtrees (note that they may overlap!)
        for pool_id, pool in osdmap.get_pools().items():
            cr_name = crush.get_rule_by_id(pool['crush_rule'])['rule_name']
            root_id = int(crush.get_rule_root(cr_name))
            pool_root[pool_id] = root_id
            osds = set(crush.get_osds_under(root_id))

            # do we intersect an existing root?
            s = None
            for prev in itervalues(result):
                if osds & prev.osds:
                    s = prev
                    break
            if not s:
                s = CrushSubtreeResourceStatus()
                roots.append(s)
            result[root_id] = s
            s.root_ids.append(root_id)
            s.osds |= osds
            s.pool_ids.append(int(pool_id))
            s.pool_names.append(pool['pool_name'])
            s.pg_current += pool['pg_num_target'] * pool['size']
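
            # e.g. pools whose CRUSH rules map to disjoint 'ssd' and 'hdd'
            # roots are tracked as separate subtrees, while rules whose OSD
            # sets intersect are merged into one shared
            # CrushSubtreeResourceStatus above.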


        # finish subtrees
        all_stats = self.get('osd_stats')
        for s in roots:
            s.osd_count = len(s.osds)
            s.pg_target = s.osd_count * int(self.mon_target_pg_per_osd)

            capacity = 0.0
            for osd_stats in all_stats['osd_stats']:
                if osd_stats['osd'] in s.osds:
                    # Intentionally do not apply the OSD's reweight to
                    # this, because we want to calculate PG counts based
                    # on the physical storage available, not how it is
                    # reweighted right now.
                    capacity += osd_stats['kb'] * 1024

            s.capacity = capacity

            self.log.debug('root_ids %s pools %s with %d osds, pg_target %d',
                           s.root_ids,
                           s.pool_ids,
                           s.osd_count,
                           s.pg_target)

        return result, pool_root


    def _get_pool_status(
            self,
            osdmap,
            pools,
            threshold=3.0,
    ):
        assert threshold >= 2.0

        crush_map = osdmap.get_crush()

        root_map, pool_root = self.get_subtree_resource_status(osdmap, crush_map)

        df = self.get('df')
        pool_stats = dict([(p['id'], p['stats']) for p in df['pools']])

        ret = []

        # iterate over all pools to determine how they should be sized
        for pool_name, p in iteritems(pools):
            pool_id = p['pool']
            if pool_id not in pool_stats:
                # race with pool deletion; skip
                continue

            # FIXME: we assume there is only one take per pool, but that
            # may not be true.
            cr_name = crush_map.get_rule_by_id(p['crush_rule'])['rule_name']
            root_id = int(crush_map.get_rule_root(cr_name))
            pool_root[pool_name] = root_id

            capacity = root_map[root_id].capacity
            if capacity == 0:
                self.log.debug('skipping empty subtree %s', cr_name)
                continue

            raw_used_rate = osdmap.pool_raw_used_rate(pool_id)

            pool_logical_used = pool_stats[pool_id]['stored']
            bias = p['options'].get('pg_autoscale_bias', 1.0)
            target_bytes = p['options'].get('target_size_bytes', 0)

            # What proportion of space are we using?
            actual_raw_used = pool_logical_used * raw_used_rate
            actual_capacity_ratio = float(actual_raw_used) / capacity

            pool_raw_used = max(pool_logical_used, target_bytes) * raw_used_rate
            capacity_ratio = float(pool_raw_used) / capacity

            target_ratio = p['options'].get('target_size_ratio', 0.0)
            final_ratio = max(capacity_ratio, target_ratio)

            # So what proportion of pg allowance should we be using?
            pool_pg_target = (final_ratio * root_map[root_id].pg_target) / p['size'] * bias

            final_pg_target = max(p['options'].get('pg_num_min', PG_NUM_MIN),
                                  nearest_power_of_two(pool_pg_target))
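
            # Worked example with hypothetical numbers: a root of 10 OSDs and
            # mon_target_pg_per_osd=100 (the Ceph default) gives
            # pg_target=1000; a size=3 pool occupying 25% of that root
            # (final_ratio=0.25, bias=1.0) wants 0.25 * 1000 / 3 ~= 83.3 PGs,
            # which nearest_power_of_two() quantizes to 64, comfortably above
            # the PG_NUM_MIN floor of 32.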

            self.log.info("Pool '{0}' root_id {1} using {2} of space, bias {3}, "
                          "pg target {4} quantized to {5} (current {6})".format(
                              p['pool_name'],
                              root_id,
                              final_ratio,
                              bias,
                              pool_pg_target,
                              final_pg_target,
                              p['pg_num_target']
                          ))

            adjust = False
            if (final_pg_target > p['pg_num_target'] * threshold or \
                final_pg_target <= p['pg_num_target'] / threshold) and \
                final_ratio >= 0.0 and \
                final_ratio <= 1.0:
                adjust = True
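
            # With threshold=3.0 and final_ratio within [0, 1], a pool
            # currently at pg_num_target=128 only adjusts once the quantized
            # target reaches 512 (> 128 * 3) or falls to 32 (<= 128 / 3),
            # which damps oscillation around small changes in usage.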

            ret.append({
                'pool_id': pool_id,
                'pool_name': p['pool_name'],
                'crush_root_id': root_id,
                'pg_autoscale_mode': p['pg_autoscale_mode'],
                'pg_num_target': p['pg_num_target'],
                'logical_used': pool_logical_used,
                'target_bytes': target_bytes,
                'raw_used_rate': raw_used_rate,
                'subtree_capacity': capacity,
                'actual_raw_used': actual_raw_used,
                'raw_used': pool_raw_used,
                'actual_capacity_ratio': actual_capacity_ratio,
                'capacity_ratio': capacity_ratio,
                'target_ratio': target_ratio,
                'pg_num_ideal': int(pool_pg_target),
                'pg_num_final': final_pg_target,
                'would_adjust': adjust,
                'bias': p.get('options', {}).get('pg_autoscale_bias', 1.0),
            })

        return (ret, root_map, pool_root)


    def _maybe_adjust(self):
        self.log.info('_maybe_adjust')
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools_by_name()
        ps, root_map, pool_root = self._get_pool_status(osdmap, pools)

        # For pools in 'warn' mode, set a health message and then drop
        # them from consideration.
        too_few = []
        too_many = []
        health_checks = {}

        total_ratio = dict([(r, 0.0) for r in iter(root_map)])
        total_target_ratio = dict([(r, 0.0) for r in iter(root_map)])
        target_ratio_pools = dict([(r, []) for r in iter(root_map)])

        total_bytes = dict([(r, 0) for r in iter(root_map)])
        total_target_bytes = dict([(r, 0.0) for r in iter(root_map)])
        target_bytes_pools = dict([(r, []) for r in iter(root_map)])

        for p in ps:
            total_ratio[p['crush_root_id']] += max(p['actual_capacity_ratio'],
                                                   p['target_ratio'])
            if p['target_ratio'] > 0:
                total_target_ratio[p['crush_root_id']] += p['target_ratio']
                target_ratio_pools[p['crush_root_id']].append(p['pool_name'])
            total_bytes[p['crush_root_id']] += max(
                p['actual_raw_used'],
                p['target_bytes'] * p['raw_used_rate'])
            if p['target_bytes'] > 0:
                total_target_bytes[p['crush_root_id']] += p['target_bytes'] * p['raw_used_rate']
                target_bytes_pools[p['crush_root_id']].append(p['pool_name'])
            if not p['would_adjust']:
                continue
            if p['pg_autoscale_mode'] == 'warn':
                msg = 'Pool %s has %d placement groups, should have %d' % (
                    p['pool_name'],
                    p['pg_num_target'],
                    p['pg_num_final'])
                if p['pg_num_final'] > p['pg_num_target']:
                    too_few.append(msg)
                else:
                    too_many.append(msg)

            if p['pg_autoscale_mode'] == 'on':
                # Note that setting pg_num actually sets pg_num_target (see
                # OSDMonitor.cc)
                r = self.mon_command({
                    'prefix': 'osd pool set',
                    'pool': p['pool_name'],
                    'var': 'pg_num',
                    'val': str(p['pg_num_final'])
                })
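
                # The call above is the programmatic equivalent of running
                # `ceph osd pool set <pool> pg_num <n>` from the CLI.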

                if r[0] != 0:
                    # FIXME: this is a serious and unexpected thing,
                    # we should expose it as a cluster log error once
                    # the hook for doing that from ceph-mgr modules is
                    # in.
                    self.log.error("pg_num adjustment on {0} to {1} failed: {2}"
                                   .format(p['pool_name'],
                                           p['pg_num_final'], r))

        if too_few:
            summary = "{0} pools have too few placement groups".format(
                len(too_few))
            health_checks['POOL_TOO_FEW_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'detail': too_few
            }
        if too_many:
            summary = "{0} pools have too many placement groups".format(
                len(too_many))
            health_checks['POOL_TOO_MANY_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'detail': too_many
            }

        too_much_target_ratio = []
        for root_id, total in iteritems(total_ratio):
            total_target = total_target_ratio[root_id]
            if total_target > 0 and total > 1.0:
                too_much_target_ratio.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'target_size_ratio %.03f on pools %s' % (
                        root_map[root_id].pool_names,
                        total,
                        total_target,
                        target_ratio_pools[root_id]
                    )
                )
            elif total_target > 1.0:
                too_much_target_ratio.append(
                    'Pools %s have collective target_size_ratio %.03f > 1.0' % (
                        root_map[root_id].pool_names,
                        total_target
                    )
                )
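
        # e.g. two pools under the same root with target_size_ratio 0.7 each
        # sum to a collective ratio of at least 1.4, so the branch above
        # reports roughly a 1.4x overcommit even before much data is written.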
        if too_much_target_ratio:
            health_checks['POOL_TARGET_SIZE_RATIO_OVERCOMMITTED'] = {
                'severity': 'warning',
                'summary': "%d subtrees have overcommitted pool target_size_ratio" % len(too_much_target_ratio),
                'detail': too_much_target_ratio,
            }

        too_much_target_bytes = []
        for root_id, total in iteritems(total_bytes):
            total_target = total_target_bytes[root_id]
            if total_target > 0 and total > root_map[root_id].capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'target_size_bytes %s on pools %s' % (
                        root_map[root_id].pool_names,
                        total / root_map[root_id].capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                        target_bytes_pools[root_id]
                    )
                )
            elif total_target > root_map[root_id].capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'collective target_size_bytes of %s' % (
                        root_map[root_id].pool_names,
                        total / root_map[root_id].capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                    )
                )
        if too_much_target_bytes:
            health_checks['POOL_TARGET_SIZE_BYTES_OVERCOMMITTED'] = {
                'severity': 'warning',
                'summary': "%d subtrees have overcommitted pool target_size_bytes" % len(too_much_target_bytes),
                'detail': too_much_target_bytes,
            }


        self.set_health_checks(health_checks)