1"""
2Balance PG distribution across OSDs.
3"""
4
5import copy
6import errno
7import json
8import math
9import random
1adf2230 10import six
3efd9988
FG
11import time
12from mgr_module import MgrModule, CommandResult
13from threading import Event
b32b8144 14from mgr_module import CRUSHMap
eafe8130 15import datetime
3efd9988 16
3efd9988
FG
17TIME_FORMAT = '%Y-%m-%d_%H:%M:%S'
18
class MappingState:
    def __init__(self, osdmap, raw_pg_stats, raw_pool_stats, desc=''):
        self.desc = desc
        self.osdmap = osdmap
        self.osdmap_dump = self.osdmap.dump()
        self.crush = osdmap.get_crush()
        self.crush_dump = self.crush.dump()
        self.raw_pg_stats = raw_pg_stats
        self.raw_pool_stats = raw_pool_stats
        self.pg_stat = {
            i['pgid']: i['stat_sum'] for i in raw_pg_stats.get('pg_stats', [])
        }
        osd_poolids = [p['pool'] for p in self.osdmap_dump.get('pools', [])]
        pg_poolids = [p['poolid'] for p in raw_pool_stats.get('pool_stats', [])]
        self.poolids = set(osd_poolids) & set(pg_poolids)
        self.pg_up = {}
        self.pg_up_by_poolid = {}
        for poolid in self.poolids:
            self.pg_up_by_poolid[poolid] = osdmap.map_pool_pgs_up(poolid)
            for a, b in six.iteritems(self.pg_up_by_poolid[poolid]):
                self.pg_up[a] = b

    def calc_misplaced_from(self, other_ms):
        num = len(other_ms.pg_up)
        misplaced = 0
        for pgid, before in six.iteritems(other_ms.pg_up):
            if before != self.pg_up.get(pgid, []):
                misplaced += 1
        if num > 0:
            return float(misplaced) / float(num)
        return 0.0

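# Worked example for calc_misplaced_from() (hypothetical numbers): if the
# previous MappingState tracked 100 PGs and 3 of them map to a different
# up set in the new state, the returned ratio is 3 / 100 = 0.03, i.e. 3%
# of PGs would move as a result of the change.
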
class Plan(object):
    def __init__(self, name, mode, osdmap, pools):
        self.name = name
        self.mode = mode
        self.osdmap = osdmap
        self.osdmap_dump = osdmap.dump()
        self.pools = pools
        self.osd_weights = {}
        self.compat_ws = {}
        self.inc = osdmap.new_incremental()
        self.pg_status = {}


class MsPlan(Plan):
    """
    Plan with a preloaded MappingState member.
    """
    def __init__(self, name, mode, ms, pools):
        super(MsPlan, self).__init__(name, mode, ms.osdmap, pools)
        self.initial = ms

    def final_state(self):
        self.inc.set_osd_reweights(self.osd_weights)
        self.inc.set_crush_compat_weight_set_weights(self.compat_ws)
        return MappingState(self.initial.osdmap.apply_incremental(self.inc),
                            self.initial.raw_pg_stats,
                            self.initial.raw_pool_stats,
                            'plan %s final' % self.name)

    def dump(self):
        return json.dumps(self.inc.dump(), indent=4, sort_keys=True)

    def show(self):
        ls = []
        ls.append('# starting osdmap epoch %d' % self.initial.osdmap.get_epoch())
        ls.append('# starting crush version %d' %
                  self.initial.osdmap.get_crush_version())
        ls.append('# mode %s' % self.mode)
        if len(self.compat_ws) and \
           not CRUSHMap.have_default_choose_args(self.initial.crush_dump):
            ls.append('ceph osd crush weight-set create-compat')
        for osd, weight in six.iteritems(self.compat_ws):
            ls.append('ceph osd crush weight-set reweight-compat %s %f' %
                      (osd, weight))
        for osd, weight in six.iteritems(self.osd_weights):
            ls.append('ceph osd reweight osd.%d %f' % (osd, weight))
        incdump = self.inc.dump()
        for pgid in incdump.get('old_pg_upmap_items', []):
            ls.append('ceph osd rm-pg-upmap-items %s' % pgid)
        for item in incdump.get('new_pg_upmap_items', []):
            osdlist = []
            for m in item['mappings']:
                osdlist += [m['from'], m['to']]
            ls.append('ceph osd pg-upmap-items %s %s' %
                      (item['pgid'], ' '.join([str(a) for a in osdlist])))
        return '\n'.join(ls)

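# For illustration, show() renders a shell-pasteable script along these lines
# (hypothetical PG id and OSD numbers):
#
#   # starting osdmap epoch 1234
#   # starting crush version 42
#   # mode upmap
#   ceph osd pg-upmap-items 2.7 3 1
#
# i.e. remap PG 2.7 so that the replica placed on osd.3 moves to osd.1.
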
class Eval:
    def __init__(self, ms):
        self.ms = ms
        self.root_ids = {}        # root name -> id
        self.pool_name = {}       # pool id -> pool name
        self.pool_id = {}         # pool name -> id
        self.pool_roots = {}      # pool name -> root name
        self.root_pools = {}      # root name -> pools
        self.target_by_root = {}  # root name -> target weight map
        self.count_by_pool = {}
        self.count_by_root = {}
        self.actual_by_pool = {}  # pool -> by_* -> actual weight map
        self.actual_by_root = {}  # root -> by_* -> actual weight map
        self.total_by_pool = {}   # pool -> by_* -> total
        self.total_by_root = {}   # root -> by_* -> total
        self.stats_by_pool = {}   # pool -> by_* -> stddev or avg -> value
        self.stats_by_root = {}   # root -> by_* -> stddev or avg -> value

        self.score_by_pool = {}
        self.score_by_root = {}

        self.score = 0.0

    def show(self, verbose=False):
        if verbose:
            r = self.ms.desc + '\n'
            r += 'target_by_root %s\n' % self.target_by_root
            r += 'actual_by_pool %s\n' % self.actual_by_pool
            r += 'actual_by_root %s\n' % self.actual_by_root
            r += 'count_by_pool %s\n' % self.count_by_pool
            r += 'count_by_root %s\n' % self.count_by_root
            r += 'total_by_pool %s\n' % self.total_by_pool
            r += 'total_by_root %s\n' % self.total_by_root
            r += 'stats_by_root %s\n' % self.stats_by_root
            r += 'score_by_pool %s\n' % self.score_by_pool
            r += 'score_by_root %s\n' % self.score_by_root
        else:
            r = self.ms.desc + ' '
        r += 'score %f (lower is better)\n' % self.score
        return r

    def calc_stats(self, count, target, total):
        num = max(len(target), 1)
        r = {}
        for t in ('pgs', 'objects', 'bytes'):
            if total[t] == 0:
                r[t] = {
                    'max': 0,
                    'min': 0,
                    'avg': 0,
                    'stddev': 0,
                    'sum_weight': 0,
                    'score': 0,
                }
                continue

            avg = float(total[t]) / float(num)
            dev = 0.0

            # score is a measure of how uneven the data distribution is.
            # score lies in [0, 1); 0 means perfect distribution.
            score = 0.0
            sum_weight = 0.0

            for k, v in six.iteritems(count[t]):
                # adjust/normalize by weight
                if target[k]:
                    adjusted = float(v) / target[k] / float(num)
                else:
                    adjusted = 0.0

                # Overweighted devices and their weights are factors in the
                # reweight_urgency calculation. One 10%-underfilled device
                # plus five 2%-overfilled devices is arguably a better
                # situation than one 10%-overfilled device plus five
                # 2%-underfilled devices.
                if adjusted > avg:
                    '''
                    F(x) = 2*phi(x) - 1, where phi(x) is the cdf of the
                    standard normal distribution and x = (adjusted - avg)/avg.
                    Since we consider only over-weighted devices, x >= 0, so
                    phi(x) lies in [0.5, 1); the 2*phi(x) - 1 transform brings
                    the range of F(x) to [0, 1).

                    In general, we need a function F(x) of x = (adjusted - avg)/avg
                    which:
                    1. is bounded between 0 and 1, so that reweight_urgency is
                       also bounded;
                    2. gives larger values of x more urgency to reweight;
                    3. changes little between large values of x;
                    4. approaches 1 (highest urgency to reweight) steeply.

                    F(x) = 1 - e^(-x) would also work, but it converges to 1
                    more slowly than the function used here.

                    cdf of standard normal distribution: https://stackoverflow.com/a/29273201
                    '''
                    score += target[k] * (math.erf(((adjusted - avg)/avg) / math.sqrt(2.0)))
                    sum_weight += target[k]
                dev += (avg - adjusted) * (avg - adjusted)
            stddev = math.sqrt(dev / float(max(num - 1, 1)))
            score = score / max(sum_weight, 1)
            r[t] = {
                'max': max(count[t].values()),
                'min': min(count[t].values()),
                'avg': avg,
                'stddev': stddev,
                'sum_weight': sum_weight,
                'score': score,
            }
        return r

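# A quick numeric check of the urgency function above (hypothetical values):
# with avg = 1.0, an OSD at adjusted = 1.5 gives x = 0.5 and
# F(x) = erf(0.5 / sqrt(2)) ~= 0.383, while adjusted = 3.0 gives x = 2.0 and
# F(x) = erf(2.0 / sqrt(2)) ~= 0.954, already close to the asymptote of 1.
# The per-device scores are weighted by target[k] and normalized by
# sum_weight, which keeps the final score in [0, 1).
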
class Module(MgrModule):
    MODULE_OPTIONS = [
        {
            'name': 'active',
            'type': 'bool',
            'default': False,
            'desc': 'automatically balance PGs across cluster',
            'runtime': True,
        },
        {
            'name': 'begin_time',
            'type': 'str',
            'default': '0000',
            'desc': 'beginning time of day to automatically balance',
            'long_desc': 'This is a time of day in the format HHMM.',
            'runtime': True,
        },
        {
            'name': 'end_time',
            'type': 'str',
            'default': '2400',
            'desc': 'ending time of day to automatically balance',
            'long_desc': 'This is a time of day in the format HHMM.',
            'runtime': True,
        },
        {
            'name': 'begin_weekday',
            'type': 'uint',
            'default': 0,
            'min': 0,
            'max': 7,
            'desc': 'Restrict automatic balancing to this day of the week or later',
            'long_desc': '0 or 7 = Sunday, 1 = Monday, etc.',
            'runtime': True,
        },
        {
            'name': 'end_weekday',
            'type': 'uint',
            'default': 7,
            'min': 0,
            'max': 7,
            'desc': 'Restrict automatic balancing to days of the week earlier than this',
            'long_desc': '0 or 7 = Sunday, 1 = Monday, etc.',
            'runtime': True,
        },
        {
            'name': 'crush_compat_max_iterations',
            'type': 'uint',
            'default': 25,
            'min': 1,
            'max': 250,
            'desc': 'maximum number of iterations to attempt optimization',
            'runtime': True,
        },
        {
            'name': 'crush_compat_metrics',
            'type': 'str',
            'default': 'pgs,objects,bytes',
            'desc': 'metrics with which to calculate OSD utilization',
            'long_desc': 'Value is a list of one or more of "pgs", "objects", or "bytes", and indicates which metrics to use to balance utilization.',
            'runtime': True,
        },
        {
            'name': 'crush_compat_step',
            'type': 'float',
            'default': .5,
            'min': .001,
            'max': .999,
            'desc': 'aggressiveness of optimization',
            'long_desc': '.99 is very aggressive, .01 is less aggressive',
            'runtime': True,
        },
        {
            'name': 'min_score',
            'type': 'float',
            'default': 0,
            'desc': 'minimum score, below which no optimization is attempted',
            'runtime': True,
        },
        {
            'name': 'mode',
            'desc': 'Balancer mode',
            'default': 'none',
            'enum_allowed': ['none', 'crush-compat', 'upmap'],
            'runtime': True,
        },
        {
            'name': 'sleep_interval',
            'type': 'secs',
            'default': 60,
            'desc': 'how frequently to wake up and attempt optimization',
            'runtime': True,
        },
        {
            'name': 'upmap_max_optimizations',
            'type': 'uint',
            'default': 10,
            'desc': 'maximum upmap optimizations to make per attempt',
            'runtime': True,
        },
        {
            'name': 'upmap_max_deviation',
            'type': 'int',
            'default': 5,
            'min': 1,
            'desc': 'deviation below which no optimization is attempted',
            'long_desc': 'If the number of PGs is within this count, no optimization is attempted',
            'runtime': True,
        },
        {
            'name': 'pool_ids',
            'type': 'str',
            'default': '',
            'desc': 'pools which automatic balancing will be limited to',
            'runtime': True,
        },
    ]

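    # Module options are set through the usual mgr option machinery, e.g.
    # (illustrative values):
    #
    #   ceph config set mgr mgr/balancer/begin_time 0830
    #   ceph config set mgr mgr/balancer/end_time 1800
    #   ceph config set mgr mgr/balancer/sleep_interval 300
    #
    # All of the options above are marked 'runtime', so changes take effect
    # without restarting the mgr.
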
    COMMANDS = [
        {
            "cmd": "balancer status",
            "desc": "Show balancer status",
            "perm": "r",
        },
        {
            "cmd": "balancer mode name=mode,type=CephChoices,strings=none|crush-compat|upmap",
            "desc": "Set balancer mode",
            "perm": "rw",
        },
        {
            "cmd": "balancer on",
            "desc": "Enable automatic balancing",
            "perm": "rw",
        },
        {
            "cmd": "balancer off",
            "desc": "Disable automatic balancing",
            "perm": "rw",
        },
        {
            "cmd": "balancer pool ls",
            "desc": "List automatic balancing pools. "
                    "Note that an empty list means all existing pools will be "
                    "automatic balancing targets, which is the default "
                    "behaviour of the balancer.",
            "perm": "r",
        },
        {
            "cmd": "balancer pool add name=pools,type=CephString,n=N",
            "desc": "Enable automatic balancing for specific pools",
            "perm": "rw",
        },
        {
            "cmd": "balancer pool rm name=pools,type=CephString,n=N",
            "desc": "Disable automatic balancing for specific pools",
            "perm": "rw",
        },
        {
            "cmd": "balancer eval name=option,type=CephString,req=false",
            "desc": "Evaluate data distribution for the current cluster or a specific pool or plan",
            "perm": "r",
        },
        {
            "cmd": "balancer eval-verbose name=option,type=CephString,req=false",
            "desc": "Evaluate data distribution for the current cluster or a specific pool or plan (verbosely)",
            "perm": "r",
        },
        {
            "cmd": "balancer optimize name=plan,type=CephString name=pools,type=CephString,n=N,req=false",
            "desc": "Run optimizer to create a new plan",
            "perm": "rw",
        },
        {
            "cmd": "balancer show name=plan,type=CephString",
            "desc": "Show details of an optimization plan",
            "perm": "r",
        },
        {
            "cmd": "balancer rm name=plan,type=CephString",
            "desc": "Discard an optimization plan",
            "perm": "rw",
        },
        {
            "cmd": "balancer reset",
            "desc": "Discard all optimization plans",
            "perm": "rw",
        },
        {
            "cmd": "balancer dump name=plan,type=CephString",
            "desc": "Show an optimization plan",
            "perm": "r",
        },
        {
            "cmd": "balancer ls",
            "desc": "List all plans",
            "perm": "r",
        },
        {
            "cmd": "balancer execute name=plan,type=CephString",
            "desc": "Execute an optimization plan",
            "perm": "rw",
        },
    ]
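
    # A typical manual workflow with the commands above (hypothetical plan
    # name):
    #
    #   ceph balancer mode upmap
    #   ceph balancer eval                 # score the current distribution
    #   ceph balancer optimize myplan
    #   ceph balancer show myplan          # review the proposed changes
    #   ceph balancer execute myplan       # apply them
    #
    # or simply "ceph balancer on" to let the module do this automatically.
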
    active = False
    run = True
    plans = {}
    mode = ''
    optimizing = False
    last_optimize_started = ''
    last_optimize_duration = ''
    optimize_result = ''
    success_string = 'Optimization plan created successfully'
    in_progress_string = 'in progress'

    def __init__(self, *args, **kwargs):
        super(Module, self).__init__(*args, **kwargs)
        self.event = Event()

    def handle_command(self, inbuf, command):
        self.log.warning("Handling command: '%s'" % str(command))
        if command['prefix'] == 'balancer status':
            s = {
                'plans': list(self.plans.keys()),
                'active': self.active,
                'last_optimize_started': self.last_optimize_started,
                'last_optimize_duration': self.last_optimize_duration,
                'optimize_result': self.optimize_result,
                'mode': self.get_module_option('mode'),
            }
            return (0, json.dumps(s, indent=4, sort_keys=True), '')
        elif command['prefix'] == 'balancer mode':
            if command['mode'] == 'upmap':
                min_compat_client = self.get_osdmap().dump().get('require_min_compat_client', '')
                if min_compat_client < 'luminous':  # works because release names sort alphabetically
                    warn = 'min_compat_client "%s" ' \
                           '< "luminous", which is required for pg-upmap. ' \
                           'Try "ceph osd set-require-min-compat-client luminous" ' \
                           'before enabling this mode' % min_compat_client
                    return (-errno.EPERM, '', warn)
            elif command['mode'] == 'crush-compat':
                ms = MappingState(self.get_osdmap(),
                                  self.get("pg_stats"),
                                  self.get("pool_stats"),
                                  'initialize compat weight-set')
                self.get_compat_weight_set_weights(ms)  # ignore error
            self.set_module_option('mode', command['mode'])
            return (0, '', '')
        elif command['prefix'] == 'balancer on':
            if not self.active:
                self.set_module_option('active', 'true')
                self.active = True
            self.event.set()
            return (0, '', '')
        elif command['prefix'] == 'balancer off':
            if self.active:
                self.set_module_option('active', 'false')
                self.active = False
            self.event.set()
            return (0, '', '')
        elif command['prefix'] == 'balancer pool ls':
            pool_ids = self.get_module_option('pool_ids')
            if pool_ids == '':
                return (0, '', '')
            pool_ids = pool_ids.split(',')
            pool_ids = [int(p) for p in pool_ids]
            pool_name_by_id = dict((p['pool'], p['pool_name']) for p in self.get_osdmap().dump().get('pools', []))
            should_prune = False
            final_ids = []
            final_names = []
            for p in pool_ids:
                if p in pool_name_by_id:
                    final_ids.append(p)
                    final_names.append(pool_name_by_id[p])
                else:
                    should_prune = True
            if should_prune:  # some pools were gone, prune
                self.set_module_option('pool_ids', ','.join(str(p) for p in final_ids))
            return (0, json.dumps(sorted(final_names), indent=4, sort_keys=True), '')
        elif command['prefix'] == 'balancer pool add':
            raw_names = command['pools']
            pool_id_by_name = dict((p['pool_name'], p['pool']) for p in self.get_osdmap().dump().get('pools', []))
            invalid_names = [p for p in raw_names if p not in pool_id_by_name]
            if invalid_names:
                return (-errno.EINVAL, '', 'pool(s) %s not found' % invalid_names)
            to_add = [str(pool_id_by_name[p]) for p in raw_names if p in pool_id_by_name]
            existing = self.get_module_option('pool_ids')
            final = to_add
            if existing != '':
                existing = existing.split(',')
                final = set(to_add) | set(existing)
            self.set_module_option('pool_ids', ','.join(final))
            return (0, '', '')
        elif command['prefix'] == 'balancer pool rm':
            raw_names = command['pools']
            existing = self.get_module_option('pool_ids')
            if existing == '':  # for idempotence
                return (0, '', '')
            existing = existing.split(',')
            osdmap = self.get_osdmap()
            pool_ids = [str(p['pool']) for p in osdmap.dump().get('pools', [])]
            pool_id_by_name = dict((p['pool_name'], p['pool']) for p in osdmap.dump().get('pools', []))
            final = [p for p in existing if p in pool_ids]
            to_delete = [str(pool_id_by_name[p]) for p in raw_names if p in pool_id_by_name]
            final = set(final) - set(to_delete)
            self.set_module_option('pool_ids', ','.join(final))
            return (0, '', '')
        elif command['prefix'] == 'balancer eval' or command['prefix'] == 'balancer eval-verbose':
            verbose = command['prefix'] == 'balancer eval-verbose'
            pools = []
            if 'option' in command:
                plan = self.plans.get(command['option'])
                if not plan:
                    # not a plan; does it look like a pool?
                    osdmap = self.get_osdmap()
                    valid_pool_names = [p['pool_name'] for p in osdmap.dump().get('pools', [])]
                    option = command['option']
                    if option not in valid_pool_names:
                        return (-errno.EINVAL, '', 'option "%s" not a plan or a pool' % option)
                    pools.append(option)
                    ms = MappingState(osdmap, self.get("pg_stats"), self.get("pool_stats"), 'pool "%s"' % option)
                else:
                    pools = plan.pools
                    if plan.mode == 'upmap':
                        # For efficiency, upmap uses the basic Plan class
                        # without the (redundant) MappingState member, so ms
                        # may be slightly inaccurate here: we pair an old
                        # snapshotted osdmap with a fresh copy of pg_stats.
                        # That should not be a big deal.
                        ms = MappingState(plan.osdmap,
                                          self.get("pg_stats"),
                                          self.get("pool_stats"),
                                          'plan "%s"' % plan.name)
                    else:
                        ms = plan.final_state()
            else:
                ms = MappingState(self.get_osdmap(),
                                  self.get("pg_stats"),
                                  self.get("pool_stats"),
                                  'current cluster')
            return (0, self.evaluate(ms, pools, verbose=verbose), '')
        elif command['prefix'] == 'balancer optimize':
            # The GIL can be released by the active balancer, so disallow when active
            if self.active:
                return (-errno.EINVAL, '', 'Balancer enabled, disable to optimize manually')
            if self.optimizing:
                return (-errno.EINVAL, '', 'Balancer finishing up... try again')
            pools = []
            if 'pools' in command:
                pools = command['pools']
            osdmap = self.get_osdmap()
            valid_pool_names = [p['pool_name'] for p in osdmap.dump().get('pools', [])]
            invalid_pool_names = []
            for p in pools:
                if p not in valid_pool_names:
                    invalid_pool_names.append(p)
            if len(invalid_pool_names):
                return (-errno.EINVAL, '', 'pools %s not found' % invalid_pool_names)
            plan = self.plan_create(command['plan'], osdmap, pools)
            self.last_optimize_started = time.asctime(time.localtime())
            self.optimize_result = self.in_progress_string
            start = time.time()
            r, detail = self.optimize(plan)
            end = time.time()
            self.last_optimize_duration = str(datetime.timedelta(seconds=(end - start)))
            if r == 0:
                # Add the plan only if an optimization was actually created
                self.optimize_result = self.success_string
                self.plans[command['plan']] = plan
            else:
                self.optimize_result = detail
            return (r, '', detail)
        elif command['prefix'] == 'balancer rm':
            self.plan_rm(command['plan'])
            return (0, '', '')
        elif command['prefix'] == 'balancer reset':
            self.plans = {}
            return (0, '', '')
        elif command['prefix'] == 'balancer ls':
            return (0, json.dumps([p for p in self.plans], indent=4, sort_keys=True), '')
        elif command['prefix'] == 'balancer dump':
            plan = self.plans.get(command['plan'])
            if not plan:
                return (-errno.ENOENT, '', 'plan %s not found' % command['plan'])
            return (0, plan.dump(), '')
        elif command['prefix'] == 'balancer show':
            plan = self.plans.get(command['plan'])
            if not plan:
                return (-errno.ENOENT, '', 'plan %s not found' % command['plan'])
            return (0, plan.show(), '')
        elif command['prefix'] == 'balancer execute':
            # The GIL can be released by the active balancer, so disallow when active
            if self.active:
                return (-errno.EINVAL, '', 'Balancer enabled, disable to execute a plan')
            if self.optimizing:
                return (-errno.EINVAL, '', 'Balancer finishing up... try again')
            plan = self.plans.get(command['plan'])
            if not plan:
                return (-errno.ENOENT, '', 'plan %s not found' % command['plan'])
            r, detail = self.execute(plan)
            self.plan_rm(command['plan'])
            return (r, '', detail)
        else:
            return (-errno.EINVAL, '',
                    "Command not found '{0}'".format(command['prefix']))

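    # Note on state: pool restrictions are persisted in the 'pool_ids' module
    # option as a comma-separated string of pool ids (e.g. "1,4,7" -- values
    # here are hypothetical), which is why the pool add/rm/ls handlers above
    # split, prune, and re-join that string rather than storing a list.
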
    def shutdown(self):
        self.log.info('Stopping')
        self.run = False
        self.event.set()

    def time_permit(self):
        local_time = time.localtime()
        time_of_day = time.strftime('%H%M', local_time)
        weekday = (local_time.tm_wday + 1) % 7  # be compatible with C
        permit = False

        begin_time = self.get_module_option('begin_time')
        end_time = self.get_module_option('end_time')
        if begin_time <= end_time:
            permit = begin_time <= time_of_day < end_time
        else:
            # the window wraps around midnight
            permit = time_of_day >= begin_time or time_of_day < end_time
        if not permit:
            self.log.debug("should run between %s - %s, now %s, skipping",
                           begin_time, end_time, time_of_day)
            return False

        begin_weekday = self.get_module_option('begin_weekday')
        end_weekday = self.get_module_option('end_weekday')
        if begin_weekday <= end_weekday:
            permit = begin_weekday <= weekday < end_weekday
        else:
            # the window wraps around the end of the week
            permit = weekday >= begin_weekday or weekday < end_weekday
        if not permit:
            self.log.debug("should run between weekday %d - %d, now %d, skipping",
                           begin_weekday, end_weekday, weekday)
            return False

        return True

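    # Example (hypothetical settings): begin_time='2300' with end_time='0600'
    # takes the wrap-around branch above and permits runs from 23:00 through
    # 05:59; begin_weekday=1 with end_weekday=6 permits Monday through Friday
    # (weekday 6, Saturday, is excluded by the half-open comparison).
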
    def serve(self):
        self.log.info('Starting')
        while self.run:
            self.active = self.get_module_option('active')
            sleep_interval = self.get_module_option('sleep_interval')
            self.log.debug('Waking up [%s, now %s]',
                           "active" if self.active else "inactive",
                           time.strftime(TIME_FORMAT, time.localtime()))
            if self.active and self.time_permit():
                self.log.debug('Running')
                name = 'auto_%s' % time.strftime(TIME_FORMAT, time.gmtime())
                osdmap = self.get_osdmap()
                allow = self.get_module_option('pool_ids')
                final = []
                if allow != '':
                    allow = allow.split(',')
                    valid = [str(p['pool']) for p in osdmap.dump().get('pools', [])]
                    final = set(allow) & set(valid)
                    if set(allow) - set(valid):  # some pools were gone, prune
                        self.set_module_option('pool_ids', ','.join(final))
                    pool_name_by_id = dict((p['pool'], p['pool_name']) for p in osdmap.dump().get('pools', []))
                    final = [int(p) for p in final]
                    final = [pool_name_by_id[p] for p in final if p in pool_name_by_id]
                plan = self.plan_create(name, osdmap, final)
                self.optimizing = True
                self.last_optimize_started = time.asctime(time.localtime())
                self.optimize_result = self.in_progress_string
                start = time.time()
                r, detail = self.optimize(plan)
                end = time.time()
                self.last_optimize_duration = str(datetime.timedelta(seconds=(end - start)))
                if r == 0:
                    self.optimize_result = self.success_string
                    self.execute(plan)
                else:
                    self.optimize_result = detail
                self.optimizing = False
            self.log.debug('Sleeping for %d', sleep_interval)
            self.event.wait(sleep_interval)
            self.event.clear()

    def plan_create(self, name, osdmap, pools):
        mode = self.get_module_option('mode')
        if mode == 'upmap':
            # Drop the unnecessary MappingState member for upmap mode;
            # this effectively eliminates the use of a complete pg_stats,
            # which can become horribly inefficient as pg_num grows.
            plan = Plan(name, mode, osdmap, pools)
        else:
            plan = MsPlan(name,
                          mode,
                          MappingState(osdmap,
                                       self.get("pg_stats"),
                                       self.get("pool_stats"),
                                       'plan %s initial' % name),
                          pools)
        return plan

    def plan_rm(self, name):
        if name in self.plans:
            del self.plans[name]

    def calc_eval(self, ms, pools):
        pe = Eval(ms)
        pool_rule = {}
        pool_info = {}
        for p in ms.osdmap_dump.get('pools', []):
            if len(pools) and p['pool_name'] not in pools:
                continue
            # skip dead or not-yet-ready pools too
            if p['pool'] not in ms.poolids:
                continue
            pe.pool_name[p['pool']] = p['pool_name']
            pe.pool_id[p['pool_name']] = p['pool']
            pool_rule[p['pool_name']] = p['crush_rule']
            pe.pool_roots[p['pool_name']] = []
            pool_info[p['pool_name']] = p
        if len(pool_info) == 0:
            return pe
        self.log.debug('pool_name %s' % pe.pool_name)
        self.log.debug('pool_id %s' % pe.pool_id)
        self.log.debug('pools %s' % pools)
        self.log.debug('pool_rule %s' % pool_rule)

        osd_weight = { a['osd']: a['weight']
                       for a in ms.osdmap_dump.get('osds', []) if a['weight'] > 0 }

        # get expected distributions by root
        actual_by_root = {}
        rootids = ms.crush.find_takes()
        roots = []
        for rootid in rootids:
            ls = ms.osdmap.get_pools_by_take(rootid)
            want = []
            # find the roots associated with the pools we were passed
            for candidate in ls:
                if candidate in pe.pool_name:
                    want.append(candidate)
            if len(want) == 0:
                continue
            root = ms.crush.get_item_name(rootid)
            pe.root_pools[root] = []
            for poolid in want:
                pe.pool_roots[pe.pool_name[poolid]].append(root)
                pe.root_pools[root].append(pe.pool_name[poolid])
            pe.root_ids[root] = rootid
            roots.append(root)
            weight_map = ms.crush.get_take_weight_osd_map(rootid)
            adjusted_map = {
                osd: cw * osd_weight[osd]
                for osd, cw in six.iteritems(weight_map) if osd in osd_weight and cw > 0
            }
            sum_w = sum(adjusted_map.values())
            assert len(adjusted_map) == 0 or sum_w > 0
            pe.target_by_root[root] = { osd: w / sum_w
                                        for osd, w in six.iteritems(adjusted_map) }
            actual_by_root[root] = {
                'pgs': {},
                'objects': {},
                'bytes': {},
            }
            for osd in pe.target_by_root[root]:
                actual_by_root[root]['pgs'][osd] = 0
                actual_by_root[root]['objects'][osd] = 0
                actual_by_root[root]['bytes'][osd] = 0
            pe.total_by_root[root] = {
                'pgs': 0,
                'objects': 0,
                'bytes': 0,
            }
        self.log.debug('pool_roots %s' % pe.pool_roots)
        self.log.debug('root_pools %s' % pe.root_pools)
        self.log.debug('target_by_root %s' % pe.target_by_root)

        # pool and root actual
        for pool, pi in six.iteritems(pool_info):
            poolid = pi['pool']
            pm = ms.pg_up_by_poolid[poolid]
            pgs = 0
            objects = 0
            bytes = 0
            pgs_by_osd = {}
            objects_by_osd = {}
            bytes_by_osd = {}
            for pgid, up in six.iteritems(pm):
                for osd in [int(osd) for osd in up]:
                    if osd == CRUSHMap.ITEM_NONE:
                        continue
                    if osd not in pgs_by_osd:
                        pgs_by_osd[osd] = 0
                        objects_by_osd[osd] = 0
                        bytes_by_osd[osd] = 0
                    pgs_by_osd[osd] += 1
                    objects_by_osd[osd] += ms.pg_stat[pgid]['num_objects']
                    bytes_by_osd[osd] += ms.pg_stat[pgid]['num_bytes']
                    # pick a root to associate this pg instance with.
                    # note that this is imprecise if the roots have
                    # overlapping children.
                    # FIXME: divide bytes by k for EC pools.
                    for root in pe.pool_roots[pool]:
                        if osd in pe.target_by_root[root]:
                            actual_by_root[root]['pgs'][osd] += 1
                            actual_by_root[root]['objects'][osd] += ms.pg_stat[pgid]['num_objects']
                            actual_by_root[root]['bytes'][osd] += ms.pg_stat[pgid]['num_bytes']
                            pgs += 1
                            objects += ms.pg_stat[pgid]['num_objects']
                            bytes += ms.pg_stat[pgid]['num_bytes']
                            pe.total_by_root[root]['pgs'] += 1
                            pe.total_by_root[root]['objects'] += ms.pg_stat[pgid]['num_objects']
                            pe.total_by_root[root]['bytes'] += ms.pg_stat[pgid]['num_bytes']
                            break
            pe.count_by_pool[pool] = {
                'pgs': {
                    k: v
                    for k, v in six.iteritems(pgs_by_osd)
                },
                'objects': {
                    k: v
                    for k, v in six.iteritems(objects_by_osd)
                },
                'bytes': {
                    k: v
                    for k, v in six.iteritems(bytes_by_osd)
                },
            }
            pe.actual_by_pool[pool] = {
                'pgs': {
                    k: float(v) / float(max(pgs, 1))
                    for k, v in six.iteritems(pgs_by_osd)
                },
                'objects': {
                    k: float(v) / float(max(objects, 1))
                    for k, v in six.iteritems(objects_by_osd)
                },
                'bytes': {
                    k: float(v) / float(max(bytes, 1))
                    for k, v in six.iteritems(bytes_by_osd)
                },
            }
            pe.total_by_pool[pool] = {
                'pgs': pgs,
                'objects': objects,
                'bytes': bytes,
            }
        for root in pe.total_by_root:
            pe.count_by_root[root] = {
                'pgs': {
                    k: float(v)
                    for k, v in six.iteritems(actual_by_root[root]['pgs'])
                },
                'objects': {
                    k: float(v)
                    for k, v in six.iteritems(actual_by_root[root]['objects'])
                },
                'bytes': {
                    k: float(v)
                    for k, v in six.iteritems(actual_by_root[root]['bytes'])
                },
            }
            pe.actual_by_root[root] = {
                'pgs': {
                    k: float(v) / float(max(pe.total_by_root[root]['pgs'], 1))
                    for k, v in six.iteritems(actual_by_root[root]['pgs'])
                },
                'objects': {
                    k: float(v) / float(max(pe.total_by_root[root]['objects'], 1))
                    for k, v in six.iteritems(actual_by_root[root]['objects'])
                },
                'bytes': {
                    k: float(v) / float(max(pe.total_by_root[root]['bytes'], 1))
                    for k, v in six.iteritems(actual_by_root[root]['bytes'])
                },
            }
        self.log.debug('actual_by_pool %s' % pe.actual_by_pool)
        self.log.debug('actual_by_root %s' % pe.actual_by_root)

        # average and stddev and score
        pe.stats_by_root = {
            a: pe.calc_stats(
                b,
                pe.target_by_root[a],
                pe.total_by_root[a]
            ) for a, b in six.iteritems(pe.count_by_root)
        }
        self.log.debug('stats_by_root %s' % pe.stats_by_root)

        # the scores are already normalized
        pe.score_by_root = {
            r: {
                'pgs': pe.stats_by_root[r]['pgs']['score'],
                'objects': pe.stats_by_root[r]['objects']['score'],
                'bytes': pe.stats_by_root[r]['bytes']['score'],
            } for r in pe.total_by_root.keys()
        }
        self.log.debug('score_by_root %s' % pe.score_by_root)

        # get the list of score metrics, comma separated
        metrics = self.get_module_option('crush_compat_metrics').split(',')

        # total score is just the average of the normalized per-metric scores
        pe.score = 0.0
        for r, vs in six.iteritems(pe.score_by_root):
            for k, v in six.iteritems(vs):
                if k in metrics:
                    pe.score += v
        pe.score /= len(metrics) * len(roots)
        return pe

    def evaluate(self, ms, pools, verbose=False):
        pe = self.calc_eval(ms, pools)
        return pe.show(verbose=verbose)

    def optimize(self, plan):
        self.log.info('Optimize plan %s' % plan.name)
        max_misplaced = self.get_ceph_option('target_max_misplaced_ratio')
        self.log.info('Mode %s, max misplaced %f' %
                      (plan.mode, max_misplaced))

        info = self.get('pg_status')
        unknown = info.get('unknown_pgs_ratio', 0.0)
        degraded = info.get('degraded_ratio', 0.0)
        inactive = info.get('inactive_pgs_ratio', 0.0)
        misplaced = info.get('misplaced_ratio', 0.0)
        plan.pg_status = info
        self.log.debug('unknown %f degraded %f inactive %f misplaced %g',
                       unknown, degraded, inactive, misplaced)
        if unknown > 0.0:
            detail = 'Some PGs (%f) are unknown; try again later' % unknown
            self.log.info(detail)
            return -errno.EAGAIN, detail
        elif degraded > 0.0:
            detail = 'Some objects (%f) are degraded; try again later' % degraded
            self.log.info(detail)
            return -errno.EAGAIN, detail
        elif inactive > 0.0:
            detail = 'Some PGs (%f) are inactive; try again later' % inactive
            self.log.info(detail)
            return -errno.EAGAIN, detail
        elif misplaced >= max_misplaced:
            detail = 'Too many objects (%f > %f) are misplaced; ' \
                     'try again later' % (misplaced, max_misplaced)
            self.log.info(detail)
            return -errno.EAGAIN, detail
        else:
            if plan.mode == 'upmap':
                return self.do_upmap(plan)
            elif plan.mode == 'crush-compat':
                return self.do_crush_compat(plan)
            elif plan.mode == 'none':
                detail = 'Please run "ceph balancer mode" to choose a valid mode first'
                self.log.info('Idle')
                return -errno.ENOEXEC, detail
            else:
                detail = 'Unrecognized mode %s' % plan.mode
                self.log.info(detail)
                return -errno.EINVAL, detail

    def do_upmap(self, plan):
        self.log.info('do_upmap')
        max_optimizations = self.get_module_option('upmap_max_optimizations')
        max_deviation = self.get_module_option('upmap_max_deviation')
        osdmap_dump = plan.osdmap_dump

        if len(plan.pools):
            pools = plan.pools
        else:  # all
            pools = [str(i['pool_name']) for i in osdmap_dump.get('pools', [])]
        if len(pools) == 0:
            detail = 'No pools available'
            self.log.info(detail)
            return -errno.ENOENT, detail
        # shuffle the pool list so all pools get equal (in)attention
        random.shuffle(pools)
        self.log.info('pools %s' % pools)

        adjusted_pools = []
        inc = plan.inc
        total_did = 0
        left = max_optimizations
        pools_with_pg_merge = [p['pool_name'] for p in osdmap_dump.get('pools', [])
                               if p['pg_num'] > p['pg_num_target']]
        crush_rule_by_pool_name = dict((p['pool_name'], p['crush_rule']) for p in osdmap_dump.get('pools', []))
        for pool in pools:
            if pool not in crush_rule_by_pool_name:
                self.log.info('pool %s does not exist' % pool)
                continue
            if pool in pools_with_pg_merge:
                self.log.info('pool %s has pending PG(s) for merging, skipping for now' % pool)
                continue
            adjusted_pools.append(pool)
        # shuffle so all pools get equal (in)attention
        random.shuffle(adjusted_pools)
        pool_dump = osdmap_dump.get('pools', [])
        for pool in adjusted_pools:
            num_pg = 0
            for p in pool_dump:
                if p['pool_name'] == pool:
                    num_pg = p['pg_num']
                    pool_id = p['pool']
                    break

            # note that we deliberately exclude scrubbing PGs here as well,
            # since scrubbing activity has a significant impact on performance
            num_pg_active_clean = 0
            for p in plan.pg_status.get('pgs_by_pool_state', []):
                pgs_pool_id = p['pool_id']
                if pgs_pool_id != pool_id:
                    continue
                for s in p['pg_state_counts']:
                    if s['state_name'] == 'active+clean':
                        num_pg_active_clean += s['count']
                        break
            # only spend budget on PGs that are not already in flux
            available = left - (num_pg - num_pg_active_clean)
            did = plan.osdmap.calc_pg_upmaps(inc, max_deviation, available, [pool])
            total_did += did
            left -= did
            if left <= 0:
                break
        self.log.info('prepared %d/%d changes' % (total_did, max_optimizations))
        if total_did == 0:
            return -errno.EALREADY, 'Unable to find further optimization, ' \
                                    'or pool(s) pg_num is decreasing, ' \
                                    'or distribution is already perfect'
        return 0, ''
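
    # Budget arithmetic for do_upmap() (hypothetical numbers): with
    # upmap_max_optimizations = 10, left starts at 10. For a pool with
    # pg_num = 64 of which 60 are active+clean, 4 PGs are still in flux,
    # so calc_pg_upmaps() is offered at most 10 - 4 = 6 new optimizations
    # for that pool; each one prepared decrements the remaining budget.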

    def do_crush_compat(self, plan):
        self.log.info('do_crush_compat')
        max_iterations = self.get_module_option('crush_compat_max_iterations')
        if max_iterations < 1:
            return -errno.EINVAL, '"crush_compat_max_iterations" must be >= 1'
        step = self.get_module_option('crush_compat_step')
        if step <= 0 or step >= 1.0:
            return -errno.EINVAL, '"crush_compat_step" must be in (0, 1)'
        max_misplaced = self.get_ceph_option('target_max_misplaced_ratio')
        min_pg_per_osd = 2

        ms = plan.initial
        osdmap = ms.osdmap
        crush = osdmap.get_crush()
        pe = self.calc_eval(ms, plan.pools)
        min_score_to_optimize = self.get_module_option('min_score')
        if pe.score <= min_score_to_optimize:
            if pe.score == 0:
                detail = 'Distribution is already perfect'
            else:
                detail = 'score %f <= min_score %f, will not optimize' \
                         % (pe.score, min_score_to_optimize)
            self.log.info(detail)
            return -errno.EALREADY, detail

        # get current osd reweights
        orig_osd_weight = { a['osd']: a['weight']
                            for a in ms.osdmap_dump.get('osds', []) }
        reweighted_osds = [ a for a, b in six.iteritems(orig_osd_weight)
                            if b < 1.0 and b > 0.0 ]

        # get current compat weight-set weights
        orig_ws = self.get_compat_weight_set_weights(ms)
        if not orig_ws:
            return -errno.EAGAIN, 'compat weight-set not available'
        orig_ws = { a: b for a, b in six.iteritems(orig_ws) if a >= 0 }

        # Make sure roots don't overlap their devices. If so, we
        # can't proceed.
        roots = list(pe.target_by_root.keys())
        self.log.debug('roots %s', roots)
        visited = {}
        overlap = {}
        root_ids = {}
        for root, wm in six.iteritems(pe.target_by_root):
            for osd in wm:
                if osd in visited:
                    if osd not in overlap:
                        overlap[osd] = [ visited[osd] ]
                    overlap[osd].append(root)
                visited[osd] = root
        if len(overlap) > 0:
            detail = 'Some osds belong to multiple subtrees: %s' % \
                     overlap
            self.log.error(detail)
            return -errno.EOPNOTSUPP, detail

        # rebalance by pgs, objects, or bytes
        metrics = self.get_module_option('crush_compat_metrics').split(',')
        key = metrics[0]  # balance using the first score metric
        if key not in ['pgs', 'bytes', 'objects']:
            self.log.warning("Invalid crush_compat balancing key %s. Using 'pgs'." % key)
            key = 'pgs'

        # go
        best_ws = copy.deepcopy(orig_ws)
        best_ow = copy.deepcopy(orig_osd_weight)
        best_pe = pe
        left = max_iterations
        bad_steps = 0
        next_ws = copy.deepcopy(best_ws)
        next_ow = copy.deepcopy(best_ow)
        while left > 0:
            # adjust
            self.log.debug('best_ws %s' % best_ws)
            random.shuffle(roots)
            for root in roots:
                pools = best_pe.root_pools[root]
                osds = len(best_pe.target_by_root[root])
                min_pgs = osds * min_pg_per_osd
                if best_pe.total_by_root[root][key] < min_pgs:
                    self.log.info('Skipping root %s (pools %s), total pgs %d '
                                  '< minimum %d (%d per osd)',
                                  root, pools,
                                  best_pe.total_by_root[root][key],
                                  min_pgs, min_pg_per_osd)
                    continue
                self.log.info('Balancing root %s (pools %s) by %s' %
                              (root, pools, key))
                target = best_pe.target_by_root[root]
                actual = best_pe.actual_by_root[root][key]
                queue = sorted(actual.keys(),
                               key=lambda osd: -abs(target[osd] - actual[osd]))
                for osd in queue:
                    if orig_osd_weight[osd] == 0:
                        self.log.debug('skipping out osd.%d', osd)
                    else:
                        deviation = target[osd] - actual[osd]
                        if deviation == 0:
                            break
                        self.log.debug('osd.%d deviation %f', osd, deviation)
                        weight = best_ws[osd]
                        ow = orig_osd_weight[osd]
                        if actual[osd] > 0:
                            calc_weight = target[osd] / actual[osd] * weight * ow
                        else:
                            # For newly created osds, reset calc_weight to the
                            # target value. This way the weight-set absorbs
                            # *step* of its target (final) value right away and
                            # slowly catches up later. Note that if this turns
                            # out to cause too many misplaced pgs, we'll reduce
                            # step and retry.
                            calc_weight = target[osd]
                        new_weight = weight * (1.0 - step) + calc_weight * step
                        self.log.debug('Reweight osd.%d %f -> %f', osd, weight,
                                       new_weight)
                        next_ws[osd] = new_weight
                        if ow < 1.0:
                            new_ow = min(1.0, max(step + (1.0 - step) * ow,
                                                  ow + .005))
                            self.log.debug('Reweight osd.%d reweight %f -> %f',
                                           osd, ow, new_ow)
                            next_ow[osd] = new_ow

                # normalize weights under this root
                root_weight = crush.get_item_weight(pe.root_ids[root])
                root_sum = sum(b for a, b in six.iteritems(next_ws)
                               if a in target.keys())
                if root_sum > 0 and root_weight > 0:
                    factor = root_sum / root_weight
                    self.log.debug('normalizing root %s %d, weight %f, '
                                   'ws sum %f, factor %f',
                                   root, pe.root_ids[root], root_weight,
                                   root_sum, factor)
                    for osd in actual.keys():
                        next_ws[osd] = next_ws[osd] / factor

            # recalc
            plan.compat_ws = copy.deepcopy(next_ws)
            next_ms = plan.final_state()
            next_pe = self.calc_eval(next_ms, plan.pools)
            next_misplaced = next_ms.calc_misplaced_from(ms)
            self.log.debug('Step result score %f -> %f, misplacing %f',
                           best_pe.score, next_pe.score, next_misplaced)

            if next_misplaced > max_misplaced:
                if best_pe.score < pe.score:
                    self.log.debug('Step misplaced %f > max %f, stopping',
                                   next_misplaced, max_misplaced)
                    break
                step /= 2.0
                next_ws = copy.deepcopy(best_ws)
                next_ow = copy.deepcopy(best_ow)
                self.log.debug('Step misplaced %f > max %f, reducing step to %f',
                               next_misplaced, max_misplaced, step)
            else:
                if next_pe.score > best_pe.score * 1.0001:
                    bad_steps += 1
                    if bad_steps < 5 and random.randint(0, 100) < 70:
                        self.log.debug('Score got worse, taking another step')
                    else:
                        step /= 2.0
                        next_ws = copy.deepcopy(best_ws)
                        next_ow = copy.deepcopy(best_ow)
                        self.log.debug('Score got worse, trying smaller step %f',
                                       step)
                else:
                    bad_steps = 0
                    best_pe = next_pe
                    best_ws = copy.deepcopy(next_ws)
                    best_ow = copy.deepcopy(next_ow)
                    if best_pe.score == 0:
                        break
            left -= 1

        # allow a small regression if we are phasing out osd weights
        fudge = 0
        if best_ow != orig_osd_weight:
            fudge = .001

        if best_pe.score < pe.score + fudge:
            self.log.info('Success, score %f -> %f', pe.score, best_pe.score)
            plan.compat_ws = best_ws
            for osd, w in six.iteritems(best_ow):
                if w != orig_osd_weight[osd]:
                    self.log.debug('osd.%d reweight %f', osd, w)
                    plan.osd_weights[osd] = w
            return 0, ''
        else:
            self.log.info('Failed to find further optimization, score %f',
                          pe.score)
            plan.compat_ws = {}
            return -errno.EDOM, 'Unable to find further optimization, ' \
                                'change balancer mode and retry might help'

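    # The weight-set update above is an exponential approach toward the
    # computed target: new_weight = weight * (1 - step) + calc_weight * step.
    # With step = 0.5 (the default) and hypothetical values weight = 1.0,
    # calc_weight = 2.0, one iteration yields 1.5, the next (if the target
    # held) 1.75, and so on, halving the remaining gap each time; halving
    # step on bad or too-disruptive iterations slows this approach instead
    # of abandoning it.
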
    def get_compat_weight_set_weights(self, ms):
        if not CRUSHMap.have_default_choose_args(ms.crush_dump):
            # enable compat weight-set first
            self.log.debug('ceph osd crush weight-set create-compat')
            result = CommandResult('')
            self.send_command(result, 'mon', '', json.dumps({
                'prefix': 'osd crush weight-set create-compat',
                'format': 'json',
            }), '')
            r, outb, outs = result.wait()
            if r != 0:
                self.log.error('Error creating compat weight-set')
                return

            result = CommandResult('')
            self.send_command(result, 'mon', '', json.dumps({
                'prefix': 'osd crush dump',
                'format': 'json',
            }), '')
            r, outb, outs = result.wait()
            if r != 0:
                self.log.error('Error dumping crush map')
                return
            try:
                crushmap = json.loads(outb)
            except:
                raise RuntimeError('unable to parse crush map')
        else:
            crushmap = ms.crush_dump

        raw = CRUSHMap.get_default_choose_args(crushmap)
        weight_set = {}
        for b in raw:
            bucket = None
            for t in crushmap['buckets']:
                if t['id'] == b['bucket_id']:
                    bucket = t
                    break
            if not bucket:
                raise RuntimeError('could not find bucket %s' % b['bucket_id'])
            self.log.debug('bucket items %s' % bucket['items'])
            self.log.debug('weight set %s' % b['weight_set'][0])
            if len(bucket['items']) != len(b['weight_set'][0]):
                raise RuntimeError('weight-set size does not match bucket items')
            for pos in range(len(bucket['items'])):
                weight_set[bucket['items'][pos]['id']] = b['weight_set'][0][pos]

        self.log.debug('weight_set weights %s' % weight_set)
        return weight_set

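    # The mapping returned above is item id -> compat weight-set weight,
    # e.g. (hypothetical) {0: 1.0, 1: 0.9, 2: 1.1, -1: 3.0}; negative ids
    # are buckets rather than OSDs, which is why do_crush_compat() filters
    # the result to ids >= 0 before using it.
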
    def do_crush(self):
        self.log.info('do_crush (not yet implemented)')

    def do_osd_weight(self):
        self.log.info('do_osd_weight (not yet implemented)')

    def execute(self, plan):
        self.log.info('Executing plan %s' % plan.name)

        commands = []

        # compat weight-set
        if len(plan.compat_ws) and \
           not CRUSHMap.have_default_choose_args(plan.initial.crush_dump):
            self.log.debug('ceph osd crush weight-set create-compat')
            result = CommandResult('')
            self.send_command(result, 'mon', '', json.dumps({
                'prefix': 'osd crush weight-set create-compat',
                'format': 'json',
            }), '')
            r, outb, outs = result.wait()
            if r != 0:
                self.log.error('Error creating compat weight-set')
                return r, outs

        for osd, weight in six.iteritems(plan.compat_ws):
            self.log.info('ceph osd crush weight-set reweight-compat osd.%d %f',
                          osd, weight)
            result = CommandResult('')
            self.send_command(result, 'mon', '', json.dumps({
                'prefix': 'osd crush weight-set reweight-compat',
                'format': 'json',
                'item': 'osd.%d' % osd,
                'weight': [weight],
            }), '')
            commands.append(result)

        # new_weight
        reweightn = {}
        for osd, weight in six.iteritems(plan.osd_weights):
            reweightn[str(osd)] = str(int(weight * float(0x10000)))
        if len(reweightn):
            self.log.info('ceph osd reweightn %s', reweightn)
            result = CommandResult('')
            self.send_command(result, 'mon', '', json.dumps({
                'prefix': 'osd reweightn',
                'format': 'json',
                'weights': json.dumps(reweightn),
            }), '')
            commands.append(result)

        # upmap
        incdump = plan.inc.dump()
        for item in incdump.get('new_pg_upmap', []):
            self.log.info('ceph osd pg-upmap %s mappings %s', item['pgid'],
                          item['osds'])
            result = CommandResult('foo')
            self.send_command(result, 'mon', '', json.dumps({
                'prefix': 'osd pg-upmap',
                'format': 'json',
                'pgid': item['pgid'],
                'id': item['osds'],
            }), 'foo')
            commands.append(result)

        for pgid in incdump.get('old_pg_upmap', []):
            self.log.info('ceph osd rm-pg-upmap %s', pgid)
            result = CommandResult('foo')
            self.send_command(result, 'mon', '', json.dumps({
                'prefix': 'osd rm-pg-upmap',
                'format': 'json',
                'pgid': pgid,
            }), 'foo')
            commands.append(result)

        for item in incdump.get('new_pg_upmap_items', []):
            self.log.info('ceph osd pg-upmap-items %s mappings %s', item['pgid'],
                          item['mappings'])
            osdlist = []
            for m in item['mappings']:
                osdlist += [m['from'], m['to']]
            result = CommandResult('foo')
            self.send_command(result, 'mon', '', json.dumps({
                'prefix': 'osd pg-upmap-items',
                'format': 'json',
                'pgid': item['pgid'],
                'id': osdlist,
            }), 'foo')
            commands.append(result)

        for pgid in incdump.get('old_pg_upmap_items', []):
            self.log.info('ceph osd rm-pg-upmap-items %s', pgid)
            result = CommandResult('foo')
            self.send_command(result, 'mon', '', json.dumps({
                'prefix': 'osd rm-pg-upmap-items',
                'format': 'json',
                'pgid': pgid,
            }), 'foo')
            commands.append(result)

        # wait for all the commands to complete
        self.log.debug('commands %s' % commands)
        for result in commands:
            r, outb, outs = result.wait()
            if r != 0:
                self.log.error('execute error: r = %d, detail = %s' % (r, outs))
                return r, outs
        self.log.debug('done')
        return 0, ''

    def gather_telemetry(self):
        return {
            'active': self.active,
            'mode': self.mode,
        }