]>
Commit | Line | Data |
---|---|---|
3efd9988 FG |
1 | |
2 | """ | |
3 | Balance PG distribution across OSDs. | |
4 | """ | |
5 | ||
6 | import copy | |
7 | import errno | |
8 | import json | |
9 | import math | |
10 | import random | |
1adf2230 | 11 | import six |
3efd9988 FG |
12 | import time |
13 | from mgr_module import MgrModule, CommandResult | |
14 | from threading import Event | |
b32b8144 | 15 | from mgr_module import CRUSHMap |
3efd9988 | 16 | |
3efd9988 FG |
17 | TIME_FORMAT = '%Y-%m-%d_%H:%M:%S' |
18 | ||
3efd9988 FG |
class MappingState:
    """Snapshot of an osdmap plus a pg dump, with precomputed PG 'up' mappings.

    Two MappingState instances (e.g. before and after a plan) can be compared
    with calc_misplaced_from() to estimate how much data would move.
    """
    def __init__(self, osdmap, pg_dump, desc=''):
        self.desc = desc
        self.osdmap = osdmap
        self.osdmap_dump = self.osdmap.dump()
        self.crush = osdmap.get_crush()
        self.crush_dump = self.crush.dump()
        self.pg_dump = pg_dump
        # pgid -> stat_sum dict (num_objects, num_bytes, ...)
        self.pg_stat = {
            i['pgid']: i['stat_sum'] for i in pg_dump.get('pg_stats', [])
        }
        # Only consider pools present in *both* the osdmap and the pg dump;
        # a pool can be missing from one side while being created or deleted.
        osd_poolids = [p['pool'] for p in self.osdmap_dump.get('pools', [])]
        pg_poolids = [p['poolid'] for p in pg_dump.get('pool_stats', [])]
        self.poolids = set(osd_poolids) & set(pg_poolids)
        self.pg_up = {}             # pgid -> up osd list (all pools merged)
        self.pg_up_by_poolid = {}   # poolid -> {pgid -> up osd list}
        for poolid in self.poolids:
            self.pg_up_by_poolid[poolid] = osdmap.map_pool_pgs_up(poolid)
            # dict.items() behaves the same on py2/py3; no need for six
            for a, b in self.pg_up_by_poolid[poolid].items():
                self.pg_up[a] = b

    def calc_misplaced_from(self, other_ms):
        """Return the fraction of PGs in *other_ms* whose up set differs here.

        A PG missing from this state counts as misplaced.  Returns 0.0 when
        *other_ms* has no PGs at all.
        """
        num = len(other_ms.pg_up)
        misplaced = 0
        for pgid, before in other_ms.pg_up.items():
            if before != self.pg_up.get(pgid, []):
                misplaced += 1
        if num > 0:
            return float(misplaced) / float(num)
        return 0.0
49 | ||
class Plan:
    """A named optimization plan: an osdmap incremental plus weight changes."""
    def __init__(self, name, ms, pools):
        self.mode = 'unknown'
        self.name = name
        self.initial = ms       # MappingState the plan starts from
        self.pools = pools      # pool names the plan is restricted to

        self.osd_weights = {}   # osd id -> new reweight
        self.compat_ws = {}     # osd id -> compat weight-set weight
        self.inc = ms.osdmap.new_incremental()

    def final_state(self):
        """Return a MappingState reflecting this plan applied to the initial map."""
        self.inc.set_osd_reweights(self.osd_weights)
        self.inc.set_crush_compat_weight_set_weights(self.compat_ws)
        return MappingState(self.initial.osdmap.apply_incremental(self.inc),
                            self.initial.pg_dump,
                            'plan %s final' % self.name)

    def dump(self):
        """Return the osdmap incremental as pretty-printed JSON."""
        return json.dumps(self.inc.dump(), indent=4)

    def show(self):
        """Render the plan as the equivalent sequence of 'ceph' CLI commands."""
        ls = []
        ls.append('# starting osdmap epoch %d' % self.initial.osdmap.get_epoch())
        ls.append('# starting crush version %d' %
                  self.initial.osdmap.get_crush_version())
        ls.append('# mode %s' % self.mode)
        # only emit create-compat when no default choose_args exist yet
        if len(self.compat_ws) and \
           not CRUSHMap.have_default_choose_args(self.initial.crush_dump):
            ls.append('ceph osd crush weight-set create-compat')
        # dict.items() behaves the same on py2/py3; no need for six
        for osd, weight in self.compat_ws.items():
            ls.append('ceph osd crush weight-set reweight-compat %s %f' %
                      (osd, weight))
        for osd, weight in self.osd_weights.items():
            ls.append('ceph osd reweight osd.%d %f' % (osd, weight))
        incdump = self.inc.dump()
        for pgid in incdump.get('old_pg_upmap_items', []):
            ls.append('ceph osd rm-pg-upmap-items %s' % pgid)
        for item in incdump.get('new_pg_upmap_items', []):
            osdlist = []
            for m in item['mappings']:
                osdlist += [m['from'], m['to']]
            ls.append('ceph osd pg-upmap-items %s %s' %
                      (item['pgid'], ' '.join([str(a) for a in osdlist])))
        return '\n'.join(ls)
95 | ||
96 | ||
class Eval:
    """Evaluation of how evenly data is distributed for a MappingState."""
    def __init__(self, ms):
        self.ms = ms
        self.root_ids = {}        # root name -> id
        self.pool_name = {}       # pool id -> pool name
        self.pool_id = {}         # pool name -> id
        self.pool_roots = {}      # pool name -> root name
        self.root_pools = {}      # root name -> pools
        self.target_by_root = {}  # root name -> target weight map
        self.count_by_pool = {}
        self.count_by_root = {}
        self.actual_by_pool = {}  # pool -> by_* -> actual weight map
        self.actual_by_root = {}  # pool -> by_* -> actual weight map
        self.total_by_pool = {}   # pool -> by_* -> total
        self.total_by_root = {}   # root -> by_* -> total
        self.stats_by_pool = {}   # pool -> by_* -> stddev or avg -> value
        self.stats_by_root = {}   # root -> by_* -> stddev or avg -> value

        self.score_by_pool = {}
        self.score_by_root = {}

        self.score = 0.0          # overall score; 0 == perfect distribution

    def show(self, verbose=False):
        """Return a human-readable summary (full detail when *verbose*)."""
        if verbose:
            r = self.ms.desc + '\n'
            r += 'target_by_root %s\n' % self.target_by_root
            r += 'actual_by_pool %s\n' % self.actual_by_pool
            r += 'actual_by_root %s\n' % self.actual_by_root
            r += 'count_by_pool %s\n' % self.count_by_pool
            r += 'count_by_root %s\n' % self.count_by_root
            r += 'total_by_pool %s\n' % self.total_by_pool
            r += 'total_by_root %s\n' % self.total_by_root
            r += 'stats_by_root %s\n' % self.stats_by_root
            r += 'score_by_pool %s\n' % self.score_by_pool
            r += 'score_by_root %s\n' % self.score_by_root
        else:
            r = self.ms.desc + ' '
            r += 'score %f (lower is better)\n' % self.score
        return r

    def calc_stats(self, count, target, total):
        """Compute avg/stddev/score per metric ('pgs', 'objects', 'bytes').

        *count* maps metric -> {osd -> value}; *target* maps osd -> target
        weight fraction; *total* maps metric -> total value.  Returns a dict
        metric -> {'avg', 'stddev', 'sum_weight', 'score'}.
        """
        num = max(len(target), 1)
        r = {}
        for t in ('pgs', 'objects', 'bytes'):
            if total[t] == 0:
                # nothing stored for this metric: perfect by definition
                r[t] = {
                    'avg': 0,
                    'stddev': 0,
                    'sum_weight': 0,
                    'score': 0,
                }
                continue

            avg = float(total[t]) / float(num)
            dev = 0.0

            # score is a measure of how uneven the data distribution is.
            # score lies between [0, 1), 0 means perfect distribution.
            score = 0.0
            sum_weight = 0.0

            # dict.items() behaves the same on py2/py3; no need for six
            for k, v in count[t].items():
                # adjust/normalize by weight
                if target[k]:
                    adjusted = float(v) / target[k] / float(num)
                else:
                    adjusted = 0.0

                # Overweighted devices and their weights are factors to calculate reweight_urgency.
                # One 10% underfilled device with 5 2% overfilled devices, is arguably a better
                # situation than one 10% overfilled with 5 2% underfilled devices
                if adjusted > avg:
                    '''
                    F(x) = 2*phi(x) - 1, where phi(x) = cdf of standard normal distribution
                    x = (adjusted - avg)/avg.
                    Since, we're considering only over-weighted devices, x >= 0, and so phi(x) lies in [0.5, 1).
                    To bring range of F(x) in range [0, 1), we need to make the above modification.

                    In general, we need to use a function F(x), where x = (adjusted - avg)/avg
                    1. which is bounded between 0 and 1, so that ultimately reweight_urgency will also be bounded.
                    2. A larger value of x, should imply more urgency to reweight.
                    3. Also, the difference between F(x) when x is large, should be minimal.
                    4. The value of F(x) should get close to 1 (highest urgency to reweight) with steeply.

                    Could have used F(x) = (1 - e^(-x)). But that had slower convergence to 1, compared to the one currently in use.

                    cdf of standard normal distribution: https://stackoverflow.com/a/29273201
                    '''
                    score += target[k] * (math.erf(((adjusted - avg)/avg) / math.sqrt(2.0)))
                    sum_weight += target[k]
                dev += (avg - adjusted) * (avg - adjusted)
            stddev = math.sqrt(dev / float(max(num - 1, 1)))
            score = score / max(sum_weight, 1)
            r[t] = {
                'avg': avg,
                'stddev': stddev,
                'sum_weight': sum_weight,
                'score': score,
            }
        return r
198 | ||
199 | class Module(MgrModule): | |
11fdf7f2 TL |
    # Configuration options, settable via
    # `ceph config set mgr mgr/balancer/<name> <value>`.
    MODULE_OPTIONS = [
        {
            'name': 'active',
            'type': 'bool',
            'default': False,
            'desc': 'automatically balance PGs across cluster',
            'runtime': True,
        },
        {
            'name': 'begin_time',
            'type': 'str',
            'default': '0000',
            'desc': 'beginning time of day to automatically balance',
            'long_desc': 'This is a time of day in the format HHMM.',
            'runtime': True,
        },
        {
            'name': 'end_time',
            'type': 'str',
            'default': '2400',
            'desc': 'ending time of day to automatically balance',
            'long_desc': 'This is a time of day in the format HHMM.',
            'runtime': True,
        },
        {
            'name': 'begin_weekday',
            'type': 'uint',
            'default': 0,
            'min': 0,
            'max': 7,
            'desc': 'Restrict automatic balancing to this day of the week or later',
            'long_desc': '0 or 7 = Sunday, 1 = Monday, etc.',
            'runtime': True,
        },
        {
            'name': 'end_weekday',
            'type': 'uint',
            'default': 7,
            'min': 0,
            'max': 7,
            'desc': 'Restrict automatic balancing to days of the week earlier than this',
            'long_desc': '0 or 7 = Sunday, 1 = Monday, etc.',
            'runtime': True,
        },
        {
            'name': 'crush_compat_max_iterations',
            'type': 'uint',
            'default': 25,
            'min': 1,
            'max': 250,
            'desc': 'maximum number of iterations to attempt optimization',
            'runtime': True,
        },
        {
            'name': 'crush_compat_metrics',
            'type': 'str',
            'default': 'pgs,objects,bytes',
            'desc': 'metrics with which to calculate OSD utilization',
            'long_desc': 'Value is a list of one or more of "pgs", "objects", or "bytes", and indicates which metrics to use to balance utilization.',
            'runtime': True,
        },
        {
            'name': 'crush_compat_step',
            'type': 'float',
            'default': .5,
            'min': .001,
            'max': .999,
            'desc': 'aggressiveness of optimization',
            'long_desc': '.99 is very aggressive, .01 is less aggressive',
            'runtime': True,
        },
        {
            'name': 'min_score',
            'type': 'float',
            'default': 0,
            'desc': 'minimum score, below which no optimization is attempted',
            'runtime': True,
        },
        {
            # NOTE: no 'type' declared here; value is one of the enum strings
            'name': 'mode',
            'desc': 'Balancer mode',
            'default': 'none',
            'enum_allowed': ['none', 'crush-compat', 'upmap'],
            'runtime': True,
        },
        {
            'name': 'sleep_interval',
            'type': 'secs',
            'default': 60,
            'desc': 'how frequently to wake up and attempt optimization',
            'runtime': True,
        },
        {
            'name': 'upmap_max_iterations',
            'type': 'uint',
            'default': 10,
            'desc': 'maximum upmap optimization iterations',
            'runtime': True,
        },
        {
            'name': 'upmap_max_deviation',
            'type': 'float',
            'default': .01,
            'min': 0,
            'max': 1,
            'desc': 'deviation below which no optimization is attempted',
            'long_desc': 'If the ratio between the fullest and least-full OSD is below this value then we stop trying to optimize placement.',
            'runtime': True,
        },
        {
            # comma-separated pool id list; empty string means "all pools"
            'name': 'pool_ids',
            'type': 'str',
            'default': '',
            'desc': 'pools which the automatic balancing will be limited to',
            'runtime': True,
        },
    ]
317 | ||
3efd9988 FG |
    # CLI commands dispatched by handle_command().
    COMMANDS = [
        {
            "cmd": "balancer status",
            "desc": "Show balancer status",
            "perm": "r",
        },
        {
            "cmd": "balancer mode name=mode,type=CephChoices,strings=none|crush-compat|upmap",
            "desc": "Set balancer mode",
            "perm": "rw",
        },
        {
            "cmd": "balancer on",
            "desc": "Enable automatic balancing",
            "perm": "rw",
        },
        {
            "cmd": "balancer off",
            "desc": "Disable automatic balancing",
            "perm": "rw",
        },
        {
            "cmd": "balancer pool ls",
            "desc": "List automatic balancing pools. "
                    "Note that empty list means all existing pools will be automatic balancing targets, "
                    "which is the default behaviour of balancer.",
            "perm": "r",
        },
        {
            "cmd": "balancer pool add name=pools,type=CephString,n=N",
            "desc": "Enable automatic balancing for specific pools",
            "perm": "rw",
        },
        {
            "cmd": "balancer pool rm name=pools,type=CephString,n=N",
            "desc": "Disable automatic balancing for specific pools",
            "perm": "rw",
        },
        {
            "cmd": "balancer eval name=option,type=CephString,req=false",
            "desc": "Evaluate data distribution for the current cluster or specific pool or specific plan",
            "perm": "r",
        },
        {
            "cmd": "balancer eval-verbose name=option,type=CephString,req=false",
            "desc": "Evaluate data distribution for the current cluster or specific pool or specific plan (verbosely)",
            "perm": "r",
        },
        {
            "cmd": "balancer optimize name=plan,type=CephString name=pools,type=CephString,n=N,req=false",
            "desc": "Run optimizer to create a new plan",
            "perm": "rw",
        },
        {
            "cmd": "balancer show name=plan,type=CephString",
            "desc": "Show details of an optimization plan",
            "perm": "r",
        },
        {
            "cmd": "balancer rm name=plan,type=CephString",
            "desc": "Discard an optimization plan",
            "perm": "rw",
        },
        {
            "cmd": "balancer reset",
            "desc": "Discard all optimization plans",
            "perm": "rw",
        },
        {
            "cmd": "balancer dump name=plan,type=CephString",
            "desc": "Show an optimization plan",
            "perm": "r",
        },
        {
            "cmd": "balancer ls",
            "desc": "List all plans",
            "perm": "r",
        },
        {
            "cmd": "balancer execute name=plan,type=CephString",
            "desc": "Execute an optimization plan",
            "perm": "rw",
        },
    ]
    # Runtime state (class-level defaults; refreshed from module options)
    active = False   # mirrors the 'active' option
    run = True       # cleared by shutdown() to stop serve()
    plans = {}       # plan name -> Plan
    mode = ''
406 | ||
    def __init__(self, *args, **kwargs):
        """Initialize the module; the Event wakes serve() early on changes."""
        super(Module, self).__init__(*args, **kwargs)
        self.event = Event()
410 | ||
11fdf7f2 | 411 | def handle_command(self, inbuf, command): |
3efd9988 FG |
412 | self.log.warn("Handling command: '%s'" % str(command)) |
413 | if command['prefix'] == 'balancer status': | |
414 | s = { | |
f64942e4 | 415 | 'plans': list(self.plans.keys()), |
3efd9988 | 416 | 'active': self.active, |
11fdf7f2 | 417 | 'mode': self.get_module_option('mode'), |
3efd9988 FG |
418 | } |
419 | return (0, json.dumps(s, indent=4), '') | |
420 | elif command['prefix'] == 'balancer mode': | |
11fdf7f2 TL |
421 | if command['mode'] == 'upmap': |
422 | min_compat_client = self.get_osdmap().dump().get('require_min_compat_client', '') | |
423 | if min_compat_client < 'luminous': # works well because version is alphabetized.. | |
424 | warn = 'min_compat_client "%s" ' \ | |
425 | '< "luminous", which is required for pg-upmap. ' \ | |
426 | 'Try "ceph osd set-require-min-compat-client luminous" ' \ | |
427 | 'before enabling this mode' % min_compat_client | |
428 | return (-errno.EPERM, '', warn) | |
81eedcae TL |
429 | elif command['mode'] == 'crush-compat': |
430 | ms = MappingState(self.get_osdmap(), | |
431 | self.get("pg_dump"), | |
432 | 'initialize compat weight-set') | |
433 | self.get_compat_weight_set_weights(ms) # ignore error | |
11fdf7f2 | 434 | self.set_module_option('mode', command['mode']) |
3efd9988 FG |
435 | return (0, '', '') |
436 | elif command['prefix'] == 'balancer on': | |
437 | if not self.active: | |
11fdf7f2 | 438 | self.set_module_option('active', 'true') |
3efd9988 FG |
439 | self.active = True |
440 | self.event.set() | |
441 | return (0, '', '') | |
442 | elif command['prefix'] == 'balancer off': | |
443 | if self.active: | |
11fdf7f2 | 444 | self.set_module_option('active', 'false') |
3efd9988 FG |
445 | self.active = False |
446 | self.event.set() | |
447 | return (0, '', '') | |
11fdf7f2 TL |
448 | elif command['prefix'] == 'balancer pool ls': |
449 | pool_ids = self.get_module_option('pool_ids') | |
450 | if pool_ids is '': | |
451 | return (0, '', '') | |
452 | pool_ids = pool_ids.split(',') | |
453 | pool_ids = [int(p) for p in pool_ids] | |
454 | pool_name_by_id = dict((p['pool'], p['pool_name']) for p in self.get_osdmap().dump().get('pools', [])) | |
455 | should_prune = False | |
456 | final_ids = [] | |
457 | final_names = [] | |
458 | for p in pool_ids: | |
459 | if p in pool_name_by_id: | |
460 | final_ids.append(p) | |
461 | final_names.append(pool_name_by_id[p]) | |
462 | else: | |
463 | should_prune = True | |
464 | if should_prune: # some pools were gone, prune | |
465 | self.set_module_option('pool_ids', ','.join(final_ids)) | |
466 | return (0, json.dumps(final_names, indent=4), '') | |
467 | elif command['prefix'] == 'balancer pool add': | |
468 | raw_names = command['pools'] | |
469 | pool_id_by_name = dict((p['pool_name'], p['pool']) for p in self.get_osdmap().dump().get('pools', [])) | |
470 | invalid_names = [p for p in raw_names if p not in pool_id_by_name] | |
471 | if invalid_names: | |
472 | return (-errno.EINVAL, '', 'pool(s) %s not found' % invalid_names) | |
473 | to_add = [str(pool_id_by_name[p]) for p in raw_names if p in pool_id_by_name] | |
474 | existing = self.get_module_option('pool_ids') | |
475 | final = to_add | |
476 | if existing is not '': | |
477 | existing = existing.split(',') | |
478 | final = set(to_add) | set(existing) | |
479 | self.set_module_option('pool_ids', ','.join(final)) | |
480 | return (0, '', '') | |
481 | elif command['prefix'] == 'balancer pool rm': | |
482 | raw_names = command['pools'] | |
483 | existing = self.get_module_option('pool_ids') | |
484 | if existing is '': # for idempotence | |
485 | return (0, '', '') | |
486 | existing = existing.split(',') | |
487 | osdmap = self.get_osdmap() | |
488 | pool_ids = [str(p['pool']) for p in osdmap.dump().get('pools', [])] | |
489 | pool_id_by_name = dict((p['pool_name'], p['pool']) for p in osdmap.dump().get('pools', [])) | |
490 | final = [p for p in existing if p in pool_ids] | |
491 | to_delete = [str(pool_id_by_name[p]) for p in raw_names if p in pool_id_by_name] | |
492 | final = set(final) - set(to_delete) | |
493 | self.set_module_option('pool_ids', ','.join(final)) | |
494 | return (0, '', '') | |
3efd9988 FG |
495 | elif command['prefix'] == 'balancer eval' or command['prefix'] == 'balancer eval-verbose': |
496 | verbose = command['prefix'] == 'balancer eval-verbose' | |
94b18763 FG |
497 | pools = [] |
498 | if 'option' in command: | |
499 | plan = self.plans.get(command['option']) | |
3efd9988 | 500 | if not plan: |
94b18763 FG |
501 | # not a plan, does it look like a pool? |
502 | osdmap = self.get_osdmap() | |
503 | valid_pool_names = [p['pool_name'] for p in osdmap.dump().get('pools', [])] | |
504 | option = command['option'] | |
505 | if option not in valid_pool_names: | |
506 | return (-errno.EINVAL, '', 'option "%s" not a plan or a pool' % option) | |
507 | pools.append(option) | |
508 | ms = MappingState(osdmap, self.get("pg_dump"), 'pool "%s"' % option) | |
509 | else: | |
510 | pools = plan.pools | |
511 | ms = plan.final_state() | |
3efd9988 FG |
512 | else: |
513 | ms = MappingState(self.get_osdmap(), | |
514 | self.get("pg_dump"), | |
515 | 'current cluster') | |
94b18763 | 516 | return (0, self.evaluate(ms, pools, verbose=verbose), '') |
3efd9988 | 517 | elif command['prefix'] == 'balancer optimize': |
94b18763 FG |
518 | pools = [] |
519 | if 'pools' in command: | |
520 | pools = command['pools'] | |
521 | osdmap = self.get_osdmap() | |
522 | valid_pool_names = [p['pool_name'] for p in osdmap.dump().get('pools', [])] | |
523 | invalid_pool_names = [] | |
524 | for p in pools: | |
525 | if p not in valid_pool_names: | |
526 | invalid_pool_names.append(p) | |
527 | if len(invalid_pool_names): | |
528 | return (-errno.EINVAL, '', 'pools %s not found' % invalid_pool_names) | |
529 | plan = self.plan_create(command['plan'], osdmap, pools) | |
530 | r, detail = self.optimize(plan) | |
531 | # remove plan if we are currently unable to find an optimization | |
532 | # or distribution is already perfect | |
533 | if r: | |
534 | self.plan_rm(command['plan']) | |
535 | return (r, '', detail) | |
3efd9988 | 536 | elif command['prefix'] == 'balancer rm': |
b32b8144 | 537 | self.plan_rm(command['plan']) |
3efd9988 FG |
538 | return (0, '', '') |
539 | elif command['prefix'] == 'balancer reset': | |
540 | self.plans = {} | |
541 | return (0, '', '') | |
f64942e4 AA |
542 | elif command['prefix'] == 'balancer ls': |
543 | return (0, json.dumps([p for p in self.plans], indent=4), '') | |
3efd9988 FG |
544 | elif command['prefix'] == 'balancer dump': |
545 | plan = self.plans.get(command['plan']) | |
546 | if not plan: | |
547 | return (-errno.ENOENT, '', 'plan %s not found' % command['plan']) | |
548 | return (0, plan.dump(), '') | |
549 | elif command['prefix'] == 'balancer show': | |
550 | plan = self.plans.get(command['plan']) | |
551 | if not plan: | |
552 | return (-errno.ENOENT, '', 'plan %s not found' % command['plan']) | |
553 | return (0, plan.show(), '') | |
554 | elif command['prefix'] == 'balancer execute': | |
555 | plan = self.plans.get(command['plan']) | |
556 | if not plan: | |
557 | return (-errno.ENOENT, '', 'plan %s not found' % command['plan']) | |
94b18763 FG |
558 | r, detail = self.execute(plan) |
559 | self.plan_rm(command['plan']) | |
560 | return (r, '', detail) | |
3efd9988 FG |
561 | else: |
562 | return (-errno.EINVAL, '', | |
563 | "Command not found '{0}'".format(command['prefix'])) | |
564 | ||
    def shutdown(self):
        """Request the serve() loop to exit and wake it immediately."""
        self.log.info('Stopping')
        # clear the run flag first so serve() exits after the event fires
        self.run = False
        self.event.set()
569 | ||
a8e16298 TL |
570 | def time_permit(self): |
571 | local_time = time.localtime() | |
572 | time_of_day = time.strftime('%H%M', local_time) | |
573 | weekday = (local_time.tm_wday + 1) % 7 # be compatible with C | |
574 | permit = False | |
575 | ||
11fdf7f2 TL |
576 | begin_time = self.get_module_option('begin_time') |
577 | end_time = self.get_module_option('end_time') | |
a8e16298 TL |
578 | if begin_time <= end_time: |
579 | permit = begin_time <= time_of_day < end_time | |
580 | else: | |
581 | permit = time_of_day >= begin_time or time_of_day < end_time | |
582 | if not permit: | |
583 | self.log.debug("should run between %s - %s, now %s, skipping", | |
584 | begin_time, end_time, time_of_day) | |
585 | return False | |
586 | ||
11fdf7f2 TL |
587 | begin_weekday = self.get_module_option('begin_weekday') |
588 | end_weekday = self.get_module_option('end_weekday') | |
a8e16298 TL |
589 | if begin_weekday <= end_weekday: |
590 | permit = begin_weekday <= weekday < end_weekday | |
3efd9988 | 591 | else: |
a8e16298 TL |
592 | permit = weekday >= begin_weekday or weekday < end_weekday |
593 | if not permit: | |
594 | self.log.debug("should run between weekday %d - %d, now %d, skipping", | |
595 | begin_weekday, end_weekday, weekday) | |
596 | return False | |
597 | ||
598 | return True | |
3efd9988 FG |
599 | |
600 | def serve(self): | |
601 | self.log.info('Starting') | |
602 | while self.run: | |
11fdf7f2 TL |
603 | self.active = self.get_module_option('active') |
604 | sleep_interval = self.get_module_option('sleep_interval') | |
a8e16298 TL |
605 | self.log.debug('Waking up [%s, now %s]', |
606 | "active" if self.active else "inactive", | |
607 | time.strftime(TIME_FORMAT, time.localtime())) | |
608 | if self.active and self.time_permit(): | |
3efd9988 FG |
609 | self.log.debug('Running') |
610 | name = 'auto_%s' % time.strftime(TIME_FORMAT, time.gmtime()) | |
11fdf7f2 TL |
611 | osdmap = self.get_osdmap() |
612 | allow = self.get_module_option('pool_ids') | |
613 | final = [] | |
614 | if allow is not '': | |
615 | allow = allow.split(',') | |
616 | valid = [str(p['pool']) for p in osdmap.dump().get('pools', [])] | |
617 | final = set(allow) & set(valid) | |
618 | if set(allow) - set(valid): # some pools were gone, prune | |
619 | self.set_module_option('pool_ids', ','.join(final)) | |
620 | pool_name_by_id = dict((p['pool'], p['pool_name']) for p in osdmap.dump().get('pools', [])) | |
621 | final = [int(p) for p in final] | |
622 | final = [pool_name_by_id[p] for p in final if p in pool_name_by_id] | |
623 | plan = self.plan_create(name, osdmap, final) | |
94b18763 FG |
624 | r, detail = self.optimize(plan) |
625 | if r == 0: | |
3efd9988 FG |
626 | self.execute(plan) |
627 | self.plan_rm(name) | |
628 | self.log.debug('Sleeping for %d', sleep_interval) | |
629 | self.event.wait(sleep_interval) | |
630 | self.event.clear() | |
631 | ||
94b18763 FG |
632 | def plan_create(self, name, osdmap, pools): |
633 | plan = Plan(name, | |
634 | MappingState(osdmap, | |
635 | self.get("pg_dump"), | |
636 | 'plan %s initial' % name), | |
637 | pools) | |
3efd9988 FG |
638 | self.plans[name] = plan |
639 | return plan | |
640 | ||
641 | def plan_rm(self, name): | |
642 | if name in self.plans: | |
643 | del self.plans[name] | |
644 | ||
94b18763 | 645 | def calc_eval(self, ms, pools): |
3efd9988 FG |
646 | pe = Eval(ms) |
647 | pool_rule = {} | |
648 | pool_info = {} | |
649 | for p in ms.osdmap_dump.get('pools',[]): | |
94b18763 FG |
650 | if len(pools) and p['pool_name'] not in pools: |
651 | continue | |
652 | # skip dead or not-yet-ready pools too | |
653 | if p['pool'] not in ms.poolids: | |
654 | continue | |
3efd9988 FG |
655 | pe.pool_name[p['pool']] = p['pool_name'] |
656 | pe.pool_id[p['pool_name']] = p['pool'] | |
657 | pool_rule[p['pool_name']] = p['crush_rule'] | |
658 | pe.pool_roots[p['pool_name']] = [] | |
659 | pool_info[p['pool_name']] = p | |
94b18763 | 660 | if len(pool_info) == 0: |
3efd9988 FG |
661 | return pe |
662 | self.log.debug('pool_name %s' % pe.pool_name) | |
663 | self.log.debug('pool_id %s' % pe.pool_id) | |
664 | self.log.debug('pools %s' % pools) | |
665 | self.log.debug('pool_rule %s' % pool_rule) | |
666 | ||
667 | osd_weight = { a['osd']: a['weight'] | |
94b18763 | 668 | for a in ms.osdmap_dump.get('osds',[]) if a['weight'] > 0 } |
3efd9988 FG |
669 | |
670 | # get expected distributions by root | |
671 | actual_by_root = {} | |
672 | rootids = ms.crush.find_takes() | |
673 | roots = [] | |
674 | for rootid in rootids: | |
3efd9988 | 675 | ls = ms.osdmap.get_pools_by_take(rootid) |
94b18763 FG |
676 | want = [] |
677 | # find out roots associating with pools we are passed in | |
678 | for candidate in ls: | |
679 | if candidate in pe.pool_name: | |
680 | want.append(candidate) | |
681 | if len(want) == 0: | |
682 | continue | |
683 | root = ms.crush.get_item_name(rootid) | |
3efd9988 | 684 | pe.root_pools[root] = [] |
94b18763 | 685 | for poolid in want: |
3efd9988 FG |
686 | pe.pool_roots[pe.pool_name[poolid]].append(root) |
687 | pe.root_pools[root].append(pe.pool_name[poolid]) | |
94b18763 FG |
688 | pe.root_ids[root] = rootid |
689 | roots.append(root) | |
3efd9988 FG |
690 | weight_map = ms.crush.get_take_weight_osd_map(rootid) |
691 | adjusted_map = { | |
94b18763 | 692 | osd: cw * osd_weight[osd] |
1adf2230 | 693 | for osd,cw in six.iteritems(weight_map) if osd in osd_weight and cw > 0 |
3efd9988 | 694 | } |
94b18763 FG |
695 | sum_w = sum(adjusted_map.values()) |
696 | assert len(adjusted_map) == 0 or sum_w > 0 | |
3efd9988 | 697 | pe.target_by_root[root] = { osd: w / sum_w |
1adf2230 | 698 | for osd,w in six.iteritems(adjusted_map) } |
3efd9988 FG |
699 | actual_by_root[root] = { |
700 | 'pgs': {}, | |
701 | 'objects': {}, | |
702 | 'bytes': {}, | |
703 | } | |
11fdf7f2 | 704 | for osd in pe.target_by_root[root]: |
3efd9988 FG |
705 | actual_by_root[root]['pgs'][osd] = 0 |
706 | actual_by_root[root]['objects'][osd] = 0 | |
707 | actual_by_root[root]['bytes'][osd] = 0 | |
708 | pe.total_by_root[root] = { | |
709 | 'pgs': 0, | |
710 | 'objects': 0, | |
711 | 'bytes': 0, | |
712 | } | |
713 | self.log.debug('pool_roots %s' % pe.pool_roots) | |
714 | self.log.debug('root_pools %s' % pe.root_pools) | |
715 | self.log.debug('target_by_root %s' % pe.target_by_root) | |
716 | ||
717 | # pool and root actual | |
1adf2230 | 718 | for pool, pi in six.iteritems(pool_info): |
3efd9988 FG |
719 | poolid = pi['pool'] |
720 | pm = ms.pg_up_by_poolid[poolid] | |
721 | pgs = 0 | |
722 | objects = 0 | |
723 | bytes = 0 | |
724 | pgs_by_osd = {} | |
725 | objects_by_osd = {} | |
726 | bytes_by_osd = {} | |
727 | for root in pe.pool_roots[pool]: | |
11fdf7f2 | 728 | for osd in pe.target_by_root[root]: |
3efd9988 FG |
729 | pgs_by_osd[osd] = 0 |
730 | objects_by_osd[osd] = 0 | |
731 | bytes_by_osd[osd] = 0 | |
1adf2230 | 732 | for pgid, up in six.iteritems(pm): |
3efd9988 | 733 | for osd in [int(osd) for osd in up]: |
b32b8144 FG |
734 | if osd == CRUSHMap.ITEM_NONE: |
735 | continue | |
3efd9988 FG |
736 | pgs_by_osd[osd] += 1 |
737 | objects_by_osd[osd] += ms.pg_stat[pgid]['num_objects'] | |
738 | bytes_by_osd[osd] += ms.pg_stat[pgid]['num_bytes'] | |
739 | # pick a root to associate this pg instance with. | |
740 | # note that this is imprecise if the roots have | |
741 | # overlapping children. | |
742 | # FIXME: divide bytes by k for EC pools. | |
743 | for root in pe.pool_roots[pool]: | |
744 | if osd in pe.target_by_root[root]: | |
745 | actual_by_root[root]['pgs'][osd] += 1 | |
746 | actual_by_root[root]['objects'][osd] += ms.pg_stat[pgid]['num_objects'] | |
747 | actual_by_root[root]['bytes'][osd] += ms.pg_stat[pgid]['num_bytes'] | |
748 | pgs += 1 | |
749 | objects += ms.pg_stat[pgid]['num_objects'] | |
750 | bytes += ms.pg_stat[pgid]['num_bytes'] | |
751 | pe.total_by_root[root]['pgs'] += 1 | |
752 | pe.total_by_root[root]['objects'] += ms.pg_stat[pgid]['num_objects'] | |
753 | pe.total_by_root[root]['bytes'] += ms.pg_stat[pgid]['num_bytes'] | |
754 | break | |
755 | pe.count_by_pool[pool] = { | |
756 | 'pgs': { | |
757 | k: v | |
1adf2230 | 758 | for k, v in six.iteritems(pgs_by_osd) |
3efd9988 FG |
759 | }, |
760 | 'objects': { | |
761 | k: v | |
1adf2230 | 762 | for k, v in six.iteritems(objects_by_osd) |
3efd9988 FG |
763 | }, |
764 | 'bytes': { | |
765 | k: v | |
1adf2230 | 766 | for k, v in six.iteritems(bytes_by_osd) |
3efd9988 FG |
767 | }, |
768 | } | |
769 | pe.actual_by_pool[pool] = { | |
770 | 'pgs': { | |
771 | k: float(v) / float(max(pgs, 1)) | |
1adf2230 | 772 | for k, v in six.iteritems(pgs_by_osd) |
3efd9988 FG |
773 | }, |
774 | 'objects': { | |
775 | k: float(v) / float(max(objects, 1)) | |
1adf2230 | 776 | for k, v in six.iteritems(objects_by_osd) |
3efd9988 FG |
777 | }, |
778 | 'bytes': { | |
779 | k: float(v) / float(max(bytes, 1)) | |
1adf2230 | 780 | for k, v in six.iteritems(bytes_by_osd) |
3efd9988 FG |
781 | }, |
782 | } | |
783 | pe.total_by_pool[pool] = { | |
784 | 'pgs': pgs, | |
785 | 'objects': objects, | |
786 | 'bytes': bytes, | |
787 | } | |
11fdf7f2 | 788 | for root in pe.total_by_root: |
3efd9988 FG |
789 | pe.count_by_root[root] = { |
790 | 'pgs': { | |
791 | k: float(v) | |
1adf2230 | 792 | for k, v in six.iteritems(actual_by_root[root]['pgs']) |
3efd9988 FG |
793 | }, |
794 | 'objects': { | |
795 | k: float(v) | |
1adf2230 | 796 | for k, v in six.iteritems(actual_by_root[root]['objects']) |
3efd9988 FG |
797 | }, |
798 | 'bytes': { | |
799 | k: float(v) | |
1adf2230 | 800 | for k, v in six.iteritems(actual_by_root[root]['bytes']) |
3efd9988 FG |
801 | }, |
802 | } | |
803 | pe.actual_by_root[root] = { | |
804 | 'pgs': { | |
805 | k: float(v) / float(max(pe.total_by_root[root]['pgs'], 1)) | |
1adf2230 | 806 | for k, v in six.iteritems(actual_by_root[root]['pgs']) |
3efd9988 FG |
807 | }, |
808 | 'objects': { | |
809 | k: float(v) / float(max(pe.total_by_root[root]['objects'], 1)) | |
1adf2230 | 810 | for k, v in six.iteritems(actual_by_root[root]['objects']) |
3efd9988 FG |
811 | }, |
812 | 'bytes': { | |
813 | k: float(v) / float(max(pe.total_by_root[root]['bytes'], 1)) | |
1adf2230 | 814 | for k, v in six.iteritems(actual_by_root[root]['bytes']) |
3efd9988 FG |
815 | }, |
816 | } | |
817 | self.log.debug('actual_by_pool %s' % pe.actual_by_pool) | |
818 | self.log.debug('actual_by_root %s' % pe.actual_by_root) | |
819 | ||
820 | # average and stddev and score | |
821 | pe.stats_by_root = { | |
822 | a: pe.calc_stats( | |
823 | b, | |
824 | pe.target_by_root[a], | |
825 | pe.total_by_root[a] | |
1adf2230 | 826 | ) for a, b in six.iteritems(pe.count_by_root) |
3efd9988 | 827 | } |
94b18763 | 828 | self.log.debug('stats_by_root %s' % pe.stats_by_root) |
3efd9988 FG |
829 | |
830 | # the scores are already normalized | |
831 | pe.score_by_root = { | |
832 | r: { | |
833 | 'pgs': pe.stats_by_root[r]['pgs']['score'], | |
834 | 'objects': pe.stats_by_root[r]['objects']['score'], | |
835 | 'bytes': pe.stats_by_root[r]['bytes']['score'], | |
836 | } for r in pe.total_by_root.keys() | |
837 | } | |
94b18763 | 838 | self.log.debug('score_by_root %s' % pe.score_by_root) |
3efd9988 | 839 | |
f64942e4 | 840 | # get the list of score metrics, comma separated |
11fdf7f2 | 841 | metrics = self.get_module_option('crush_compat_metrics').split(',') |
f64942e4 | 842 | |
3efd9988 FG |
843 | # total score is just average of normalized stddevs |
844 | pe.score = 0.0 | |
1adf2230 AA |
845 | for r, vs in six.iteritems(pe.score_by_root): |
846 | for k, v in six.iteritems(vs): | |
f64942e4 AA |
847 | if k in metrics: |
848 | pe.score += v | |
849 | pe.score /= len(metrics) * len(roots) | |
3efd9988 FG |
850 | return pe |
851 | ||
94b18763 FG |
852 | def evaluate(self, ms, pools, verbose=False): |
853 | pe = self.calc_eval(ms, pools) | |
3efd9988 FG |
854 | return pe.show(verbose=verbose) |
855 | ||
856 | def optimize(self, plan): | |
857 | self.log.info('Optimize plan %s' % plan.name) | |
11fdf7f2 TL |
858 | plan.mode = self.get_module_option('mode') |
859 | max_misplaced = float(self.get_ceph_option('target_max_misplaced_ratio')) | |
3efd9988 FG |
860 | self.log.info('Mode %s, max misplaced %f' % |
861 | (plan.mode, max_misplaced)) | |
862 | ||
863 | info = self.get('pg_status') | |
864 | unknown = info.get('unknown_pgs_ratio', 0.0) | |
865 | degraded = info.get('degraded_ratio', 0.0) | |
866 | inactive = info.get('inactive_pgs_ratio', 0.0) | |
867 | misplaced = info.get('misplaced_ratio', 0.0) | |
868 | self.log.debug('unknown %f degraded %f inactive %f misplaced %g', | |
869 | unknown, degraded, inactive, misplaced) | |
870 | if unknown > 0.0: | |
94b18763 FG |
871 | detail = 'Some PGs (%f) are unknown; try again later' % unknown |
872 | self.log.info(detail) | |
873 | return -errno.EAGAIN, detail | |
3efd9988 | 874 | elif degraded > 0.0: |
94b18763 FG |
875 | detail = 'Some objects (%f) are degraded; try again later' % degraded |
876 | self.log.info(detail) | |
877 | return -errno.EAGAIN, detail | |
3efd9988 | 878 | elif inactive > 0.0: |
94b18763 FG |
879 | detail = 'Some PGs (%f) are inactive; try again later' % inactive |
880 | self.log.info(detail) | |
881 | return -errno.EAGAIN, detail | |
3efd9988 | 882 | elif misplaced >= max_misplaced: |
94b18763 FG |
883 | detail = 'Too many objects (%f > %f) are misplaced; ' \ |
884 | 'try again later' % (misplaced, max_misplaced) | |
885 | self.log.info(detail) | |
886 | return -errno.EAGAIN, detail | |
3efd9988 FG |
887 | else: |
888 | if plan.mode == 'upmap': | |
889 | return self.do_upmap(plan) | |
890 | elif plan.mode == 'crush-compat': | |
891 | return self.do_crush_compat(plan) | |
892 | elif plan.mode == 'none': | |
94b18763 | 893 | detail = 'Please do "ceph balancer mode" to choose a valid mode first' |
3efd9988 | 894 | self.log.info('Idle') |
94b18763 | 895 | return -errno.ENOEXEC, detail |
3efd9988 | 896 | else: |
94b18763 FG |
897 | detail = 'Unrecognized mode %s' % plan.mode |
898 | self.log.info(detail) | |
899 | return -errno.EINVAL, detail | |
3efd9988 FG |
900 | ## |
901 | ||
902 | def do_upmap(self, plan): | |
903 | self.log.info('do_upmap') | |
11fdf7f2 TL |
904 | max_iterations = self.get_module_option('upmap_max_iterations') |
905 | max_deviation = self.get_module_option('upmap_max_deviation') | |
3efd9988 FG |
906 | |
907 | ms = plan.initial | |
94b18763 FG |
908 | if len(plan.pools): |
909 | pools = plan.pools | |
910 | else: # all | |
911 | pools = [str(i['pool_name']) for i in ms.osdmap_dump.get('pools',[])] | |
3efd9988 | 912 | if len(pools) == 0: |
94b18763 FG |
913 | detail = 'No pools available' |
914 | self.log.info(detail) | |
915 | return -errno.ENOENT, detail | |
3efd9988 FG |
916 | |
917 | inc = plan.inc | |
918 | total_did = 0 | |
919 | left = max_iterations | |
11fdf7f2 TL |
920 | osdmap_dump = self.get_osdmap().dump() |
921 | pools_with_pg_merge = [p['pool_name'] for p in osdmap_dump.get('pools', []) | |
922 | if p['pg_num'] > p['pg_num_target']] | |
923 | crush_rule_by_pool_name = dict((p['pool_name'], p['crush_rule']) for p in osdmap_dump.get('pools', [])) | |
924 | pools_by_crush_rule = {} # group pools by crush_rule | |
3efd9988 | 925 | for pool in pools: |
11fdf7f2 TL |
926 | if pool not in crush_rule_by_pool_name: |
927 | self.log.info('pool %s does not exist' % pool) | |
928 | continue | |
929 | if pool in pools_with_pg_merge: | |
930 | self.log.info('pool %s has pending PG(s) for merging, skipping for now' % pool) | |
931 | continue | |
932 | crush_rule = crush_rule_by_pool_name[pool] | |
933 | if crush_rule not in pools_by_crush_rule: | |
934 | pools_by_crush_rule[crush_rule] = [] | |
935 | pools_by_crush_rule[crush_rule].append(pool) | |
936 | classified_pools = list(pools_by_crush_rule.values()) | |
937 | # shuffle so all pools get equal (in)attention | |
938 | random.shuffle(classified_pools) | |
939 | for it in classified_pools: | |
940 | did = ms.osdmap.calc_pg_upmaps(inc, max_deviation, left, it) | |
3efd9988 FG |
941 | total_did += did |
942 | left -= did | |
943 | if left <= 0: | |
944 | break | |
945 | self.log.info('prepared %d/%d changes' % (total_did, max_iterations)) | |
94b18763 | 946 | if total_did == 0: |
11fdf7f2 TL |
947 | return -errno.EALREADY, 'Unable to find further optimization, ' \ |
948 | 'or pool(s)\' pg_num is decreasing, ' \ | |
94b18763 FG |
949 | 'or distribution is already perfect' |
950 | return 0, '' | |
3efd9988 FG |
951 | |
952 | def do_crush_compat(self, plan): | |
953 | self.log.info('do_crush_compat') | |
11fdf7f2 | 954 | max_iterations = self.get_module_option('crush_compat_max_iterations') |
3efd9988 | 955 | if max_iterations < 1: |
94b18763 | 956 | return -errno.EINVAL, '"crush_compat_max_iterations" must be >= 1' |
11fdf7f2 | 957 | step = self.get_module_option('crush_compat_step') |
3efd9988 | 958 | if step <= 0 or step >= 1.0: |
94b18763 | 959 | return -errno.EINVAL, '"crush_compat_step" must be in (0, 1)' |
11fdf7f2 | 960 | max_misplaced = float(self.get_ceph_option('target_max_misplaced_ratio')) |
3efd9988 FG |
961 | min_pg_per_osd = 2 |
962 | ||
963 | ms = plan.initial | |
964 | osdmap = ms.osdmap | |
965 | crush = osdmap.get_crush() | |
94b18763 | 966 | pe = self.calc_eval(ms, plan.pools) |
11fdf7f2 | 967 | min_score_to_optimize = self.get_module_option('min_score') |
94b18763 FG |
968 | if pe.score <= min_score_to_optimize: |
969 | if pe.score == 0: | |
970 | detail = 'Distribution is already perfect' | |
971 | else: | |
972 | detail = 'score %f <= min_score %f, will not optimize' \ | |
973 | % (pe.score, min_score_to_optimize) | |
974 | self.log.info(detail) | |
975 | return -errno.EALREADY, detail | |
3efd9988 FG |
976 | |
977 | # get current osd reweights | |
978 | orig_osd_weight = { a['osd']: a['weight'] | |
979 | for a in ms.osdmap_dump.get('osds',[]) } | |
1adf2230 | 980 | reweighted_osds = [ a for a,b in six.iteritems(orig_osd_weight) |
3efd9988 FG |
981 | if b < 1.0 and b > 0.0 ] |
982 | ||
983 | # get current compat weight-set weights | |
94b18763 FG |
984 | orig_ws = self.get_compat_weight_set_weights(ms) |
985 | if not orig_ws: | |
986 | return -errno.EAGAIN, 'compat weight-set not available' | |
1adf2230 | 987 | orig_ws = { a: b for a, b in six.iteritems(orig_ws) if a >= 0 } |
3efd9988 FG |
988 | |
989 | # Make sure roots don't overlap their devices. If so, we | |
990 | # can't proceed. | |
991 | roots = pe.target_by_root.keys() | |
992 | self.log.debug('roots %s', roots) | |
993 | visited = {} | |
994 | overlap = {} | |
995 | root_ids = {} | |
1adf2230 | 996 | for root, wm in six.iteritems(pe.target_by_root): |
11fdf7f2 | 997 | for osd in wm: |
3efd9988 | 998 | if osd in visited: |
11fdf7f2 TL |
999 | if osd not in overlap: |
1000 | overlap[osd] = [ visited[osd] ] | |
1001 | overlap[osd].append(root) | |
1002 | visited[osd] = root | |
3efd9988 | 1003 | if len(overlap) > 0: |
94b18763 | 1004 | detail = 'Some osds belong to multiple subtrees: %s' % \ |
11fdf7f2 | 1005 | overlap |
94b18763 FG |
1006 | self.log.error(detail) |
1007 | return -errno.EOPNOTSUPP, detail | |
3efd9988 | 1008 | |
f64942e4 | 1009 | # rebalance by pgs, objects, or bytes |
11fdf7f2 | 1010 | metrics = self.get_module_option('crush_compat_metrics').split(',') |
f64942e4 AA |
1011 | key = metrics[0] # balancing using the first score metric |
1012 | if key not in ['pgs', 'bytes', 'objects']: | |
1013 | self.log.warn("Invalid crush_compat balancing key %s. Using 'pgs'." % key) | |
1014 | key = 'pgs' | |
3efd9988 FG |
1015 | |
1016 | # go | |
1017 | best_ws = copy.deepcopy(orig_ws) | |
1018 | best_ow = copy.deepcopy(orig_osd_weight) | |
1019 | best_pe = pe | |
1020 | left = max_iterations | |
1021 | bad_steps = 0 | |
1022 | next_ws = copy.deepcopy(best_ws) | |
1023 | next_ow = copy.deepcopy(best_ow) | |
1024 | while left > 0: | |
1025 | # adjust | |
1026 | self.log.debug('best_ws %s' % best_ws) | |
1027 | random.shuffle(roots) | |
1028 | for root in roots: | |
1029 | pools = best_pe.root_pools[root] | |
94b18763 FG |
1030 | osds = len(best_pe.target_by_root[root]) |
1031 | min_pgs = osds * min_pg_per_osd | |
1032 | if best_pe.total_by_root[root][key] < min_pgs: | |
3efd9988 FG |
1033 | self.log.info('Skipping root %s (pools %s), total pgs %d ' |
1034 | '< minimum %d (%d per osd)', | |
94b18763 FG |
1035 | root, pools, |
1036 | best_pe.total_by_root[root][key], | |
1037 | min_pgs, min_pg_per_osd) | |
3efd9988 FG |
1038 | continue |
1039 | self.log.info('Balancing root %s (pools %s) by %s' % | |
1040 | (root, pools, key)) | |
1041 | target = best_pe.target_by_root[root] | |
1042 | actual = best_pe.actual_by_root[root][key] | |
1043 | queue = sorted(actual.keys(), | |
1044 | key=lambda osd: -abs(target[osd] - actual[osd])) | |
1045 | for osd in queue: | |
1046 | if orig_osd_weight[osd] == 0: | |
1047 | self.log.debug('skipping out osd.%d', osd) | |
1048 | else: | |
1049 | deviation = target[osd] - actual[osd] | |
1050 | if deviation == 0: | |
1051 | break | |
1052 | self.log.debug('osd.%d deviation %f', osd, deviation) | |
1053 | weight = best_ws[osd] | |
1054 | ow = orig_osd_weight[osd] | |
1055 | if actual[osd] > 0: | |
1056 | calc_weight = target[osd] / actual[osd] * weight * ow | |
1057 | else: | |
81eedcae TL |
1058 | # for newly created osds, reset calc_weight at target value |
1059 | # this way weight-set will end up absorbing *step* of its | |
1060 | # target (final) value at the very beginning and slowly catch up later. | |
1061 | # note that if this turns out causing too many misplaced | |
1062 | # pgs, then we'll reduce step and retry | |
1063 | calc_weight = target[osd] | |
3efd9988 FG |
1064 | new_weight = weight * (1.0 - step) + calc_weight * step |
1065 | self.log.debug('Reweight osd.%d %f -> %f', osd, weight, | |
1066 | new_weight) | |
1067 | next_ws[osd] = new_weight | |
1068 | if ow < 1.0: | |
1069 | new_ow = min(1.0, max(step + (1.0 - step) * ow, | |
1070 | ow + .005)) | |
1071 | self.log.debug('Reweight osd.%d reweight %f -> %f', | |
1072 | osd, ow, new_ow) | |
1073 | next_ow[osd] = new_ow | |
1074 | ||
1075 | # normalize weights under this root | |
1076 | root_weight = crush.get_item_weight(pe.root_ids[root]) | |
1adf2230 | 1077 | root_sum = sum(b for a,b in six.iteritems(next_ws) |
3efd9988 FG |
1078 | if a in target.keys()) |
1079 | if root_sum > 0 and root_weight > 0: | |
1080 | factor = root_sum / root_weight | |
1081 | self.log.debug('normalizing root %s %d, weight %f, ' | |
1082 | 'ws sum %f, factor %f', | |
1083 | root, pe.root_ids[root], root_weight, | |
1084 | root_sum, factor) | |
1085 | for osd in actual.keys(): | |
1086 | next_ws[osd] = next_ws[osd] / factor | |
1087 | ||
1088 | # recalc | |
1089 | plan.compat_ws = copy.deepcopy(next_ws) | |
1090 | next_ms = plan.final_state() | |
94b18763 | 1091 | next_pe = self.calc_eval(next_ms, plan.pools) |
3efd9988 FG |
1092 | next_misplaced = next_ms.calc_misplaced_from(ms) |
1093 | self.log.debug('Step result score %f -> %f, misplacing %f', | |
1094 | best_pe.score, next_pe.score, next_misplaced) | |
1095 | ||
1096 | if next_misplaced > max_misplaced: | |
1097 | if best_pe.score < pe.score: | |
1098 | self.log.debug('Step misplaced %f > max %f, stopping', | |
1099 | next_misplaced, max_misplaced) | |
1100 | break | |
1101 | step /= 2.0 | |
1102 | next_ws = copy.deepcopy(best_ws) | |
1103 | next_ow = copy.deepcopy(best_ow) | |
1104 | self.log.debug('Step misplaced %f > max %f, reducing step to %f', | |
1105 | next_misplaced, max_misplaced, step) | |
1106 | else: | |
1107 | if next_pe.score > best_pe.score * 1.0001: | |
94b18763 | 1108 | bad_steps += 1 |
3efd9988 FG |
1109 | if bad_steps < 5 and random.randint(0, 100) < 70: |
1110 | self.log.debug('Score got worse, taking another step') | |
1111 | else: | |
1112 | step /= 2.0 | |
1113 | next_ws = copy.deepcopy(best_ws) | |
1114 | next_ow = copy.deepcopy(best_ow) | |
1115 | self.log.debug('Score got worse, trying smaller step %f', | |
1116 | step) | |
1117 | else: | |
1118 | bad_steps = 0 | |
1119 | best_pe = next_pe | |
91327a77 AA |
1120 | best_ws = copy.deepcopy(next_ws) |
1121 | best_ow = copy.deepcopy(next_ow) | |
3efd9988 FG |
1122 | if best_pe.score == 0: |
1123 | break | |
1124 | left -= 1 | |
1125 | ||
1126 | # allow a small regression if we are phasing out osd weights | |
1127 | fudge = 0 | |
81eedcae | 1128 | if best_ow != orig_osd_weight: |
3efd9988 FG |
1129 | fudge = .001 |
1130 | ||
1131 | if best_pe.score < pe.score + fudge: | |
1132 | self.log.info('Success, score %f -> %f', pe.score, best_pe.score) | |
1133 | plan.compat_ws = best_ws | |
1adf2230 | 1134 | for osd, w in six.iteritems(best_ow): |
3efd9988 FG |
1135 | if w != orig_osd_weight[osd]: |
1136 | self.log.debug('osd.%d reweight %f', osd, w) | |
1137 | plan.osd_weights[osd] = w | |
94b18763 | 1138 | return 0, '' |
3efd9988 FG |
1139 | else: |
1140 | self.log.info('Failed to find further optimization, score %f', | |
1141 | pe.score) | |
94b18763 FG |
1142 | plan.compat_ws = {} |
1143 | return -errno.EDOM, 'Unable to find further optimization, ' \ | |
1144 | 'change balancer mode and retry might help' | |
1145 | ||
1146 | def get_compat_weight_set_weights(self, ms): | |
11fdf7f2 | 1147 | if not CRUSHMap.have_default_choose_args(ms.crush_dump): |
94b18763 FG |
1148 | # enable compat weight-set first |
1149 | self.log.debug('ceph osd crush weight-set create-compat') | |
1150 | result = CommandResult('') | |
1151 | self.send_command(result, 'mon', '', json.dumps({ | |
1152 | 'prefix': 'osd crush weight-set create-compat', | |
1153 | 'format': 'json', | |
1154 | }), '') | |
1155 | r, outb, outs = result.wait() | |
1156 | if r != 0: | |
1157 | self.log.error('Error creating compat weight-set') | |
1158 | return | |
1159 | ||
1160 | result = CommandResult('') | |
1161 | self.send_command(result, 'mon', '', json.dumps({ | |
1162 | 'prefix': 'osd crush dump', | |
1163 | 'format': 'json', | |
1164 | }), '') | |
1165 | r, outb, outs = result.wait() | |
1166 | if r != 0: | |
1167 | self.log.error('Error dumping crush map') | |
1168 | return | |
1169 | try: | |
1170 | crushmap = json.loads(outb) | |
1171 | except: | |
1172 | raise RuntimeError('unable to parse crush map') | |
1173 | else: | |
1174 | crushmap = ms.crush_dump | |
3efd9988 | 1175 | |
11fdf7f2 | 1176 | raw = CRUSHMap.get_default_choose_args(crushmap) |
3efd9988 FG |
1177 | weight_set = {} |
1178 | for b in raw: | |
1179 | bucket = None | |
1180 | for t in crushmap['buckets']: | |
1181 | if t['id'] == b['bucket_id']: | |
1182 | bucket = t | |
1183 | break | |
1184 | if not bucket: | |
1185 | raise RuntimeError('could not find bucket %s' % b['bucket_id']) | |
1186 | self.log.debug('bucket items %s' % bucket['items']) | |
1187 | self.log.debug('weight set %s' % b['weight_set'][0]) | |
1188 | if len(bucket['items']) != len(b['weight_set'][0]): | |
1189 | raise RuntimeError('weight-set size does not match bucket items') | |
1190 | for pos in range(len(bucket['items'])): | |
1191 | weight_set[bucket['items'][pos]['id']] = b['weight_set'][0][pos] | |
1192 | ||
1193 | self.log.debug('weight_set weights %s' % weight_set) | |
1194 | return weight_set | |
1195 | ||
    def do_crush(self):
        """Stub for a future 'crush' balancing mode; currently only logs."""
        self.log.info('do_crush (not yet implemented)')
1198 | ||
    def do_osd_weight(self):
        """Stub for a future 'osd_weight' balancing mode; currently only logs."""
        self.log.info('do_osd_weight (not yet implemented)')
1201 | ||
1202 | def execute(self, plan): | |
1203 | self.log.info('Executing plan %s' % plan.name) | |
1204 | ||
1205 | commands = [] | |
1206 | ||
1207 | # compat weight-set | |
1208 | if len(plan.compat_ws) and \ | |
11fdf7f2 | 1209 | not CRUSHMap.have_default_choose_args(plan.initial.crush_dump): |
3efd9988 FG |
1210 | self.log.debug('ceph osd crush weight-set create-compat') |
1211 | result = CommandResult('') | |
1212 | self.send_command(result, 'mon', '', json.dumps({ | |
1213 | 'prefix': 'osd crush weight-set create-compat', | |
1214 | 'format': 'json', | |
1215 | }), '') | |
1216 | r, outb, outs = result.wait() | |
1217 | if r != 0: | |
1218 | self.log.error('Error creating compat weight-set') | |
94b18763 | 1219 | return r, outs |
3efd9988 | 1220 | |
1adf2230 | 1221 | for osd, weight in six.iteritems(plan.compat_ws): |
3efd9988 FG |
1222 | self.log.info('ceph osd crush weight-set reweight-compat osd.%d %f', |
1223 | osd, weight) | |
b32b8144 | 1224 | result = CommandResult('') |
3efd9988 FG |
1225 | self.send_command(result, 'mon', '', json.dumps({ |
1226 | 'prefix': 'osd crush weight-set reweight-compat', | |
1227 | 'format': 'json', | |
1228 | 'item': 'osd.%d' % osd, | |
1229 | 'weight': [weight], | |
b32b8144 | 1230 | }), '') |
3efd9988 FG |
1231 | commands.append(result) |
1232 | ||
1233 | # new_weight | |
1234 | reweightn = {} | |
1adf2230 | 1235 | for osd, weight in six.iteritems(plan.osd_weights): |
3efd9988 FG |
1236 | reweightn[str(osd)] = str(int(weight * float(0x10000))) |
1237 | if len(reweightn): | |
1238 | self.log.info('ceph osd reweightn %s', reweightn) | |
b32b8144 | 1239 | result = CommandResult('') |
3efd9988 FG |
1240 | self.send_command(result, 'mon', '', json.dumps({ |
1241 | 'prefix': 'osd reweightn', | |
1242 | 'format': 'json', | |
1243 | 'weights': json.dumps(reweightn), | |
b32b8144 | 1244 | }), '') |
3efd9988 FG |
1245 | commands.append(result) |
1246 | ||
1247 | # upmap | |
1248 | incdump = plan.inc.dump() | |
1249 | for pgid in incdump.get('old_pg_upmap_items', []): | |
1250 | self.log.info('ceph osd rm-pg-upmap-items %s', pgid) | |
1251 | result = CommandResult('foo') | |
1252 | self.send_command(result, 'mon', '', json.dumps({ | |
1253 | 'prefix': 'osd rm-pg-upmap-items', | |
1254 | 'format': 'json', | |
1255 | 'pgid': pgid, | |
1256 | }), 'foo') | |
1257 | commands.append(result) | |
1258 | ||
1259 | for item in incdump.get('new_pg_upmap_items', []): | |
1260 | self.log.info('ceph osd pg-upmap-items %s mappings %s', item['pgid'], | |
1261 | item['mappings']) | |
1262 | osdlist = [] | |
1263 | for m in item['mappings']: | |
1264 | osdlist += [m['from'], m['to']] | |
1265 | result = CommandResult('foo') | |
1266 | self.send_command(result, 'mon', '', json.dumps({ | |
1267 | 'prefix': 'osd pg-upmap-items', | |
1268 | 'format': 'json', | |
1269 | 'pgid': item['pgid'], | |
1270 | 'id': osdlist, | |
1271 | }), 'foo') | |
1272 | commands.append(result) | |
1273 | ||
1274 | # wait for commands | |
1275 | self.log.debug('commands %s' % commands) | |
1276 | for result in commands: | |
1277 | r, outb, outs = result.wait() | |
1278 | if r != 0: | |
94b18763 FG |
1279 | self.log.error('execute error: r = %d, detail = %s' % (r, outs)) |
1280 | return r, outs | |
3efd9988 | 1281 | self.log.debug('done') |
94b18763 | 1282 | return 0, '' |