]>
Commit | Line | Data |
---|---|---|
3efd9988 FG |
1 | |
2 | """ | |
3 | Balance PG distribution across OSDs. | |
4 | """ | |
5 | ||
6 | import copy | |
7 | import errno | |
8 | import json | |
9 | import math | |
10 | import random | |
1adf2230 | 11 | import six |
3efd9988 FG |
12 | import time |
13 | from mgr_module import MgrModule, CommandResult | |
14 | from threading import Event | |
b32b8144 | 15 | from mgr_module import CRUSHMap |
eafe8130 | 16 | import datetime |
3efd9988 | 17 | |
3efd9988 FG |
18 | TIME_FORMAT = '%Y-%m-%d_%H:%M:%S' |
19 | ||
3efd9988 FG |
class MappingState:
    """Snapshot of an osdmap plus pg dump, with precomputed PG up-mappings.

    Captures everything needed to compare PG placement between two points
    in time (e.g. before and after applying an optimization plan).
    """
    def __init__(self, osdmap, pg_dump, desc=''):
        self.desc = desc
        self.osdmap = osdmap
        self.osdmap_dump = self.osdmap.dump()
        self.crush = osdmap.get_crush()
        self.crush_dump = self.crush.dump()
        self.pg_dump = pg_dump
        # pgid -> stat_sum (per-PG object/byte counters)
        self.pg_stat = {
            i['pgid']: i['stat_sum'] for i in pg_dump.get('pg_stats', [])
        }
        # only consider pools present in *both* the osdmap and the pg dump;
        # a pool that is mid-creation/deletion may appear in only one
        osd_poolids = [p['pool'] for p in self.osdmap_dump.get('pools', [])]
        pg_poolids = [p['poolid'] for p in pg_dump.get('pool_stats', [])]
        self.poolids = set(osd_poolids) & set(pg_poolids)
        self.pg_up = {}             # pgid -> up osd set
        self.pg_up_by_poolid = {}   # poolid -> {pgid -> up osd set}
        for poolid in self.poolids:
            self.pg_up_by_poolid[poolid] = osdmap.map_pool_pgs_up(poolid)
            # dict.items() works on both py2 and py3; no need for six here
            for pgid, up in self.pg_up_by_poolid[poolid].items():
                self.pg_up[pgid] = up

    def calc_misplaced_from(self, other_ms):
        """Return the fraction of other_ms's PGs whose up set differs here."""
        num = len(other_ms.pg_up)
        misplaced = 0
        for pgid, before in other_ms.pg_up.items():
            if before != self.pg_up.get(pgid, []):
                misplaced += 1
        if num > 0:
            return float(misplaced) / float(num)
        return 0.0
50 | ||
class Plan:
    """A named optimization plan: an osdmap incremental plus weight changes."""
    def __init__(self, name, ms, pools):
        self.mode = 'unknown'
        self.name = name
        self.initial = ms      # MappingState the plan starts from
        self.pools = pools     # pool names the plan is restricted to

        self.osd_weights = {}  # osd id -> reweight value
        self.compat_ws = {}    # osd id -> crush-compat weight-set weight
        self.inc = ms.osdmap.new_incremental()

    def final_state(self):
        """Return a MappingState with this plan applied to the initial map."""
        self.inc.set_osd_reweights(self.osd_weights)
        self.inc.set_crush_compat_weight_set_weights(self.compat_ws)
        return MappingState(self.initial.osdmap.apply_incremental(self.inc),
                            self.initial.pg_dump,
                            'plan %s final' % self.name)

    def dump(self):
        """Return the pending osdmap incremental as pretty-printed JSON."""
        return json.dumps(self.inc.dump(), indent=4)

    def show(self):
        """Render the plan as the equivalent sequence of 'ceph' CLI commands."""
        ls = []
        ls.append('# starting osdmap epoch %d' % self.initial.osdmap.get_epoch())
        ls.append('# starting crush version %d' %
                  self.initial.osdmap.get_crush_version())
        ls.append('# mode %s' % self.mode)
        # the compat weight-set only needs creating if there isn't one already
        if len(self.compat_ws) and \
           not CRUSHMap.have_default_choose_args(self.initial.crush_dump):
            ls.append('ceph osd crush weight-set create-compat')
        # dict.items() works on both py2 and py3; no need for six here
        for osd, weight in self.compat_ws.items():
            ls.append('ceph osd crush weight-set reweight-compat %s %f' %
                      (osd, weight))
        for osd, weight in self.osd_weights.items():
            ls.append('ceph osd reweight osd.%d %f' % (osd, weight))
        incdump = self.inc.dump()
        for pgid in incdump.get('old_pg_upmap_items', []):
            ls.append('ceph osd rm-pg-upmap-items %s' % pgid)
        for item in incdump.get('new_pg_upmap_items', []):
            osdlist = []
            for m in item['mappings']:
                osdlist += [m['from'], m['to']]
            ls.append('ceph osd pg-upmap-items %s %s' %
                      (item['pgid'], ' '.join([str(a) for a in osdlist])))
        return '\n'.join(ls)
96 | ||
97 | ||
class Eval:
    """Evaluation of how evenly data is distributed for a MappingState."""
    def __init__(self, ms):
        self.ms = ms
        self.root_ids = {}        # root name -> id
        self.pool_name = {}       # pool id -> pool name
        self.pool_id = {}         # pool name -> id
        self.pool_roots = {}      # pool name -> root name
        self.root_pools = {}      # root name -> pools
        self.target_by_root = {}  # root name -> target weight map
        self.count_by_pool = {}
        self.count_by_root = {}
        self.actual_by_pool = {}  # pool -> by_* -> actual weight map
        self.actual_by_root = {}  # pool -> by_* -> actual weight map
        self.total_by_pool = {}   # pool -> by_* -> total
        self.total_by_root = {}   # root -> by_* -> total
        self.stats_by_pool = {}   # pool -> by_* -> stddev or avg -> value
        self.stats_by_root = {}   # root -> by_* -> stddev or avg -> value

        self.score_by_pool = {}
        self.score_by_root = {}

        self.score = 0.0

    def show(self, verbose=False):
        """Return a human-readable summary (full detail when verbose)."""
        if verbose:
            r = self.ms.desc + '\n'
            r += 'target_by_root %s\n' % self.target_by_root
            r += 'actual_by_pool %s\n' % self.actual_by_pool
            r += 'actual_by_root %s\n' % self.actual_by_root
            r += 'count_by_pool %s\n' % self.count_by_pool
            r += 'count_by_root %s\n' % self.count_by_root
            r += 'total_by_pool %s\n' % self.total_by_pool
            r += 'total_by_root %s\n' % self.total_by_root
            r += 'stats_by_root %s\n' % self.stats_by_root
            r += 'score_by_pool %s\n' % self.score_by_pool
            r += 'score_by_root %s\n' % self.score_by_root
        else:
            r = self.ms.desc + ' '
        r += 'score %f (lower is better)\n' % self.score
        return r

    def calc_stats(self, count, target, total):
        """Compute avg/stddev/score per metric ('pgs', 'objects', 'bytes').

        count  -- metric -> {osd -> raw count}
        target -- osd -> target (normalized) weight
        total  -- metric -> total count across all osds
        Returns metric -> {'avg', 'stddev', 'sum_weight', 'score'}.
        """
        num = max(len(target), 1)
        r = {}
        for t in ('pgs', 'objects', 'bytes'):
            if total[t] == 0:
                r[t] = {
                    'avg': 0,
                    'stddev': 0,
                    'sum_weight': 0,
                    'score': 0,
                }
                continue

            avg = float(total[t]) / float(num)
            dev = 0.0

            # score is a measure of how uneven the data distribution is.
            # score lies between [0, 1), 0 means perfect distribution.
            score = 0.0
            sum_weight = 0.0

            # dict.items() works on both py2 and py3; no need for six here
            for k, v in count[t].items():
                # adjust/normalize by weight
                if target[k]:
                    adjusted = float(v) / target[k] / float(num)
                else:
                    adjusted = 0.0

                # Overweighted devices and their weights are factors to
                # calculate reweight_urgency. One 10% underfilled device with
                # 5 2% overfilled devices is arguably a better situation than
                # one 10% overfilled with 5 2% underfilled devices.
                if adjusted > avg:
                    '''
                    F(x) = 2*phi(x) - 1, where phi(x) = cdf of standard normal distribution
                    x = (adjusted - avg)/avg.
                    Since we're considering only over-weighted devices, x >= 0, and so phi(x) lies in [0.5, 1).
                    To bring the range of F(x) into [0, 1), we need the above modification.

                    In general, we need a function F(x), where x = (adjusted - avg)/avg:
                    1. bounded between 0 and 1, so that reweight_urgency is also bounded;
                    2. a larger value of x should imply more urgency to reweight;
                    3. the difference between F(x) values when x is large should be minimal;
                    4. F(x) should approach 1 (highest urgency) steeply.

                    Could have used F(x) = (1 - e^(-x)), but it converges to 1
                    more slowly than the form in use.

                    cdf of standard normal distribution: https://stackoverflow.com/a/29273201
                    '''
                    score += target[k] * (math.erf(((adjusted - avg)/avg) / math.sqrt(2.0)))
                    sum_weight += target[k]
                dev += (avg - adjusted) * (avg - adjusted)
            stddev = math.sqrt(dev / float(max(num - 1, 1)))
            score = score / max(sum_weight, 1)
            r[t] = {
                'avg': avg,
                'stddev': stddev,
                'sum_weight': sum_weight,
                'score': score,
            }
        return r
199 | ||
200 | class Module(MgrModule): | |
11fdf7f2 TL |
    # Configuration options registered with the mgr framework; values are
    # read back at runtime via get_module_option() and written via
    # set_module_option().
    MODULE_OPTIONS = [
        {
            'name': 'active',
            'type': 'bool',
            'default': False,
            'desc': 'automatically balance PGs across cluster',
            'runtime': True,
        },
        {
            'name': 'begin_time',
            'type': 'str',
            'default': '0000',
            'desc': 'beginning time of day to automatically balance',
            'long_desc': 'This is a time of day in the format HHMM.',
            'runtime': True,
        },
        {
            'name': 'end_time',
            'type': 'str',
            'default': '2400',
            'desc': 'ending time of day to automatically balance',
            'long_desc': 'This is a time of day in the format HHMM.',
            'runtime': True,
        },
        {
            'name': 'begin_weekday',
            'type': 'uint',
            'default': 0,
            'min': 0,
            'max': 7,
            'desc': 'Restrict automatic balancing to this day of the week or later',
            'long_desc': '0 or 7 = Sunday, 1 = Monday, etc.',
            'runtime': True,
        },
        {
            'name': 'end_weekday',
            'type': 'uint',
            'default': 7,
            'min': 0,
            'max': 7,
            'desc': 'Restrict automatic balancing to days of the week earlier than this',
            'long_desc': '0 or 7 = Sunday, 1 = Monday, etc.',
            'runtime': True,
        },
        {
            'name': 'crush_compat_max_iterations',
            'type': 'uint',
            'default': 25,
            'min': 1,
            'max': 250,
            'desc': 'maximum number of iterations to attempt optimization',
            'runtime': True,
        },
        {
            'name': 'crush_compat_metrics',
            'type': 'str',
            'default': 'pgs,objects,bytes',
            'desc': 'metrics with which to calculate OSD utilization',
            'long_desc': 'Value is a list of one or more of "pgs", "objects", or "bytes", and indicates which metrics to use to balance utilization.',
            'runtime': True,
        },
        {
            'name': 'crush_compat_step',
            'type': 'float',
            'default': .5,
            'min': .001,
            'max': .999,
            'desc': 'aggressiveness of optimization',
            'long_desc': '.99 is very aggressive, .01 is less aggressive',
            'runtime': True,
        },
        {
            'name': 'min_score',
            'type': 'float',
            'default': 0,
            'desc': 'minimum score, below which no optimization is attempted',
            'runtime': True,
        },
        {
            'name': 'mode',
            'desc': 'Balancer mode',
            'default': 'none',
            'enum_allowed': ['none', 'crush-compat', 'upmap'],
            'runtime': True,
        },
        {
            'name': 'sleep_interval',
            'type': 'secs',
            'default': 60,
            'desc': 'how frequently to wake up and attempt optimization',
            'runtime': True,
        },
        {
            'name': 'upmap_max_iterations',
            'type': 'uint',
            'default': 10,
            'desc': 'maximum upmap optimization iterations',
            'runtime': True,
        },
        {
            'name': 'upmap_max_deviation',
            'type': 'int',
            'default': 5,
            'min': 1,
            'desc': 'deviation below which no optimization is attempted',
            'long_desc': 'If the number of PGs are within this count then no optimization is attempted',
            'runtime': True,
        },
        {
            'name': 'pool_ids',
            'type': 'str',
            'default': '',
            'desc': 'pools which the automatic balancing will be limited to',
            'runtime': True,
        },
    ]
317 | ||
3efd9988 FG |
    # CLI commands exposed by this module; dispatched in handle_command().
    COMMANDS = [
        {
            "cmd": "balancer status",
            "desc": "Show balancer status",
            "perm": "r",
        },
        {
            "cmd": "balancer mode name=mode,type=CephChoices,strings=none|crush-compat|upmap",
            "desc": "Set balancer mode",
            "perm": "rw",
        },
        {
            "cmd": "balancer on",
            "desc": "Enable automatic balancing",
            "perm": "rw",
        },
        {
            "cmd": "balancer off",
            "desc": "Disable automatic balancing",
            "perm": "rw",
        },
        {
            "cmd": "balancer pool ls",
            "desc": "List automatic balancing pools. "
                    "Note that empty list means all existing pools will be automatic balancing targets, "
                    "which is the default behaviour of balancer.",
            "perm": "r",
        },
        {
            "cmd": "balancer pool add name=pools,type=CephString,n=N",
            "desc": "Enable automatic balancing for specific pools",
            "perm": "rw",
        },
        {
            "cmd": "balancer pool rm name=pools,type=CephString,n=N",
            "desc": "Disable automatic balancing for specific pools",
            "perm": "rw",
        },
        {
            "cmd": "balancer eval name=option,type=CephString,req=false",
            "desc": "Evaluate data distribution for the current cluster or specific pool or specific plan",
            "perm": "r",
        },
        {
            "cmd": "balancer eval-verbose name=option,type=CephString,req=false",
            "desc": "Evaluate data distribution for the current cluster or specific pool or specific plan (verbosely)",
            "perm": "r",
        },
        {
            "cmd": "balancer optimize name=plan,type=CephString name=pools,type=CephString,n=N,req=false",
            "desc": "Run optimizer to create a new plan",
            "perm": "rw",
        },
        {
            "cmd": "balancer show name=plan,type=CephString",
            "desc": "Show details of an optimization plan",
            "perm": "r",
        },
        {
            "cmd": "balancer rm name=plan,type=CephString",
            "desc": "Discard an optimization plan",
            "perm": "rw",
        },
        {
            "cmd": "balancer reset",
            "desc": "Discard all optimization plans",
            "perm": "rw",
        },
        {
            "cmd": "balancer dump name=plan,type=CephString",
            "desc": "Show an optimization plan",
            "perm": "r",
        },
        {
            "cmd": "balancer ls",
            "desc": "List all plans",
            "perm": "r",
        },
        {
            "cmd": "balancer execute name=plan,type=CephString",
            "desc": "Execute an optimization plan",
            "perm": "rw",
        },
    ]
    # Module runtime state (class-level defaults; rebound per instance)
    active = False                # mirrors the 'active' module option
    run = True                    # cleared by shutdown() to stop serve()
    plans = {}                    # plan name -> Plan
    mode = ''
    optimizing = False            # True while an optimize() call is running
    last_optimize_started = ''
    last_optimize_duration = ''
    optimize_result = ''
    success_string = 'Optimization plan created successfully'
    in_progress_string = 'in progress'
3efd9988 FG |
412 | |
413 | def __init__(self, *args, **kwargs): | |
414 | super(Module, self).__init__(*args, **kwargs) | |
415 | self.event = Event() | |
416 | ||
11fdf7f2 | 417 | def handle_command(self, inbuf, command): |
3efd9988 FG |
418 | self.log.warn("Handling command: '%s'" % str(command)) |
419 | if command['prefix'] == 'balancer status': | |
420 | s = { | |
f64942e4 | 421 | 'plans': list(self.plans.keys()), |
3efd9988 | 422 | 'active': self.active, |
eafe8130 TL |
423 | 'last_optimize_started': self.last_optimize_started, |
424 | 'last_optimize_duration': self.last_optimize_duration, | |
425 | 'optimize_result': self.optimize_result, | |
11fdf7f2 | 426 | 'mode': self.get_module_option('mode'), |
3efd9988 FG |
427 | } |
428 | return (0, json.dumps(s, indent=4), '') | |
429 | elif command['prefix'] == 'balancer mode': | |
11fdf7f2 TL |
430 | if command['mode'] == 'upmap': |
431 | min_compat_client = self.get_osdmap().dump().get('require_min_compat_client', '') | |
432 | if min_compat_client < 'luminous': # works well because version is alphabetized.. | |
433 | warn = 'min_compat_client "%s" ' \ | |
434 | '< "luminous", which is required for pg-upmap. ' \ | |
435 | 'Try "ceph osd set-require-min-compat-client luminous" ' \ | |
436 | 'before enabling this mode' % min_compat_client | |
437 | return (-errno.EPERM, '', warn) | |
81eedcae TL |
438 | elif command['mode'] == 'crush-compat': |
439 | ms = MappingState(self.get_osdmap(), | |
440 | self.get("pg_dump"), | |
441 | 'initialize compat weight-set') | |
442 | self.get_compat_weight_set_weights(ms) # ignore error | |
11fdf7f2 | 443 | self.set_module_option('mode', command['mode']) |
3efd9988 FG |
444 | return (0, '', '') |
445 | elif command['prefix'] == 'balancer on': | |
446 | if not self.active: | |
11fdf7f2 | 447 | self.set_module_option('active', 'true') |
3efd9988 FG |
448 | self.active = True |
449 | self.event.set() | |
450 | return (0, '', '') | |
451 | elif command['prefix'] == 'balancer off': | |
452 | if self.active: | |
11fdf7f2 | 453 | self.set_module_option('active', 'false') |
3efd9988 FG |
454 | self.active = False |
455 | self.event.set() | |
456 | return (0, '', '') | |
11fdf7f2 TL |
457 | elif command['prefix'] == 'balancer pool ls': |
458 | pool_ids = self.get_module_option('pool_ids') | |
459 | if pool_ids is '': | |
460 | return (0, '', '') | |
461 | pool_ids = pool_ids.split(',') | |
462 | pool_ids = [int(p) for p in pool_ids] | |
463 | pool_name_by_id = dict((p['pool'], p['pool_name']) for p in self.get_osdmap().dump().get('pools', [])) | |
464 | should_prune = False | |
465 | final_ids = [] | |
466 | final_names = [] | |
467 | for p in pool_ids: | |
468 | if p in pool_name_by_id: | |
469 | final_ids.append(p) | |
470 | final_names.append(pool_name_by_id[p]) | |
471 | else: | |
472 | should_prune = True | |
473 | if should_prune: # some pools were gone, prune | |
474 | self.set_module_option('pool_ids', ','.join(final_ids)) | |
475 | return (0, json.dumps(final_names, indent=4), '') | |
476 | elif command['prefix'] == 'balancer pool add': | |
477 | raw_names = command['pools'] | |
478 | pool_id_by_name = dict((p['pool_name'], p['pool']) for p in self.get_osdmap().dump().get('pools', [])) | |
479 | invalid_names = [p for p in raw_names if p not in pool_id_by_name] | |
480 | if invalid_names: | |
481 | return (-errno.EINVAL, '', 'pool(s) %s not found' % invalid_names) | |
482 | to_add = [str(pool_id_by_name[p]) for p in raw_names if p in pool_id_by_name] | |
483 | existing = self.get_module_option('pool_ids') | |
484 | final = to_add | |
485 | if existing is not '': | |
486 | existing = existing.split(',') | |
487 | final = set(to_add) | set(existing) | |
488 | self.set_module_option('pool_ids', ','.join(final)) | |
489 | return (0, '', '') | |
490 | elif command['prefix'] == 'balancer pool rm': | |
491 | raw_names = command['pools'] | |
492 | existing = self.get_module_option('pool_ids') | |
493 | if existing is '': # for idempotence | |
494 | return (0, '', '') | |
495 | existing = existing.split(',') | |
496 | osdmap = self.get_osdmap() | |
497 | pool_ids = [str(p['pool']) for p in osdmap.dump().get('pools', [])] | |
498 | pool_id_by_name = dict((p['pool_name'], p['pool']) for p in osdmap.dump().get('pools', [])) | |
499 | final = [p for p in existing if p in pool_ids] | |
500 | to_delete = [str(pool_id_by_name[p]) for p in raw_names if p in pool_id_by_name] | |
501 | final = set(final) - set(to_delete) | |
502 | self.set_module_option('pool_ids', ','.join(final)) | |
503 | return (0, '', '') | |
3efd9988 FG |
504 | elif command['prefix'] == 'balancer eval' or command['prefix'] == 'balancer eval-verbose': |
505 | verbose = command['prefix'] == 'balancer eval-verbose' | |
94b18763 FG |
506 | pools = [] |
507 | if 'option' in command: | |
508 | plan = self.plans.get(command['option']) | |
3efd9988 | 509 | if not plan: |
94b18763 FG |
510 | # not a plan, does it look like a pool? |
511 | osdmap = self.get_osdmap() | |
512 | valid_pool_names = [p['pool_name'] for p in osdmap.dump().get('pools', [])] | |
513 | option = command['option'] | |
514 | if option not in valid_pool_names: | |
515 | return (-errno.EINVAL, '', 'option "%s" not a plan or a pool' % option) | |
516 | pools.append(option) | |
517 | ms = MappingState(osdmap, self.get("pg_dump"), 'pool "%s"' % option) | |
518 | else: | |
519 | pools = plan.pools | |
520 | ms = plan.final_state() | |
3efd9988 FG |
521 | else: |
522 | ms = MappingState(self.get_osdmap(), | |
523 | self.get("pg_dump"), | |
524 | 'current cluster') | |
94b18763 | 525 | return (0, self.evaluate(ms, pools, verbose=verbose), '') |
3efd9988 | 526 | elif command['prefix'] == 'balancer optimize': |
eafe8130 TL |
527 | # The GIL can be release by the active balancer, so disallow when active |
528 | if self.active: | |
529 | return (-errno.EINVAL, '', 'Balancer enabled, disable to optimize manually') | |
530 | if self.optimizing: | |
531 | return (-errno.EINVAL, '', 'Balancer finishing up....try again') | |
94b18763 FG |
532 | pools = [] |
533 | if 'pools' in command: | |
534 | pools = command['pools'] | |
535 | osdmap = self.get_osdmap() | |
536 | valid_pool_names = [p['pool_name'] for p in osdmap.dump().get('pools', [])] | |
537 | invalid_pool_names = [] | |
538 | for p in pools: | |
539 | if p not in valid_pool_names: | |
540 | invalid_pool_names.append(p) | |
541 | if len(invalid_pool_names): | |
542 | return (-errno.EINVAL, '', 'pools %s not found' % invalid_pool_names) | |
543 | plan = self.plan_create(command['plan'], osdmap, pools) | |
eafe8130 TL |
544 | self.last_optimize_started = time.asctime(time.localtime()) |
545 | self.optimize_result = self.in_progress_string | |
546 | start = time.time() | |
94b18763 | 547 | r, detail = self.optimize(plan) |
eafe8130 TL |
548 | end = time.time() |
549 | self.last_optimize_duration = str(datetime.timedelta(seconds=(end - start))) | |
550 | if r == 0: | |
551 | # Add plan if an optimization was created | |
552 | self.optimize_result = self.success_string | |
553 | self.plans[command['plan']] = plan | |
554 | else: | |
555 | self.optimize_result = detail | |
94b18763 | 556 | return (r, '', detail) |
3efd9988 | 557 | elif command['prefix'] == 'balancer rm': |
b32b8144 | 558 | self.plan_rm(command['plan']) |
3efd9988 FG |
559 | return (0, '', '') |
560 | elif command['prefix'] == 'balancer reset': | |
561 | self.plans = {} | |
562 | return (0, '', '') | |
f64942e4 AA |
563 | elif command['prefix'] == 'balancer ls': |
564 | return (0, json.dumps([p for p in self.plans], indent=4), '') | |
3efd9988 FG |
565 | elif command['prefix'] == 'balancer dump': |
566 | plan = self.plans.get(command['plan']) | |
567 | if not plan: | |
568 | return (-errno.ENOENT, '', 'plan %s not found' % command['plan']) | |
569 | return (0, plan.dump(), '') | |
570 | elif command['prefix'] == 'balancer show': | |
571 | plan = self.plans.get(command['plan']) | |
572 | if not plan: | |
573 | return (-errno.ENOENT, '', 'plan %s not found' % command['plan']) | |
574 | return (0, plan.show(), '') | |
575 | elif command['prefix'] == 'balancer execute': | |
eafe8130 TL |
576 | # The GIL can be release by the active balancer, so disallow when active |
577 | if self.active: | |
578 | return (-errno.EINVAL, '', 'Balancer enabled, disable to execute a plan') | |
579 | if self.optimizing: | |
580 | return (-errno.EINVAL, '', 'Balancer finishing up....try again') | |
3efd9988 FG |
581 | plan = self.plans.get(command['plan']) |
582 | if not plan: | |
583 | return (-errno.ENOENT, '', 'plan %s not found' % command['plan']) | |
94b18763 FG |
584 | r, detail = self.execute(plan) |
585 | self.plan_rm(command['plan']) | |
586 | return (r, '', detail) | |
3efd9988 FG |
587 | else: |
588 | return (-errno.EINVAL, '', | |
589 | "Command not found '{0}'".format(command['prefix'])) | |
590 | ||
591 | def shutdown(self): | |
592 | self.log.info('Stopping') | |
593 | self.run = False | |
594 | self.event.set() | |
595 | ||
a8e16298 TL |
596 | def time_permit(self): |
597 | local_time = time.localtime() | |
598 | time_of_day = time.strftime('%H%M', local_time) | |
599 | weekday = (local_time.tm_wday + 1) % 7 # be compatible with C | |
600 | permit = False | |
601 | ||
11fdf7f2 TL |
602 | begin_time = self.get_module_option('begin_time') |
603 | end_time = self.get_module_option('end_time') | |
a8e16298 TL |
604 | if begin_time <= end_time: |
605 | permit = begin_time <= time_of_day < end_time | |
606 | else: | |
607 | permit = time_of_day >= begin_time or time_of_day < end_time | |
608 | if not permit: | |
609 | self.log.debug("should run between %s - %s, now %s, skipping", | |
610 | begin_time, end_time, time_of_day) | |
611 | return False | |
612 | ||
11fdf7f2 TL |
613 | begin_weekday = self.get_module_option('begin_weekday') |
614 | end_weekday = self.get_module_option('end_weekday') | |
a8e16298 TL |
615 | if begin_weekday <= end_weekday: |
616 | permit = begin_weekday <= weekday < end_weekday | |
3efd9988 | 617 | else: |
a8e16298 TL |
618 | permit = weekday >= begin_weekday or weekday < end_weekday |
619 | if not permit: | |
620 | self.log.debug("should run between weekday %d - %d, now %d, skipping", | |
621 | begin_weekday, end_weekday, weekday) | |
622 | return False | |
623 | ||
624 | return True | |
3efd9988 FG |
625 | |
626 | def serve(self): | |
627 | self.log.info('Starting') | |
628 | while self.run: | |
11fdf7f2 TL |
629 | self.active = self.get_module_option('active') |
630 | sleep_interval = self.get_module_option('sleep_interval') | |
a8e16298 TL |
631 | self.log.debug('Waking up [%s, now %s]', |
632 | "active" if self.active else "inactive", | |
633 | time.strftime(TIME_FORMAT, time.localtime())) | |
634 | if self.active and self.time_permit(): | |
3efd9988 FG |
635 | self.log.debug('Running') |
636 | name = 'auto_%s' % time.strftime(TIME_FORMAT, time.gmtime()) | |
11fdf7f2 TL |
637 | osdmap = self.get_osdmap() |
638 | allow = self.get_module_option('pool_ids') | |
639 | final = [] | |
640 | if allow is not '': | |
641 | allow = allow.split(',') | |
642 | valid = [str(p['pool']) for p in osdmap.dump().get('pools', [])] | |
643 | final = set(allow) & set(valid) | |
644 | if set(allow) - set(valid): # some pools were gone, prune | |
645 | self.set_module_option('pool_ids', ','.join(final)) | |
646 | pool_name_by_id = dict((p['pool'], p['pool_name']) for p in osdmap.dump().get('pools', [])) | |
647 | final = [int(p) for p in final] | |
648 | final = [pool_name_by_id[p] for p in final if p in pool_name_by_id] | |
649 | plan = self.plan_create(name, osdmap, final) | |
eafe8130 TL |
650 | self.optimizing = True |
651 | self.last_optimize_started = time.asctime(time.localtime()) | |
652 | self.optimize_result = self.in_progress_string | |
653 | start = time.time() | |
94b18763 | 654 | r, detail = self.optimize(plan) |
eafe8130 TL |
655 | end = time.time() |
656 | self.last_optimize_duration = str(datetime.timedelta(seconds=(end - start))) | |
94b18763 | 657 | if r == 0: |
eafe8130 | 658 | self.optimize_result = self.success_string |
3efd9988 | 659 | self.execute(plan) |
eafe8130 TL |
660 | else: |
661 | self.optimize_result = detail | |
662 | self.optimizing = False | |
3efd9988 FG |
663 | self.log.debug('Sleeping for %d', sleep_interval) |
664 | self.event.wait(sleep_interval) | |
665 | self.event.clear() | |
666 | ||
94b18763 FG |
667 | def plan_create(self, name, osdmap, pools): |
668 | plan = Plan(name, | |
669 | MappingState(osdmap, | |
670 | self.get("pg_dump"), | |
671 | 'plan %s initial' % name), | |
672 | pools) | |
3efd9988 FG |
673 | return plan |
674 | ||
675 | def plan_rm(self, name): | |
676 | if name in self.plans: | |
677 | del self.plans[name] | |
678 | ||
94b18763 | 679 | def calc_eval(self, ms, pools): |
3efd9988 FG |
680 | pe = Eval(ms) |
681 | pool_rule = {} | |
682 | pool_info = {} | |
683 | for p in ms.osdmap_dump.get('pools',[]): | |
94b18763 FG |
684 | if len(pools) and p['pool_name'] not in pools: |
685 | continue | |
686 | # skip dead or not-yet-ready pools too | |
687 | if p['pool'] not in ms.poolids: | |
688 | continue | |
3efd9988 FG |
689 | pe.pool_name[p['pool']] = p['pool_name'] |
690 | pe.pool_id[p['pool_name']] = p['pool'] | |
691 | pool_rule[p['pool_name']] = p['crush_rule'] | |
692 | pe.pool_roots[p['pool_name']] = [] | |
693 | pool_info[p['pool_name']] = p | |
94b18763 | 694 | if len(pool_info) == 0: |
3efd9988 FG |
695 | return pe |
696 | self.log.debug('pool_name %s' % pe.pool_name) | |
697 | self.log.debug('pool_id %s' % pe.pool_id) | |
698 | self.log.debug('pools %s' % pools) | |
699 | self.log.debug('pool_rule %s' % pool_rule) | |
700 | ||
701 | osd_weight = { a['osd']: a['weight'] | |
94b18763 | 702 | for a in ms.osdmap_dump.get('osds',[]) if a['weight'] > 0 } |
3efd9988 FG |
703 | |
704 | # get expected distributions by root | |
705 | actual_by_root = {} | |
706 | rootids = ms.crush.find_takes() | |
707 | roots = [] | |
708 | for rootid in rootids: | |
3efd9988 | 709 | ls = ms.osdmap.get_pools_by_take(rootid) |
94b18763 FG |
710 | want = [] |
711 | # find out roots associating with pools we are passed in | |
712 | for candidate in ls: | |
713 | if candidate in pe.pool_name: | |
714 | want.append(candidate) | |
715 | if len(want) == 0: | |
716 | continue | |
717 | root = ms.crush.get_item_name(rootid) | |
3efd9988 | 718 | pe.root_pools[root] = [] |
94b18763 | 719 | for poolid in want: |
3efd9988 FG |
720 | pe.pool_roots[pe.pool_name[poolid]].append(root) |
721 | pe.root_pools[root].append(pe.pool_name[poolid]) | |
94b18763 FG |
722 | pe.root_ids[root] = rootid |
723 | roots.append(root) | |
3efd9988 FG |
724 | weight_map = ms.crush.get_take_weight_osd_map(rootid) |
725 | adjusted_map = { | |
94b18763 | 726 | osd: cw * osd_weight[osd] |
1adf2230 | 727 | for osd,cw in six.iteritems(weight_map) if osd in osd_weight and cw > 0 |
3efd9988 | 728 | } |
94b18763 FG |
729 | sum_w = sum(adjusted_map.values()) |
730 | assert len(adjusted_map) == 0 or sum_w > 0 | |
3efd9988 | 731 | pe.target_by_root[root] = { osd: w / sum_w |
1adf2230 | 732 | for osd,w in six.iteritems(adjusted_map) } |
3efd9988 FG |
733 | actual_by_root[root] = { |
734 | 'pgs': {}, | |
735 | 'objects': {}, | |
736 | 'bytes': {}, | |
737 | } | |
11fdf7f2 | 738 | for osd in pe.target_by_root[root]: |
3efd9988 FG |
739 | actual_by_root[root]['pgs'][osd] = 0 |
740 | actual_by_root[root]['objects'][osd] = 0 | |
741 | actual_by_root[root]['bytes'][osd] = 0 | |
742 | pe.total_by_root[root] = { | |
743 | 'pgs': 0, | |
744 | 'objects': 0, | |
745 | 'bytes': 0, | |
746 | } | |
747 | self.log.debug('pool_roots %s' % pe.pool_roots) | |
748 | self.log.debug('root_pools %s' % pe.root_pools) | |
749 | self.log.debug('target_by_root %s' % pe.target_by_root) | |
750 | ||
751 | # pool and root actual | |
1adf2230 | 752 | for pool, pi in six.iteritems(pool_info): |
3efd9988 FG |
753 | poolid = pi['pool'] |
754 | pm = ms.pg_up_by_poolid[poolid] | |
755 | pgs = 0 | |
756 | objects = 0 | |
757 | bytes = 0 | |
758 | pgs_by_osd = {} | |
759 | objects_by_osd = {} | |
760 | bytes_by_osd = {} | |
761 | for root in pe.pool_roots[pool]: | |
11fdf7f2 | 762 | for osd in pe.target_by_root[root]: |
3efd9988 FG |
763 | pgs_by_osd[osd] = 0 |
764 | objects_by_osd[osd] = 0 | |
765 | bytes_by_osd[osd] = 0 | |
1adf2230 | 766 | for pgid, up in six.iteritems(pm): |
3efd9988 | 767 | for osd in [int(osd) for osd in up]: |
b32b8144 FG |
768 | if osd == CRUSHMap.ITEM_NONE: |
769 | continue | |
3efd9988 FG |
770 | pgs_by_osd[osd] += 1 |
771 | objects_by_osd[osd] += ms.pg_stat[pgid]['num_objects'] | |
772 | bytes_by_osd[osd] += ms.pg_stat[pgid]['num_bytes'] | |
773 | # pick a root to associate this pg instance with. | |
774 | # note that this is imprecise if the roots have | |
775 | # overlapping children. | |
776 | # FIXME: divide bytes by k for EC pools. | |
777 | for root in pe.pool_roots[pool]: | |
778 | if osd in pe.target_by_root[root]: | |
779 | actual_by_root[root]['pgs'][osd] += 1 | |
780 | actual_by_root[root]['objects'][osd] += ms.pg_stat[pgid]['num_objects'] | |
781 | actual_by_root[root]['bytes'][osd] += ms.pg_stat[pgid]['num_bytes'] | |
782 | pgs += 1 | |
783 | objects += ms.pg_stat[pgid]['num_objects'] | |
784 | bytes += ms.pg_stat[pgid]['num_bytes'] | |
785 | pe.total_by_root[root]['pgs'] += 1 | |
786 | pe.total_by_root[root]['objects'] += ms.pg_stat[pgid]['num_objects'] | |
787 | pe.total_by_root[root]['bytes'] += ms.pg_stat[pgid]['num_bytes'] | |
788 | break | |
789 | pe.count_by_pool[pool] = { | |
790 | 'pgs': { | |
791 | k: v | |
1adf2230 | 792 | for k, v in six.iteritems(pgs_by_osd) |
3efd9988 FG |
793 | }, |
794 | 'objects': { | |
795 | k: v | |
1adf2230 | 796 | for k, v in six.iteritems(objects_by_osd) |
3efd9988 FG |
797 | }, |
798 | 'bytes': { | |
799 | k: v | |
1adf2230 | 800 | for k, v in six.iteritems(bytes_by_osd) |
3efd9988 FG |
801 | }, |
802 | } | |
803 | pe.actual_by_pool[pool] = { | |
804 | 'pgs': { | |
805 | k: float(v) / float(max(pgs, 1)) | |
1adf2230 | 806 | for k, v in six.iteritems(pgs_by_osd) |
3efd9988 FG |
807 | }, |
808 | 'objects': { | |
809 | k: float(v) / float(max(objects, 1)) | |
1adf2230 | 810 | for k, v in six.iteritems(objects_by_osd) |
3efd9988 FG |
811 | }, |
812 | 'bytes': { | |
813 | k: float(v) / float(max(bytes, 1)) | |
1adf2230 | 814 | for k, v in six.iteritems(bytes_by_osd) |
3efd9988 FG |
815 | }, |
816 | } | |
817 | pe.total_by_pool[pool] = { | |
818 | 'pgs': pgs, | |
819 | 'objects': objects, | |
820 | 'bytes': bytes, | |
821 | } | |
11fdf7f2 | 822 | for root in pe.total_by_root: |
3efd9988 FG |
823 | pe.count_by_root[root] = { |
824 | 'pgs': { | |
825 | k: float(v) | |
1adf2230 | 826 | for k, v in six.iteritems(actual_by_root[root]['pgs']) |
3efd9988 FG |
827 | }, |
828 | 'objects': { | |
829 | k: float(v) | |
1adf2230 | 830 | for k, v in six.iteritems(actual_by_root[root]['objects']) |
3efd9988 FG |
831 | }, |
832 | 'bytes': { | |
833 | k: float(v) | |
1adf2230 | 834 | for k, v in six.iteritems(actual_by_root[root]['bytes']) |
3efd9988 FG |
835 | }, |
836 | } | |
837 | pe.actual_by_root[root] = { | |
838 | 'pgs': { | |
839 | k: float(v) / float(max(pe.total_by_root[root]['pgs'], 1)) | |
1adf2230 | 840 | for k, v in six.iteritems(actual_by_root[root]['pgs']) |
3efd9988 FG |
841 | }, |
842 | 'objects': { | |
843 | k: float(v) / float(max(pe.total_by_root[root]['objects'], 1)) | |
1adf2230 | 844 | for k, v in six.iteritems(actual_by_root[root]['objects']) |
3efd9988 FG |
845 | }, |
846 | 'bytes': { | |
847 | k: float(v) / float(max(pe.total_by_root[root]['bytes'], 1)) | |
1adf2230 | 848 | for k, v in six.iteritems(actual_by_root[root]['bytes']) |
3efd9988 FG |
849 | }, |
850 | } | |
851 | self.log.debug('actual_by_pool %s' % pe.actual_by_pool) | |
852 | self.log.debug('actual_by_root %s' % pe.actual_by_root) | |
853 | ||
854 | # average and stddev and score | |
855 | pe.stats_by_root = { | |
856 | a: pe.calc_stats( | |
857 | b, | |
858 | pe.target_by_root[a], | |
859 | pe.total_by_root[a] | |
1adf2230 | 860 | ) for a, b in six.iteritems(pe.count_by_root) |
3efd9988 | 861 | } |
94b18763 | 862 | self.log.debug('stats_by_root %s' % pe.stats_by_root) |
3efd9988 FG |
863 | |
864 | # the scores are already normalized | |
865 | pe.score_by_root = { | |
866 | r: { | |
867 | 'pgs': pe.stats_by_root[r]['pgs']['score'], | |
868 | 'objects': pe.stats_by_root[r]['objects']['score'], | |
869 | 'bytes': pe.stats_by_root[r]['bytes']['score'], | |
870 | } for r in pe.total_by_root.keys() | |
871 | } | |
94b18763 | 872 | self.log.debug('score_by_root %s' % pe.score_by_root) |
3efd9988 | 873 | |
f64942e4 | 874 | # get the list of score metrics, comma separated |
11fdf7f2 | 875 | metrics = self.get_module_option('crush_compat_metrics').split(',') |
f64942e4 | 876 | |
3efd9988 FG |
877 | # total score is just average of normalized stddevs |
878 | pe.score = 0.0 | |
1adf2230 AA |
879 | for r, vs in six.iteritems(pe.score_by_root): |
880 | for k, v in six.iteritems(vs): | |
f64942e4 AA |
881 | if k in metrics: |
882 | pe.score += v | |
883 | pe.score /= len(metrics) * len(roots) | |
3efd9988 FG |
884 | return pe |
885 | ||
94b18763 FG |
886 | def evaluate(self, ms, pools, verbose=False): |
887 | pe = self.calc_eval(ms, pools) | |
3efd9988 FG |
888 | return pe.show(verbose=verbose) |
889 | ||
890 | def optimize(self, plan): | |
891 | self.log.info('Optimize plan %s' % plan.name) | |
11fdf7f2 TL |
892 | plan.mode = self.get_module_option('mode') |
893 | max_misplaced = float(self.get_ceph_option('target_max_misplaced_ratio')) | |
3efd9988 FG |
894 | self.log.info('Mode %s, max misplaced %f' % |
895 | (plan.mode, max_misplaced)) | |
896 | ||
897 | info = self.get('pg_status') | |
898 | unknown = info.get('unknown_pgs_ratio', 0.0) | |
899 | degraded = info.get('degraded_ratio', 0.0) | |
900 | inactive = info.get('inactive_pgs_ratio', 0.0) | |
901 | misplaced = info.get('misplaced_ratio', 0.0) | |
902 | self.log.debug('unknown %f degraded %f inactive %f misplaced %g', | |
903 | unknown, degraded, inactive, misplaced) | |
904 | if unknown > 0.0: | |
94b18763 FG |
905 | detail = 'Some PGs (%f) are unknown; try again later' % unknown |
906 | self.log.info(detail) | |
907 | return -errno.EAGAIN, detail | |
3efd9988 | 908 | elif degraded > 0.0: |
94b18763 FG |
909 | detail = 'Some objects (%f) are degraded; try again later' % degraded |
910 | self.log.info(detail) | |
911 | return -errno.EAGAIN, detail | |
3efd9988 | 912 | elif inactive > 0.0: |
94b18763 FG |
913 | detail = 'Some PGs (%f) are inactive; try again later' % inactive |
914 | self.log.info(detail) | |
915 | return -errno.EAGAIN, detail | |
3efd9988 | 916 | elif misplaced >= max_misplaced: |
94b18763 FG |
917 | detail = 'Too many objects (%f > %f) are misplaced; ' \ |
918 | 'try again later' % (misplaced, max_misplaced) | |
919 | self.log.info(detail) | |
920 | return -errno.EAGAIN, detail | |
3efd9988 FG |
921 | else: |
922 | if plan.mode == 'upmap': | |
923 | return self.do_upmap(plan) | |
924 | elif plan.mode == 'crush-compat': | |
925 | return self.do_crush_compat(plan) | |
926 | elif plan.mode == 'none': | |
94b18763 | 927 | detail = 'Please do "ceph balancer mode" to choose a valid mode first' |
3efd9988 | 928 | self.log.info('Idle') |
94b18763 | 929 | return -errno.ENOEXEC, detail |
3efd9988 | 930 | else: |
94b18763 FG |
931 | detail = 'Unrecognized mode %s' % plan.mode |
932 | self.log.info(detail) | |
933 | return -errno.EINVAL, detail | |
3efd9988 FG |
934 | ## |
935 | ||
936 | def do_upmap(self, plan): | |
937 | self.log.info('do_upmap') | |
11fdf7f2 TL |
938 | max_iterations = self.get_module_option('upmap_max_iterations') |
939 | max_deviation = self.get_module_option('upmap_max_deviation') | |
3efd9988 FG |
940 | |
941 | ms = plan.initial | |
94b18763 FG |
942 | if len(plan.pools): |
943 | pools = plan.pools | |
944 | else: # all | |
945 | pools = [str(i['pool_name']) for i in ms.osdmap_dump.get('pools',[])] | |
3efd9988 | 946 | if len(pools) == 0: |
94b18763 FG |
947 | detail = 'No pools available' |
948 | self.log.info(detail) | |
949 | return -errno.ENOENT, detail | |
92f5a8d4 TL |
950 | # shuffle pool list so they all get equal (in)attention |
951 | random.shuffle(pools) | |
952 | self.log.info('pools %s' % pools) | |
3efd9988 | 953 | |
92f5a8d4 | 954 | adjusted_pools = [] |
3efd9988 FG |
955 | inc = plan.inc |
956 | total_did = 0 | |
957 | left = max_iterations | |
11fdf7f2 TL |
958 | osdmap_dump = self.get_osdmap().dump() |
959 | pools_with_pg_merge = [p['pool_name'] for p in osdmap_dump.get('pools', []) | |
960 | if p['pg_num'] > p['pg_num_target']] | |
961 | crush_rule_by_pool_name = dict((p['pool_name'], p['crush_rule']) for p in osdmap_dump.get('pools', [])) | |
3efd9988 | 962 | for pool in pools: |
11fdf7f2 TL |
963 | if pool not in crush_rule_by_pool_name: |
964 | self.log.info('pool %s does not exist' % pool) | |
965 | continue | |
966 | if pool in pools_with_pg_merge: | |
967 | self.log.info('pool %s has pending PG(s) for merging, skipping for now' % pool) | |
968 | continue | |
92f5a8d4 | 969 | adjusted_pools.append(pool) |
11fdf7f2 | 970 | # shuffle so all pools get equal (in)attention |
92f5a8d4 TL |
971 | random.shuffle(adjusted_pools) |
972 | for pool in adjusted_pools: | |
973 | did = ms.osdmap.calc_pg_upmaps(inc, max_deviation, left, [pool]) | |
3efd9988 FG |
974 | total_did += did |
975 | left -= did | |
976 | if left <= 0: | |
977 | break | |
978 | self.log.info('prepared %d/%d changes' % (total_did, max_iterations)) | |
94b18763 | 979 | if total_did == 0: |
11fdf7f2 | 980 | return -errno.EALREADY, 'Unable to find further optimization, ' \ |
92f5a8d4 | 981 | 'or pool(s) pg_num is decreasing, ' \ |
94b18763 FG |
982 | 'or distribution is already perfect' |
983 | return 0, '' | |
3efd9988 FG |
984 | |
    def do_crush_compat(self, plan):
        """Optimize placement by adjusting the compat weight-set (mode
        'crush-compat').

        Iteratively nudges per-osd weight-set entries toward each osd's
        target share, re-evaluating after every step and backing off
        (halving *step*) when the score regresses or too much data would
        be misplaced.

        :param plan: Plan whose compat_ws / osd_weights will be filled in
        :return: (rc, detail) tuple; rc == 0 on success
        """
        self.log.info('do_crush_compat')
        # validate tunables before doing any work
        max_iterations = self.get_module_option('crush_compat_max_iterations')
        if max_iterations < 1:
            return -errno.EINVAL, '"crush_compat_max_iterations" must be >= 1'
        step = self.get_module_option('crush_compat_step')
        if step <= 0 or step >= 1.0:
            return -errno.EINVAL, '"crush_compat_step" must be in (0, 1)'
        max_misplaced = float(self.get_ceph_option('target_max_misplaced_ratio'))
        min_pg_per_osd = 2

        ms = plan.initial
        osdmap = ms.osdmap
        crush = osdmap.get_crush()
        pe = self.calc_eval(ms, plan.pools)
        min_score_to_optimize = self.get_module_option('min_score')
        # nothing to do if the current score is already good enough
        if pe.score <= min_score_to_optimize:
            if pe.score == 0:
                detail = 'Distribution is already perfect'
            else:
                detail = 'score %f <= min_score %f, will not optimize' \
                         % (pe.score, min_score_to_optimize)
            self.log.info(detail)
            return -errno.EALREADY, detail

        # get current osd reweights
        orig_osd_weight = { a['osd']: a['weight']
                            for a in ms.osdmap_dump.get('osds',[]) }
        # NOTE(review): reweighted_osds is computed but not used below
        reweighted_osds = [ a for a,b in six.iteritems(orig_osd_weight)
                            if b < 1.0 and b > 0.0 ]

        # get current compat weight-set weights
        orig_ws = self.get_compat_weight_set_weights(ms)
        if not orig_ws:
            return -errno.EAGAIN, 'compat weight-set not available'
        # keep only osd entries (non-negative crush ids)
        orig_ws = { a: b for a, b in six.iteritems(orig_ws) if a >= 0 }

        # Make sure roots don't overlap their devices.  If so, we
        # can't proceed.
        roots = list(pe.target_by_root.keys())
        self.log.debug('roots %s', roots)
        visited = {}
        overlap = {}
        # NOTE(review): root_ids is unused in this method
        root_ids = {}
        for root, wm in six.iteritems(pe.target_by_root):
            for osd in wm:
                if osd in visited:
                    if osd not in overlap:
                        overlap[osd] = [ visited[osd] ]
                    overlap[osd].append(root)
                visited[osd] = root
        if len(overlap) > 0:
            detail = 'Some osds belong to multiple subtrees: %s' % \
                     overlap
            self.log.error(detail)
            return -errno.EOPNOTSUPP, detail

        # rebalance by pgs, objects, or bytes
        metrics = self.get_module_option('crush_compat_metrics').split(',')
        key = metrics[0] # balancing using the first score metric
        if key not in ['pgs', 'bytes', 'objects']:
            self.log.warn("Invalid crush_compat balancing key %s. Using 'pgs'." % key)
            key = 'pgs'

        # go: hill-climb from the current weight-set, tracking the best
        # (ws, ow, evaluation) seen so far
        best_ws = copy.deepcopy(orig_ws)
        best_ow = copy.deepcopy(orig_osd_weight)
        best_pe = pe
        left = max_iterations
        bad_steps = 0
        next_ws = copy.deepcopy(best_ws)
        next_ow = copy.deepcopy(best_ow)
        while left > 0:
            # adjust
            self.log.debug('best_ws %s' % best_ws)
            random.shuffle(roots)
            for root in roots:
                pools = best_pe.root_pools[root]
                osds = len(best_pe.target_by_root[root])
                min_pgs = osds * min_pg_per_osd
                # too few pgs under this root to balance meaningfully
                if best_pe.total_by_root[root][key] < min_pgs:
                    self.log.info('Skipping root %s (pools %s), total pgs %d '
                                  '< minimum %d (%d per osd)',
                                  root, pools,
                                  best_pe.total_by_root[root][key],
                                  min_pgs, min_pg_per_osd)
                    continue
                self.log.info('Balancing root %s (pools %s) by %s' %
                              (root, pools, key))
                target = best_pe.target_by_root[root]
                actual = best_pe.actual_by_root[root][key]
                # visit osds in order of decreasing deviation from target
                queue = sorted(actual.keys(),
                               key=lambda osd: -abs(target[osd] - actual[osd]))
                for osd in queue:
                    if orig_osd_weight[osd] == 0:
                        self.log.debug('skipping out osd.%d', osd)
                    else:
                        deviation = target[osd] - actual[osd]
                        if deviation == 0:
                            break
                        self.log.debug('osd.%d deviation %f', osd, deviation)
                        weight = best_ws[osd]
                        ow = orig_osd_weight[osd]
                        if actual[osd] > 0:
                            calc_weight = target[osd] / actual[osd] * weight * ow
                        else:
                            # for newly created osds, reset calc_weight at target value
                            # this way weight-set will end up absorbing *step* of its
                            # target (final) value at the very beginning and slowly catch up later.
                            # note that if this turns out causing too many misplaced
                            # pgs, then we'll reduce step and retry
                            calc_weight = target[osd]
                        # move a fraction *step* of the way to the computed weight
                        new_weight = weight * (1.0 - step) + calc_weight * step
                        self.log.debug('Reweight osd.%d %f -> %f', osd, weight,
                                       new_weight)
                        next_ws[osd] = new_weight
                        if ow < 1.0:
                            # also phase out any legacy osd reweight toward 1.0
                            new_ow = min(1.0, max(step + (1.0 - step) * ow,
                                                  ow + .005))
                            self.log.debug('Reweight osd.%d reweight %f -> %f',
                                           osd, ow, new_ow)
                            next_ow[osd] = new_ow

                # normalize weights under this root
                root_weight = crush.get_item_weight(pe.root_ids[root])
                root_sum = sum(b for a,b in six.iteritems(next_ws)
                               if a in target.keys())
                if root_sum > 0 and root_weight > 0:
                    factor = root_sum / root_weight
                    self.log.debug('normalizing root %s %d, weight %f, '
                                   'ws sum %f, factor %f',
                                   root, pe.root_ids[root], root_weight,
                                   root_sum, factor)
                    for osd in actual.keys():
                        next_ws[osd] = next_ws[osd] / factor

            # recalc: evaluate the candidate weight-set
            plan.compat_ws = copy.deepcopy(next_ws)
            next_ms = plan.final_state()
            next_pe = self.calc_eval(next_ms, plan.pools)
            next_misplaced = next_ms.calc_misplaced_from(ms)
            self.log.debug('Step result score %f -> %f, misplacing %f',
                           best_pe.score, next_pe.score, next_misplaced)

            if next_misplaced > max_misplaced:
                # candidate would misplace too much data
                if best_pe.score < pe.score:
                    self.log.debug('Step misplaced %f > max %f, stopping',
                                   next_misplaced, max_misplaced)
                    break
                step /= 2.0
                next_ws = copy.deepcopy(best_ws)
                next_ow = copy.deepcopy(best_ow)
                self.log.debug('Step misplaced %f > max %f, reducing step to %f',
                               next_misplaced, max_misplaced, step)
            else:
                if next_pe.score > best_pe.score * 1.0001:
                    # score got worse; tolerate a few bad steps before
                    # backing off to a smaller step size
                    bad_steps += 1
                    if bad_steps < 5 and random.randint(0, 100) < 70:
                        self.log.debug('Score got worse, taking another step')
                    else:
                        step /= 2.0
                        next_ws = copy.deepcopy(best_ws)
                        next_ow = copy.deepcopy(best_ow)
                        self.log.debug('Score got worse, trying smaller step %f',
                                       step)
                else:
                    # improvement (or equal): record as new best
                    bad_steps = 0
                    best_pe = next_pe
                    best_ws = copy.deepcopy(next_ws)
                    best_ow = copy.deepcopy(next_ow)
                    if best_pe.score == 0:
                        break
            left -= 1

        # allow a small regression if we are phasing out osd weights
        fudge = 0
        if best_ow != orig_osd_weight:
            fudge = .001

        if best_pe.score < pe.score + fudge:
            self.log.info('Success, score %f -> %f', pe.score, best_pe.score)
            plan.compat_ws = best_ws
            for osd, w in six.iteritems(best_ow):
                if w != orig_osd_weight[osd]:
                    self.log.debug('osd.%d reweight %f', osd, w)
                    plan.osd_weights[osd] = w
            return 0, ''
        else:
            self.log.info('Failed to find further optimization, score %f',
                          pe.score)
            plan.compat_ws = {}
            return -errno.EDOM, 'Unable to find further optimization, ' \
                                'change balancer mode and retry might help'
1178 | ||
1179 | def get_compat_weight_set_weights(self, ms): | |
11fdf7f2 | 1180 | if not CRUSHMap.have_default_choose_args(ms.crush_dump): |
94b18763 FG |
1181 | # enable compat weight-set first |
1182 | self.log.debug('ceph osd crush weight-set create-compat') | |
1183 | result = CommandResult('') | |
1184 | self.send_command(result, 'mon', '', json.dumps({ | |
1185 | 'prefix': 'osd crush weight-set create-compat', | |
1186 | 'format': 'json', | |
1187 | }), '') | |
1188 | r, outb, outs = result.wait() | |
1189 | if r != 0: | |
1190 | self.log.error('Error creating compat weight-set') | |
1191 | return | |
1192 | ||
1193 | result = CommandResult('') | |
1194 | self.send_command(result, 'mon', '', json.dumps({ | |
1195 | 'prefix': 'osd crush dump', | |
1196 | 'format': 'json', | |
1197 | }), '') | |
1198 | r, outb, outs = result.wait() | |
1199 | if r != 0: | |
1200 | self.log.error('Error dumping crush map') | |
1201 | return | |
1202 | try: | |
1203 | crushmap = json.loads(outb) | |
1204 | except: | |
1205 | raise RuntimeError('unable to parse crush map') | |
1206 | else: | |
1207 | crushmap = ms.crush_dump | |
3efd9988 | 1208 | |
11fdf7f2 | 1209 | raw = CRUSHMap.get_default_choose_args(crushmap) |
3efd9988 FG |
1210 | weight_set = {} |
1211 | for b in raw: | |
1212 | bucket = None | |
1213 | for t in crushmap['buckets']: | |
1214 | if t['id'] == b['bucket_id']: | |
1215 | bucket = t | |
1216 | break | |
1217 | if not bucket: | |
1218 | raise RuntimeError('could not find bucket %s' % b['bucket_id']) | |
1219 | self.log.debug('bucket items %s' % bucket['items']) | |
1220 | self.log.debug('weight set %s' % b['weight_set'][0]) | |
1221 | if len(bucket['items']) != len(b['weight_set'][0]): | |
1222 | raise RuntimeError('weight-set size does not match bucket items') | |
1223 | for pos in range(len(bucket['items'])): | |
1224 | weight_set[bucket['items'][pos]['id']] = b['weight_set'][0][pos] | |
1225 | ||
1226 | self.log.debug('weight_set weights %s' % weight_set) | |
1227 | return weight_set | |
1228 | ||
    def do_crush(self):
        """Placeholder for a future crush-rule-based optimizer; only logs."""
        self.log.info('do_crush (not yet implemented)')
1231 | ||
    def do_osd_weight(self):
        """Placeholder for a future osd-reweight optimizer; only logs."""
        self.log.info('do_osd_weight (not yet implemented)')
1234 | ||
    def execute(self, plan):
        """Apply *plan* to the cluster by issuing mon commands.

        Sends, in order: compat weight-set creation (if needed),
        weight-set reweights, osd reweightn, and pg-upmap-items
        additions/removals; then waits for all queued commands.

        :param plan: Plan produced by optimize()
        :return: (rc, detail) tuple; rc == 0 on success
        """
        self.log.info('Executing plan %s' % plan.name)

        commands = []

        # compat weight-set
        if len(plan.compat_ws) and \
           not CRUSHMap.have_default_choose_args(plan.initial.crush_dump):
            self.log.debug('ceph osd crush weight-set create-compat')
            result = CommandResult('')
            self.send_command(result, 'mon', '', json.dumps({
                'prefix': 'osd crush weight-set create-compat',
                'format': 'json',
            }), '')
            # creation must succeed before reweights below can work,
            # so wait for it synchronously
            r, outb, outs = result.wait()
            if r != 0:
                self.log.error('Error creating compat weight-set')
                return r, outs

        for osd, weight in six.iteritems(plan.compat_ws):
            self.log.info('ceph osd crush weight-set reweight-compat osd.%d %f',
                          osd, weight)
            result = CommandResult('')
            self.send_command(result, 'mon', '', json.dumps({
                'prefix': 'osd crush weight-set reweight-compat',
                'format': 'json',
                'item': 'osd.%d' % osd,
                'weight': [weight],
            }), '')
            commands.append(result)

        # new_weight
        reweightn = {}
        for osd, weight in six.iteritems(plan.osd_weights):
            # osd reweightn takes weights scaled by 0x10000, as strings
            reweightn[str(osd)] = str(int(weight * float(0x10000)))
        if len(reweightn):
            self.log.info('ceph osd reweightn %s', reweightn)
            result = CommandResult('')
            self.send_command(result, 'mon', '', json.dumps({
                'prefix': 'osd reweightn',
                'format': 'json',
                'weights': json.dumps(reweightn),
            }), '')
            commands.append(result)

        # upmap
        incdump = plan.inc.dump()
        for pgid in incdump.get('old_pg_upmap_items', []):
            self.log.info('ceph osd rm-pg-upmap-items %s', pgid)
            result = CommandResult('foo')
            self.send_command(result, 'mon', '', json.dumps({
                'prefix': 'osd rm-pg-upmap-items',
                'format': 'json',
                'pgid': pgid,
            }), 'foo')
            commands.append(result)

        for item in incdump.get('new_pg_upmap_items', []):
            self.log.info('ceph osd pg-upmap-items %s mappings %s', item['pgid'],
                          item['mappings'])
            # flatten [{'from': a, 'to': b}, ...] into [a, b, a, b, ...]
            osdlist = []
            for m in item['mappings']:
                osdlist += [m['from'], m['to']]
            result = CommandResult('foo')
            self.send_command(result, 'mon', '', json.dumps({
                'prefix': 'osd pg-upmap-items',
                'format': 'json',
                'pgid': item['pgid'],
                'id': osdlist,
            }), 'foo')
            commands.append(result)

        # wait for commands
        self.log.debug('commands %s' % commands)
        for result in commands:
            r, outb, outs = result.wait()
            if r != 0:
                # stop at the first failure and report it
                self.log.error('execute error: r = %d, detail = %s' % (r, outs))
                return r, outs
        self.log.debug('done')
        return 0, ''
eafe8130 TL |
1316 | |
1317 | def gather_telemetry(self): | |
1318 | return { | |
1319 | 'active': self.active, | |
1320 | 'mode': self.mode, | |
1321 | } |