]>
Commit | Line | Data |
---|---|---|
3efd9988 FG |
1 | """ |
2 | Balance PG distribution across OSDs. | |
3 | """ | |
4 | ||
5 | import copy | |
6 | import errno | |
7 | import json | |
8 | import math | |
9 | import random | |
1adf2230 | 10 | import six |
3efd9988 FG |
11 | import time |
12 | from mgr_module import MgrModule, CommandResult | |
13 | from threading import Event | |
b32b8144 | 14 | from mgr_module import CRUSHMap |
eafe8130 | 15 | import datetime |
3efd9988 | 16 | |
3efd9988 FG |
17 | TIME_FORMAT = '%Y-%m-%d_%H:%M:%S' |
18 | ||
class MappingState:
    """
    Immutable snapshot of an osdmap plus raw pg/pool stats, with derived
    lookup tables (per-pool PG->up mappings and a flat pg->up map) that the
    balancer uses to score and compare distributions.
    """
    def __init__(self, osdmap, raw_pg_stats, raw_pool_stats, desc=''):
        self.desc = desc
        self.osdmap = osdmap
        self.osdmap_dump = self.osdmap.dump()
        self.crush = osdmap.get_crush()
        self.crush_dump = self.crush.dump()
        self.raw_pg_stats = raw_pg_stats
        self.raw_pool_stats = raw_pool_stats
        # pgid -> stat_sum, straight from the raw pg stats
        self.pg_stat = {
            i['pgid']: i['stat_sum'] for i in raw_pg_stats.get('pg_stats', [])
        }
        # only consider pools present in BOTH the osdmap and the pg stats;
        # either side can briefly lag the other around pool create/delete
        osd_poolids = [p['pool'] for p in self.osdmap_dump.get('pools', [])]
        pg_poolids = [p['poolid'] for p in raw_pool_stats.get('pool_stats', [])]
        self.poolids = set(osd_poolids) & set(pg_poolids)
        self.pg_up = {}
        self.pg_up_by_poolid = {}
        for poolid in self.poolids:
            self.pg_up_by_poolid[poolid] = osdmap.map_pool_pgs_up(poolid)
            # .items() iterates identically on py2/py3; no need for six here
            for a, b in self.pg_up_by_poolid[poolid].items():
                self.pg_up[a] = b

    def calc_misplaced_from(self, other_ms):
        """
        Return the fraction [0.0, 1.0] of PGs in *other_ms* whose up set
        differs in this state (0.0 when other_ms has no PGs).
        """
        num = len(other_ms.pg_up)
        misplaced = 0
        for pgid, before in other_ms.pg_up.items():
            if before != self.pg_up.get(pgid, []):
                misplaced += 1
        if num > 0:
            return float(misplaced) / float(num)
        return 0.0
50 | ||
9f95a23c TL |
class Plan(object):
    """
    One optimization attempt: the osdmap it was computed against, the pools
    in scope, and the accumulated proposed changes (reweights, compat
    weight-set entries, and an osdmap incremental).
    """
    def __init__(self, name, mode, osdmap, pools):
        self.name = name
        self.mode = mode
        self.pools = pools
        self.osdmap = osdmap
        self.osdmap_dump = osdmap.dump()
        # proposed changes, populated by the optimizer
        self.osd_weights = {}
        self.compat_ws = {}
        self.inc = osdmap.new_incremental()
        self.pg_status = {}
62 | ||
class MsPlan(Plan):
    """
    Plan with a preloaded MappingState member.
    """
    def __init__(self, name, mode, ms, pools):
        super(MsPlan, self).__init__(name, mode, ms.osdmap, pools)
        self.initial = ms  # MappingState snapshot the plan was computed from

    def final_state(self):
        """Return a MappingState reflecting this plan's proposed changes
        applied on top of the initial snapshot."""
        self.inc.set_osd_reweights(self.osd_weights)
        self.inc.set_crush_compat_weight_set_weights(self.compat_ws)
        return MappingState(self.initial.osdmap.apply_incremental(self.inc),
                            self.initial.raw_pg_stats,
                            self.initial.raw_pool_stats,
                            'plan %s final' % self.name)

    def dump(self):
        """Return the pending incremental as pretty-printed JSON."""
        return json.dumps(self.inc.dump(), indent=4, sort_keys=True)

    def show(self):
        """Render the plan as the equivalent sequence of 'ceph' CLI commands."""
        ls = []
        ls.append('# starting osdmap epoch %d' % self.initial.osdmap.get_epoch())
        ls.append('# starting crush version %d' %
                  self.initial.osdmap.get_crush_version())
        ls.append('# mode %s' % self.mode)
        # only emit create-compat when no default choose_args exist yet
        if len(self.compat_ws) and \
           not CRUSHMap.have_default_choose_args(self.initial.crush_dump):
            ls.append('ceph osd crush weight-set create-compat')
        # .items() iterates identically on py2/py3; six shim not needed
        for osd, weight in self.compat_ws.items():
            ls.append('ceph osd crush weight-set reweight-compat %s %f' %
                      (osd, weight))
        for osd, weight in self.osd_weights.items():
            ls.append('ceph osd reweight osd.%d %f' % (osd, weight))
        incdump = self.inc.dump()
        for pgid in incdump.get('old_pg_upmap_items', []):
            ls.append('ceph osd rm-pg-upmap-items %s' % pgid)
        for item in incdump.get('new_pg_upmap_items', []):
            osdlist = []
            for m in item['mappings']:
                osdlist += [m['from'], m['to']]
            ls.append('ceph osd pg-upmap-items %s %s' %
                      (item['pgid'], ' '.join([str(a) for a in osdlist])))
        return '\n'.join(ls)
106 | ||
107 | ||
class Eval:
    """
    Evaluation of how evenly PGs/objects/bytes are distributed for one
    MappingState, aggregated per pool and per crush root.
    """
    def __init__(self, ms):
        self.ms = ms              # MappingState being scored
        self.root_ids = {}        # root name -> id
        self.pool_name = {}       # pool id -> pool name
        self.pool_id = {}         # pool name -> id
        self.pool_roots = {}      # pool name -> root name
        self.root_pools = {}      # root name -> pools
        self.target_by_root = {}  # root name -> target weight map
        self.count_by_pool = {}
        self.count_by_root = {}
        self.actual_by_pool = {}  # pool -> by_* -> actual weight map
        self.actual_by_root = {}  # pool -> by_* -> actual weight map
        self.total_by_pool = {}   # pool -> by_* -> total
        self.total_by_root = {}   # root -> by_* -> total
        self.stats_by_pool = {}   # pool -> by_* -> stddev or avg -> value
        self.stats_by_root = {}   # root -> by_* -> stddev or avg -> value

        self.score_by_pool = {}
        self.score_by_root = {}

        # overall distribution score; 0.0 means perfectly balanced
        self.score = 0.0

    def show(self, verbose=False):
        """Return a human-readable summary (full tables when verbose)."""
        if verbose:
            r = self.ms.desc + '\n'
            r += 'target_by_root %s\n' % self.target_by_root
            r += 'actual_by_pool %s\n' % self.actual_by_pool
            r += 'actual_by_root %s\n' % self.actual_by_root
            r += 'count_by_pool %s\n' % self.count_by_pool
            r += 'count_by_root %s\n' % self.count_by_root
            r += 'total_by_pool %s\n' % self.total_by_pool
            r += 'total_by_root %s\n' % self.total_by_root
            r += 'stats_by_root %s\n' % self.stats_by_root
            r += 'score_by_pool %s\n' % self.score_by_pool
            r += 'score_by_root %s\n' % self.score_by_root
        else:
            r = self.ms.desc + ' '
            r += 'score %f (lower is better)\n' % self.score
        return r

    def calc_stats(self, count, target, total):
        """
        Compute per-metric distribution statistics.

        count:  metric -> {osd: value} for each of 'pgs'/'objects'/'bytes'
        target: osd -> target (normalized) weight share
        total:  metric -> sum of values over all osds

        Returns metric -> dict with max/min/avg/stddev/sum_weight/score,
        where score lies in [0, 1) and 0 means perfect distribution.
        """
        num = max(len(target), 1)
        r = {}
        for t in ('pgs', 'objects', 'bytes'):
            if total[t] == 0:
                # nothing stored for this metric -> trivially balanced
                r[t] = {
                    'max': 0,
                    'min': 0,
                    'avg': 0,
                    'stddev': 0,
                    'sum_weight': 0,
                    'score': 0,
                }
                continue

            avg = float(total[t]) / float(num)
            dev = 0.0

            # score is a measure of how uneven the data distribution is.
            # score lies between [0, 1), 0 means perfect distribution.
            score = 0.0
            sum_weight = 0.0

            # .items() iterates identically on py2/py3; six shim not needed
            for k, v in count[t].items():
                # adjust/normalize by weight
                if target[k]:
                    adjusted = float(v) / target[k] / float(num)
                else:
                    adjusted = 0.0

                # Overweighted devices and their weights are factors to calculate reweight_urgency.
                # One 10% underfilled device with 5 2% overfilled devices, is arguably a better
                # situation than one 10% overfilled with 5 2% underfilled devices
                if adjusted > avg:
                    '''
                    F(x) = 2*phi(x) - 1, where phi(x) = cdf of standard normal distribution
                    x = (adjusted - avg)/avg.
                    Since, we're considering only over-weighted devices, x >= 0, and so phi(x) lies in [0.5, 1).
                    To bring range of F(x) in range [0, 1), we need to make the above modification.

                    In general, we need to use a function F(x), where x = (adjusted - avg)/avg
                    1. which is bounded between 0 and 1, so that ultimately reweight_urgency will also be bounded.
                    2. A larger value of x, should imply more urgency to reweight.
                    3. Also, the difference between F(x) when x is large, should be minimal.
                    4. The value of F(x) should get close to 1 (highest urgency to reweight) with steeply.

                    Could have used F(x) = (1 - e^(-x)). But that had slower convergence to 1, compared to the one currently in use.

                    cdf of standard normal distribution: https://stackoverflow.com/a/29273201
                    '''
                    score += target[k] * (math.erf(((adjusted - avg)/avg) / math.sqrt(2.0)))
                    sum_weight += target[k]
                dev += (avg - adjusted) * (avg - adjusted)
            stddev = math.sqrt(dev / float(max(num - 1, 1)))
            score = score / max(sum_weight, 1)
            r[t] = {
                'max': max(count[t].values()),
                'min': min(count[t].values()),
                'avg': avg,
                'stddev': stddev,
                'sum_weight': sum_weight,
                'score': score,
            }
        return r
213 | ||
class Module(MgrModule):
    """Mgr 'balancer' module: computes and (optionally, when active) applies
    plans that even out PG placement across OSDs."""

    # Config options exposed as mgr/balancer/<name>; all are declared
    # 'runtime': True, i.e. changeable without a module restart.
    MODULE_OPTIONS = [
        {
            'name': 'active',
            'type': 'bool',
            'default': False,
            'desc': 'automatically balance PGs across cluster',
            'runtime': True,
        },
        {
            'name': 'begin_time',
            'type': 'str',
            'default': '0000',
            'desc': 'beginning time of day to automatically balance',
            'long_desc': 'This is a time of day in the format HHMM.',
            'runtime': True,
        },
        {
            'name': 'end_time',
            'type': 'str',
            'default': '2400',
            'desc': 'ending time of day to automatically balance',
            'long_desc': 'This is a time of day in the format HHMM.',
            'runtime': True,
        },
        {
            'name': 'begin_weekday',
            'type': 'uint',
            'default': 0,
            'min': 0,
            'max': 7,
            'desc': 'Restrict automatic balancing to this day of the week or later',
            'long_desc': '0 or 7 = Sunday, 1 = Monday, etc.',
            'runtime': True,
        },
        {
            'name': 'end_weekday',
            'type': 'uint',
            'default': 7,
            'min': 0,
            'max': 7,
            'desc': 'Restrict automatic balancing to days of the week earlier than this',
            'long_desc': '0 or 7 = Sunday, 1 = Monday, etc.',
            'runtime': True,
        },
        {
            'name': 'crush_compat_max_iterations',
            'type': 'uint',
            'default': 25,
            'min': 1,
            'max': 250,
            'desc': 'maximum number of iterations to attempt optimization',
            'runtime': True,
        },
        {
            'name': 'crush_compat_metrics',
            'type': 'str',
            'default': 'pgs,objects,bytes',
            'desc': 'metrics with which to calculate OSD utilization',
            'long_desc': 'Value is a list of one or more of "pgs", "objects", or "bytes", and indicates which metrics to use to balance utilization.',
            'runtime': True,
        },
        {
            'name': 'crush_compat_step',
            'type': 'float',
            'default': .5,
            'min': .001,
            'max': .999,
            'desc': 'aggressiveness of optimization',
            'long_desc': '.99 is very aggressive, .01 is less aggressive',
            'runtime': True,
        },
        {
            'name': 'min_score',
            'type': 'float',
            'default': 0,
            'desc': 'minimum score, below which no optimization is attempted',
            'runtime': True,
        },
        {
            # NOTE(review): no 'type' key here, unlike the other options;
            # presumably treated as str via enum_allowed — confirm intended.
            'name': 'mode',
            'desc': 'Balancer mode',
            'default': 'none',
            'enum_allowed': ['none', 'crush-compat', 'upmap'],
            'runtime': True,
        },
        {
            'name': 'sleep_interval',
            'type': 'secs',
            'default': 60,
            'desc': 'how frequently to wake up and attempt optimization',
            'runtime': True,
        },
        {
            'name': 'upmap_max_optimizations',
            'type': 'uint',
            'default': 10,
            'desc': 'maximum upmap optimizations to make per attempt',
            'runtime': True,
        },
        {
            'name': 'upmap_max_deviation',
            'type': 'int',
            'default': 5,
            'min': 1,
            'desc': 'deviation below which no optimization is attempted',
            'long_desc': 'If the number of PGs are within this count then no optimization is attempted',
            'runtime': True,
        },
        {
            'name': 'pool_ids',
            'type': 'str',
            'default': '',
            'desc': 'pools which the automatic balancing will be limited to',
            'runtime': True,
        },
    ]
331 | ||
3efd9988 FG |
    # CLI command descriptors; dispatched by prefix in handle_command().
    COMMANDS = [
        {
            "cmd": "balancer status",
            "desc": "Show balancer status",
            "perm": "r",
        },
        {
            "cmd": "balancer mode name=mode,type=CephChoices,strings=none|crush-compat|upmap",
            "desc": "Set balancer mode",
            "perm": "rw",
        },
        {
            "cmd": "balancer on",
            "desc": "Enable automatic balancing",
            "perm": "rw",
        },
        {
            "cmd": "balancer off",
            "desc": "Disable automatic balancing",
            "perm": "rw",
        },
        {
            "cmd": "balancer pool ls",
            "desc": "List automatic balancing pools. "
                    "Note that empty list means all existing pools will be automatic balancing targets, "
                    "which is the default behaviour of balancer.",
            "perm": "r",
        },
        {
            "cmd": "balancer pool add name=pools,type=CephString,n=N",
            "desc": "Enable automatic balancing for specific pools",
            "perm": "rw",
        },
        {
            "cmd": "balancer pool rm name=pools,type=CephString,n=N",
            "desc": "Disable automatic balancing for specific pools",
            "perm": "rw",
        },
        {
            "cmd": "balancer eval name=option,type=CephString,req=false",
            "desc": "Evaluate data distribution for the current cluster or specific pool or specific plan",
            "perm": "r",
        },
        {
            "cmd": "balancer eval-verbose name=option,type=CephString,req=false",
            "desc": "Evaluate data distribution for the current cluster or specific pool or specific plan (verbosely)",
            "perm": "r",
        },
        {
            "cmd": "balancer optimize name=plan,type=CephString name=pools,type=CephString,n=N,req=false",
            "desc": "Run optimizer to create a new plan",
            "perm": "rw",
        },
        {
            "cmd": "balancer show name=plan,type=CephString",
            "desc": "Show details of an optimization plan",
            "perm": "r",
        },
        {
            "cmd": "balancer rm name=plan,type=CephString",
            "desc": "Discard an optimization plan",
            "perm": "rw",
        },
        {
            "cmd": "balancer reset",
            "desc": "Discard all optimization plans",
            "perm": "rw",
        },
        {
            "cmd": "balancer dump name=plan,type=CephString",
            "desc": "Show an optimization plan",
            "perm": "r",
        },
        {
            "cmd": "balancer ls",
            "desc": "List all plans",
            "perm": "r",
        },
        {
            "cmd": "balancer execute name=plan,type=CephString",
            "desc": "Execute an optimization plan",
            "perm": "rw",
        },
    ]
    # Runtime state (class-level defaults; rebound on the instance at runtime).
    active = False            # mirrors the 'active' module option
    run = True                # cleared by shutdown() to stop serve()
    plans = {}                # plan name -> Plan/MsPlan
    mode = ''
    optimizing = False        # True while an optimization attempt is running
    last_optimize_started = ''
    last_optimize_duration = ''
    optimize_result = ''      # last outcome message shown in 'balancer status'
    success_string = 'Optimization plan created successfully'
    in_progress_string = 'in progress'
3efd9988 FG |
426 | |
    def __init__(self, *args, **kwargs):
        super(Module, self).__init__(*args, **kwargs)
        # Event used to interrupt the serve() sleep on enable/disable/shutdown.
        self.event = Event()
430 | ||
11fdf7f2 | 431 | def handle_command(self, inbuf, command): |
3efd9988 FG |
432 | self.log.warn("Handling command: '%s'" % str(command)) |
433 | if command['prefix'] == 'balancer status': | |
434 | s = { | |
f64942e4 | 435 | 'plans': list(self.plans.keys()), |
3efd9988 | 436 | 'active': self.active, |
eafe8130 TL |
437 | 'last_optimize_started': self.last_optimize_started, |
438 | 'last_optimize_duration': self.last_optimize_duration, | |
439 | 'optimize_result': self.optimize_result, | |
11fdf7f2 | 440 | 'mode': self.get_module_option('mode'), |
3efd9988 | 441 | } |
9f95a23c | 442 | return (0, json.dumps(s, indent=4, sort_keys=True), '') |
3efd9988 | 443 | elif command['prefix'] == 'balancer mode': |
11fdf7f2 TL |
444 | if command['mode'] == 'upmap': |
445 | min_compat_client = self.get_osdmap().dump().get('require_min_compat_client', '') | |
446 | if min_compat_client < 'luminous': # works well because version is alphabetized.. | |
447 | warn = 'min_compat_client "%s" ' \ | |
448 | '< "luminous", which is required for pg-upmap. ' \ | |
449 | 'Try "ceph osd set-require-min-compat-client luminous" ' \ | |
450 | 'before enabling this mode' % min_compat_client | |
451 | return (-errno.EPERM, '', warn) | |
81eedcae TL |
452 | elif command['mode'] == 'crush-compat': |
453 | ms = MappingState(self.get_osdmap(), | |
9f95a23c TL |
454 | self.get("pg_stats"), |
455 | self.get("pool_stats"), | |
81eedcae TL |
456 | 'initialize compat weight-set') |
457 | self.get_compat_weight_set_weights(ms) # ignore error | |
11fdf7f2 | 458 | self.set_module_option('mode', command['mode']) |
3efd9988 FG |
459 | return (0, '', '') |
460 | elif command['prefix'] == 'balancer on': | |
461 | if not self.active: | |
11fdf7f2 | 462 | self.set_module_option('active', 'true') |
3efd9988 FG |
463 | self.active = True |
464 | self.event.set() | |
465 | return (0, '', '') | |
466 | elif command['prefix'] == 'balancer off': | |
467 | if self.active: | |
11fdf7f2 | 468 | self.set_module_option('active', 'false') |
3efd9988 FG |
469 | self.active = False |
470 | self.event.set() | |
471 | return (0, '', '') | |
11fdf7f2 TL |
472 | elif command['prefix'] == 'balancer pool ls': |
473 | pool_ids = self.get_module_option('pool_ids') | |
474 | if pool_ids is '': | |
475 | return (0, '', '') | |
476 | pool_ids = pool_ids.split(',') | |
477 | pool_ids = [int(p) for p in pool_ids] | |
478 | pool_name_by_id = dict((p['pool'], p['pool_name']) for p in self.get_osdmap().dump().get('pools', [])) | |
479 | should_prune = False | |
480 | final_ids = [] | |
481 | final_names = [] | |
482 | for p in pool_ids: | |
483 | if p in pool_name_by_id: | |
484 | final_ids.append(p) | |
485 | final_names.append(pool_name_by_id[p]) | |
486 | else: | |
487 | should_prune = True | |
488 | if should_prune: # some pools were gone, prune | |
489 | self.set_module_option('pool_ids', ','.join(final_ids)) | |
9f95a23c | 490 | return (0, json.dumps(sorted(final_names), indent=4, sort_keys=True), '') |
11fdf7f2 TL |
491 | elif command['prefix'] == 'balancer pool add': |
492 | raw_names = command['pools'] | |
493 | pool_id_by_name = dict((p['pool_name'], p['pool']) for p in self.get_osdmap().dump().get('pools', [])) | |
494 | invalid_names = [p for p in raw_names if p not in pool_id_by_name] | |
495 | if invalid_names: | |
496 | return (-errno.EINVAL, '', 'pool(s) %s not found' % invalid_names) | |
497 | to_add = [str(pool_id_by_name[p]) for p in raw_names if p in pool_id_by_name] | |
498 | existing = self.get_module_option('pool_ids') | |
499 | final = to_add | |
500 | if existing is not '': | |
501 | existing = existing.split(',') | |
502 | final = set(to_add) | set(existing) | |
503 | self.set_module_option('pool_ids', ','.join(final)) | |
504 | return (0, '', '') | |
505 | elif command['prefix'] == 'balancer pool rm': | |
506 | raw_names = command['pools'] | |
507 | existing = self.get_module_option('pool_ids') | |
508 | if existing is '': # for idempotence | |
509 | return (0, '', '') | |
510 | existing = existing.split(',') | |
511 | osdmap = self.get_osdmap() | |
512 | pool_ids = [str(p['pool']) for p in osdmap.dump().get('pools', [])] | |
513 | pool_id_by_name = dict((p['pool_name'], p['pool']) for p in osdmap.dump().get('pools', [])) | |
514 | final = [p for p in existing if p in pool_ids] | |
515 | to_delete = [str(pool_id_by_name[p]) for p in raw_names if p in pool_id_by_name] | |
516 | final = set(final) - set(to_delete) | |
517 | self.set_module_option('pool_ids', ','.join(final)) | |
518 | return (0, '', '') | |
3efd9988 FG |
519 | elif command['prefix'] == 'balancer eval' or command['prefix'] == 'balancer eval-verbose': |
520 | verbose = command['prefix'] == 'balancer eval-verbose' | |
94b18763 FG |
521 | pools = [] |
522 | if 'option' in command: | |
523 | plan = self.plans.get(command['option']) | |
3efd9988 | 524 | if not plan: |
94b18763 FG |
525 | # not a plan, does it look like a pool? |
526 | osdmap = self.get_osdmap() | |
527 | valid_pool_names = [p['pool_name'] for p in osdmap.dump().get('pools', [])] | |
528 | option = command['option'] | |
529 | if option not in valid_pool_names: | |
530 | return (-errno.EINVAL, '', 'option "%s" not a plan or a pool' % option) | |
531 | pools.append(option) | |
9f95a23c | 532 | ms = MappingState(osdmap, self.get("pg_stats"), self.get("pool_stats"), 'pool "%s"' % option) |
94b18763 FG |
533 | else: |
534 | pools = plan.pools | |
9f95a23c TL |
535 | if plan.mode == 'upmap': |
536 | # Note that for upmap, to improve the efficiency, | |
537 | # we use a basic version of Plan without keeping the obvious | |
538 | # *redundant* MS member. | |
539 | # Hence ms might not be accurate here since we are basically | |
540 | # using an old snapshotted osdmap vs a fresh copy of pg_stats. | |
541 | # It should not be a big deal though.. | |
542 | ms = MappingState(plan.osdmap, | |
543 | self.get("pg_stats"), | |
544 | self.get("pool_stats"), | |
545 | 'plan "%s"' % plan.name) | |
546 | else: | |
547 | ms = plan.final_state() | |
3efd9988 FG |
548 | else: |
549 | ms = MappingState(self.get_osdmap(), | |
9f95a23c TL |
550 | self.get("pg_stats"), |
551 | self.get("pool_stats"), | |
3efd9988 | 552 | 'current cluster') |
94b18763 | 553 | return (0, self.evaluate(ms, pools, verbose=verbose), '') |
3efd9988 | 554 | elif command['prefix'] == 'balancer optimize': |
eafe8130 TL |
555 | # The GIL can be release by the active balancer, so disallow when active |
556 | if self.active: | |
557 | return (-errno.EINVAL, '', 'Balancer enabled, disable to optimize manually') | |
558 | if self.optimizing: | |
559 | return (-errno.EINVAL, '', 'Balancer finishing up....try again') | |
94b18763 FG |
560 | pools = [] |
561 | if 'pools' in command: | |
562 | pools = command['pools'] | |
563 | osdmap = self.get_osdmap() | |
564 | valid_pool_names = [p['pool_name'] for p in osdmap.dump().get('pools', [])] | |
565 | invalid_pool_names = [] | |
566 | for p in pools: | |
567 | if p not in valid_pool_names: | |
568 | invalid_pool_names.append(p) | |
569 | if len(invalid_pool_names): | |
570 | return (-errno.EINVAL, '', 'pools %s not found' % invalid_pool_names) | |
571 | plan = self.plan_create(command['plan'], osdmap, pools) | |
eafe8130 TL |
572 | self.last_optimize_started = time.asctime(time.localtime()) |
573 | self.optimize_result = self.in_progress_string | |
574 | start = time.time() | |
94b18763 | 575 | r, detail = self.optimize(plan) |
eafe8130 TL |
576 | end = time.time() |
577 | self.last_optimize_duration = str(datetime.timedelta(seconds=(end - start))) | |
578 | if r == 0: | |
579 | # Add plan if an optimization was created | |
580 | self.optimize_result = self.success_string | |
581 | self.plans[command['plan']] = plan | |
582 | else: | |
583 | self.optimize_result = detail | |
94b18763 | 584 | return (r, '', detail) |
3efd9988 | 585 | elif command['prefix'] == 'balancer rm': |
b32b8144 | 586 | self.plan_rm(command['plan']) |
3efd9988 FG |
587 | return (0, '', '') |
588 | elif command['prefix'] == 'balancer reset': | |
589 | self.plans = {} | |
590 | return (0, '', '') | |
f64942e4 | 591 | elif command['prefix'] == 'balancer ls': |
9f95a23c | 592 | return (0, json.dumps([p for p in self.plans], indent=4, sort_keys=True), '') |
3efd9988 FG |
593 | elif command['prefix'] == 'balancer dump': |
594 | plan = self.plans.get(command['plan']) | |
595 | if not plan: | |
596 | return (-errno.ENOENT, '', 'plan %s not found' % command['plan']) | |
597 | return (0, plan.dump(), '') | |
598 | elif command['prefix'] == 'balancer show': | |
599 | plan = self.plans.get(command['plan']) | |
600 | if not plan: | |
601 | return (-errno.ENOENT, '', 'plan %s not found' % command['plan']) | |
602 | return (0, plan.show(), '') | |
603 | elif command['prefix'] == 'balancer execute': | |
eafe8130 TL |
604 | # The GIL can be release by the active balancer, so disallow when active |
605 | if self.active: | |
606 | return (-errno.EINVAL, '', 'Balancer enabled, disable to execute a plan') | |
607 | if self.optimizing: | |
608 | return (-errno.EINVAL, '', 'Balancer finishing up....try again') | |
3efd9988 FG |
609 | plan = self.plans.get(command['plan']) |
610 | if not plan: | |
611 | return (-errno.ENOENT, '', 'plan %s not found' % command['plan']) | |
94b18763 FG |
612 | r, detail = self.execute(plan) |
613 | self.plan_rm(command['plan']) | |
614 | return (r, '', detail) | |
3efd9988 FG |
615 | else: |
616 | return (-errno.EINVAL, '', | |
617 | "Command not found '{0}'".format(command['prefix'])) | |
618 | ||
    def shutdown(self):
        """Stop the serve() loop: clear the run flag and wake the sleeper."""
        self.log.info('Stopping')
        self.run = False
        self.event.set()
623 | ||
a8e16298 TL |
624 | def time_permit(self): |
625 | local_time = time.localtime() | |
626 | time_of_day = time.strftime('%H%M', local_time) | |
627 | weekday = (local_time.tm_wday + 1) % 7 # be compatible with C | |
628 | permit = False | |
629 | ||
11fdf7f2 TL |
630 | begin_time = self.get_module_option('begin_time') |
631 | end_time = self.get_module_option('end_time') | |
a8e16298 TL |
632 | if begin_time <= end_time: |
633 | permit = begin_time <= time_of_day < end_time | |
634 | else: | |
635 | permit = time_of_day >= begin_time or time_of_day < end_time | |
636 | if not permit: | |
637 | self.log.debug("should run between %s - %s, now %s, skipping", | |
638 | begin_time, end_time, time_of_day) | |
639 | return False | |
640 | ||
11fdf7f2 TL |
641 | begin_weekday = self.get_module_option('begin_weekday') |
642 | end_weekday = self.get_module_option('end_weekday') | |
a8e16298 TL |
643 | if begin_weekday <= end_weekday: |
644 | permit = begin_weekday <= weekday < end_weekday | |
3efd9988 | 645 | else: |
a8e16298 TL |
646 | permit = weekday >= begin_weekday or weekday < end_weekday |
647 | if not permit: | |
648 | self.log.debug("should run between weekday %d - %d, now %d, skipping", | |
649 | begin_weekday, end_weekday, weekday) | |
650 | return False | |
651 | ||
652 | return True | |
3efd9988 FG |
653 | |
654 | def serve(self): | |
655 | self.log.info('Starting') | |
656 | while self.run: | |
11fdf7f2 TL |
657 | self.active = self.get_module_option('active') |
658 | sleep_interval = self.get_module_option('sleep_interval') | |
a8e16298 TL |
659 | self.log.debug('Waking up [%s, now %s]', |
660 | "active" if self.active else "inactive", | |
661 | time.strftime(TIME_FORMAT, time.localtime())) | |
662 | if self.active and self.time_permit(): | |
3efd9988 FG |
663 | self.log.debug('Running') |
664 | name = 'auto_%s' % time.strftime(TIME_FORMAT, time.gmtime()) | |
11fdf7f2 TL |
665 | osdmap = self.get_osdmap() |
666 | allow = self.get_module_option('pool_ids') | |
667 | final = [] | |
668 | if allow is not '': | |
669 | allow = allow.split(',') | |
670 | valid = [str(p['pool']) for p in osdmap.dump().get('pools', [])] | |
671 | final = set(allow) & set(valid) | |
672 | if set(allow) - set(valid): # some pools were gone, prune | |
673 | self.set_module_option('pool_ids', ','.join(final)) | |
674 | pool_name_by_id = dict((p['pool'], p['pool_name']) for p in osdmap.dump().get('pools', [])) | |
675 | final = [int(p) for p in final] | |
676 | final = [pool_name_by_id[p] for p in final if p in pool_name_by_id] | |
677 | plan = self.plan_create(name, osdmap, final) | |
eafe8130 TL |
678 | self.optimizing = True |
679 | self.last_optimize_started = time.asctime(time.localtime()) | |
680 | self.optimize_result = self.in_progress_string | |
681 | start = time.time() | |
94b18763 | 682 | r, detail = self.optimize(plan) |
eafe8130 TL |
683 | end = time.time() |
684 | self.last_optimize_duration = str(datetime.timedelta(seconds=(end - start))) | |
94b18763 | 685 | if r == 0: |
eafe8130 | 686 | self.optimize_result = self.success_string |
3efd9988 | 687 | self.execute(plan) |
eafe8130 TL |
688 | else: |
689 | self.optimize_result = detail | |
690 | self.optimizing = False | |
3efd9988 FG |
691 | self.log.debug('Sleeping for %d', sleep_interval) |
692 | self.event.wait(sleep_interval) | |
693 | self.event.clear() | |
694 | ||
94b18763 | 695 | def plan_create(self, name, osdmap, pools): |
9f95a23c TL |
696 | mode = self.get_module_option('mode') |
697 | if mode == 'upmap': | |
698 | # drop unnecessary MS member for upmap mode. | |
699 | # this way we could effectively eliminate the usage of a | |
700 | # complete pg_stats, which can become horribly inefficient | |
701 | # as pg_num grows.. | |
702 | plan = Plan(name, mode, osdmap, pools) | |
703 | else: | |
704 | plan = MsPlan(name, | |
705 | mode, | |
706 | MappingState(osdmap, | |
707 | self.get("pg_stats"), | |
708 | self.get("pool_stats"), | |
709 | 'plan %s initial' % name), | |
710 | pools) | |
3efd9988 FG |
711 | return plan |
712 | ||
713 | def plan_rm(self, name): | |
714 | if name in self.plans: | |
715 | del self.plans[name] | |
716 | ||
def calc_eval(self, ms, pools):
    """Evaluate the PG/object/byte distribution of a mapping state.

    Builds an Eval describing, per CRUSH root and per pool, both the
    target distribution (proportional to CRUSH x reweight weights) and
    the actual distribution observed in ``ms``, then derives normalized
    scores (0 means a perfect distribution).

    :param ms: MappingState snapshot to evaluate
    :param pools: pool names to restrict to; empty list means all pools
    :return: a populated Eval instance
    """
    pe = Eval(ms)
    pool_rule = {}   # pool_name -> crush_rule; only logged below
    pool_info = {}   # pool_name -> full pool dump entry
    # Collect the pools of interest from the osdmap dump.
    for p in ms.osdmap_dump.get('pools',[]):
        if len(pools) and p['pool_name'] not in pools:
            continue
        # skip dead or not-yet-ready pools too
        if p['pool'] not in ms.poolids:
            continue
        pe.pool_name[p['pool']] = p['pool_name']
        pe.pool_id[p['pool_name']] = p['pool']
        pool_rule[p['pool_name']] = p['crush_rule']
        pe.pool_roots[p['pool_name']] = []
        pool_info[p['pool_name']] = p
    if len(pool_info) == 0:
        # nothing matched; return the (empty) evaluation as-is
        return pe
    self.log.debug('pool_name %s' % pe.pool_name)
    self.log.debug('pool_id %s' % pe.pool_id)
    self.log.debug('pools %s' % pools)
    self.log.debug('pool_rule %s' % pool_rule)

    # Reweight values per OSD; OSDs weighted 0 (out) are excluded.
    osd_weight = { a['osd']: a['weight']
                   for a in ms.osdmap_dump.get('osds',[]) if a['weight'] > 0 }

    # get expected distributions by root
    actual_by_root = {}
    rootids = ms.crush.find_takes()
    roots = []
    for rootid in rootids:
        ls = ms.osdmap.get_pools_by_take(rootid)
        want = []
        # find out roots associating with pools we are passed in
        for candidate in ls:
            if candidate in pe.pool_name:
                want.append(candidate)
        if len(want) == 0:
            continue
        root = ms.crush.get_item_name(rootid)
        pe.root_pools[root] = []
        for poolid in want:
            pe.pool_roots[pe.pool_name[poolid]].append(root)
            pe.root_pools[root].append(pe.pool_name[poolid])
        pe.root_ids[root] = rootid
        roots.append(root)
        # CRUSH weight per OSD under this take, scaled by reweight.
        weight_map = ms.crush.get_take_weight_osd_map(rootid)
        adjusted_map = {
            osd: cw * osd_weight[osd]
            for osd,cw in six.iteritems(weight_map) if osd in osd_weight and cw > 0
        }
        sum_w = sum(adjusted_map.values())
        assert len(adjusted_map) == 0 or sum_w > 0
        # Normalize to fractions of the root's total weight.
        pe.target_by_root[root] = { osd: w / sum_w
                                    for osd,w in six.iteritems(adjusted_map) }
        # Seed actual/total counters for every OSD under this root.
        actual_by_root[root] = {
            'pgs': {},
            'objects': {},
            'bytes': {},
        }
        for osd in pe.target_by_root[root]:
            actual_by_root[root]['pgs'][osd] = 0
            actual_by_root[root]['objects'][osd] = 0
            actual_by_root[root]['bytes'][osd] = 0
        pe.total_by_root[root] = {
            'pgs': 0,
            'objects': 0,
            'bytes': 0,
        }
    self.log.debug('pool_roots %s' % pe.pool_roots)
    self.log.debug('root_pools %s' % pe.root_pools)
    self.log.debug('target_by_root %s' % pe.target_by_root)

    # pool and root actual
    for pool, pi in six.iteritems(pool_info):
        poolid = pi['pool']
        pm = ms.pg_up_by_poolid[poolid]
        pgs = 0
        objects = 0
        bytes = 0  # NOTE: shadows the builtin 'bytes' for this loop body
        pgs_by_osd = {}
        objects_by_osd = {}
        bytes_by_osd = {}
        # Tally every (pg, osd) instance in the pool's up mappings.
        for pgid, up in six.iteritems(pm):
            for osd in [int(osd) for osd in up]:
                if osd == CRUSHMap.ITEM_NONE:
                    continue
                if osd not in pgs_by_osd:
                    pgs_by_osd[osd] = 0
                    objects_by_osd[osd] = 0
                    bytes_by_osd[osd] = 0
                pgs_by_osd[osd] += 1
                objects_by_osd[osd] += ms.pg_stat[pgid]['num_objects']
                bytes_by_osd[osd] += ms.pg_stat[pgid]['num_bytes']
                # pick a root to associate this pg instance with.
                # note that this is imprecise if the roots have
                # overlapping children.
                # FIXME: divide bytes by k for EC pools.
                for root in pe.pool_roots[pool]:
                    if osd in pe.target_by_root[root]:
                        actual_by_root[root]['pgs'][osd] += 1
                        actual_by_root[root]['objects'][osd] += ms.pg_stat[pgid]['num_objects']
                        actual_by_root[root]['bytes'][osd] += ms.pg_stat[pgid]['num_bytes']
                        pgs += 1
                        objects += ms.pg_stat[pgid]['num_objects']
                        bytes += ms.pg_stat[pgid]['num_bytes']
                        pe.total_by_root[root]['pgs'] += 1
                        pe.total_by_root[root]['objects'] += ms.pg_stat[pgid]['num_objects']
                        pe.total_by_root[root]['bytes'] += ms.pg_stat[pgid]['num_bytes']
                        break
        # Raw counts per OSD for this pool.
        pe.count_by_pool[pool] = {
            'pgs': {
                k: v
                for k, v in six.iteritems(pgs_by_osd)
            },
            'objects': {
                k: v
                for k, v in six.iteritems(objects_by_osd)
            },
            'bytes': {
                k: v
                for k, v in six.iteritems(bytes_by_osd)
            },
        }
        # Fractions of the pool total (max(...,1) guards empty pools).
        pe.actual_by_pool[pool] = {
            'pgs': {
                k: float(v) / float(max(pgs, 1))
                for k, v in six.iteritems(pgs_by_osd)
            },
            'objects': {
                k: float(v) / float(max(objects, 1))
                for k, v in six.iteritems(objects_by_osd)
            },
            'bytes': {
                k: float(v) / float(max(bytes, 1))
                for k, v in six.iteritems(bytes_by_osd)
            },
        }
        pe.total_by_pool[pool] = {
            'pgs': pgs,
            'objects': objects,
            'bytes': bytes,
        }
    # Per-root counts and fractions, mirroring the per-pool structure.
    for root in pe.total_by_root:
        pe.count_by_root[root] = {
            'pgs': {
                k: float(v)
                for k, v in six.iteritems(actual_by_root[root]['pgs'])
            },
            'objects': {
                k: float(v)
                for k, v in six.iteritems(actual_by_root[root]['objects'])
            },
            'bytes': {
                k: float(v)
                for k, v in six.iteritems(actual_by_root[root]['bytes'])
            },
        }
        pe.actual_by_root[root] = {
            'pgs': {
                k: float(v) / float(max(pe.total_by_root[root]['pgs'], 1))
                for k, v in six.iteritems(actual_by_root[root]['pgs'])
            },
            'objects': {
                k: float(v) / float(max(pe.total_by_root[root]['objects'], 1))
                for k, v in six.iteritems(actual_by_root[root]['objects'])
            },
            'bytes': {
                k: float(v) / float(max(pe.total_by_root[root]['bytes'], 1))
                for k, v in six.iteritems(actual_by_root[root]['bytes'])
            },
        }
    self.log.debug('actual_by_pool %s' % pe.actual_by_pool)
    self.log.debug('actual_by_root %s' % pe.actual_by_root)

    # average and stddev and score
    pe.stats_by_root = {
        a: pe.calc_stats(
            b,
            pe.target_by_root[a],
            pe.total_by_root[a]
        ) for a, b in six.iteritems(pe.count_by_root)
    }
    self.log.debug('stats_by_root %s' % pe.stats_by_root)

    # the scores are already normalized
    pe.score_by_root = {
        r: {
            'pgs': pe.stats_by_root[r]['pgs']['score'],
            'objects': pe.stats_by_root[r]['objects']['score'],
            'bytes': pe.stats_by_root[r]['bytes']['score'],
        } for r in pe.total_by_root.keys()
    }
    self.log.debug('score_by_root %s' % pe.score_by_root)

    # get the list of score metrics, comma separated
    metrics = self.get_module_option('crush_compat_metrics').split(',')

    # total score is just average of normalized stddevs
    pe.score = 0.0
    for r, vs in six.iteritems(pe.score_by_root):
        for k, v in six.iteritems(vs):
            if k in metrics:
                pe.score += v
    pe.score /= len(metrics) * len(roots)
    return pe
922 | ||
def evaluate(self, ms, pools, verbose=False):
    """Evaluate the distribution of *ms* and return a rendered report."""
    return self.calc_eval(ms, pools).show(verbose=verbose)
926 | ||
def optimize(self, plan):
    """Run one optimization pass for *plan*.

    Refuses to act while the cluster reports unknown, degraded or
    inactive PGs, or is already at/over the misplaced-objects ceiling;
    otherwise dispatches to the handler for the plan's mode.

    :param plan: the Plan to populate with proposed changes
    :return: ``(0, '')`` on success or ``(-errno, detail)`` otherwise
    """
    self.log.info('Optimize plan %s' % plan.name)
    max_misplaced = self.get_ceph_option('target_max_misplaced_ratio')
    self.log.info('Mode %s, max misplaced %f' %
                  (plan.mode, max_misplaced))

    info = self.get('pg_status')
    unknown = info.get('unknown_pgs_ratio', 0.0)
    degraded = info.get('degraded_ratio', 0.0)
    inactive = info.get('inactive_pgs_ratio', 0.0)
    misplaced = info.get('misplaced_ratio', 0.0)
    plan.pg_status = info
    self.log.debug('unknown %f degraded %f inactive %f misplaced %g',
                   unknown, degraded, inactive, misplaced)

    # Health guards: do not pile more data movement onto an unhealthy
    # cluster; ask the caller to retry later instead.
    if unknown > 0.0:
        detail = 'Some PGs (%f) are unknown; try again later' % unknown
        self.log.info(detail)
        return -errno.EAGAIN, detail
    if degraded > 0.0:
        detail = 'Some objects (%f) are degraded; try again later' % degraded
        self.log.info(detail)
        return -errno.EAGAIN, detail
    if inactive > 0.0:
        detail = 'Some PGs (%f) are inactive; try again later' % inactive
        self.log.info(detail)
        return -errno.EAGAIN, detail
    if misplaced >= max_misplaced:
        detail = 'Too many objects (%f > %f) are misplaced; ' \
                 'try again later' % (misplaced, max_misplaced)
        self.log.info(detail)
        return -errno.EAGAIN, detail

    # Dispatch on the configured balancing mode.
    if plan.mode == 'upmap':
        return self.do_upmap(plan)
    if plan.mode == 'crush-compat':
        return self.do_crush_compat(plan)
    if plan.mode == 'none':
        detail = 'Please do "ceph balancer mode" to choose a valid mode first'
        self.log.info('Idle')
        return -errno.ENOEXEC, detail
    detail = 'Unrecognized mode %s' % plan.mode
    self.log.info(detail)
    return -errno.EINVAL, detail
3efd9988 FG |
971 | |
def do_upmap(self, plan):
    """Compute pg-upmap-items optimizations for *plan*.

    Asks the osdmap to calculate upmap entries for each candidate pool,
    within a global budget of ``upmap_max_optimizations`` changes.

    :param plan: Plan whose incremental (``plan.inc``) receives the
        proposed upmap entries
    :return: ``(0, '')`` if any change was prepared, else a negative
        errno with a detail string
    """
    self.log.info('do_upmap')
    max_optimizations = self.get_module_option('upmap_max_optimizations')
    max_deviation = self.get_module_option('upmap_max_deviation')
    osdmap_dump = plan.osdmap_dump

    if len(plan.pools):
        pools = plan.pools
    else: # all
        pools = [str(i['pool_name']) for i in osdmap_dump.get('pools',[])]
    if len(pools) == 0:
        detail = 'No pools available'
        self.log.info(detail)
        return -errno.ENOENT, detail
    # shuffle pool list so they all get equal (in)attention
    random.shuffle(pools)
    self.log.info('pools %s' % pools)

    adjusted_pools = []
    inc = plan.inc
    total_did = 0
    left = max_optimizations  # remaining change budget
    # Pools mid PG-merge are skipped until the merge completes.
    pools_with_pg_merge = [p['pool_name'] for p in osdmap_dump.get('pools', [])
                           if p['pg_num'] > p['pg_num_target']]
    crush_rule_by_pool_name = dict((p['pool_name'], p['crush_rule']) for p in osdmap_dump.get('pools', []))
    # Filter the requested pools down to ones that exist and are stable.
    for pool in pools:
        if pool not in crush_rule_by_pool_name:
            self.log.info('pool %s does not exist' % pool)
            continue
        if pool in pools_with_pg_merge:
            self.log.info('pool %s has pending PG(s) for merging, skipping for now' % pool)
            continue
        adjusted_pools.append(pool)
    # shuffle so all pools get equal (in)attention
    # NOTE(review): 'pools' was already shuffled above, so this second
    # shuffle looks redundant but is harmless.
    random.shuffle(adjusted_pools)
    pool_dump = osdmap_dump.get('pools', [])
    for pool in adjusted_pools:
        # Look up pg_num and pool id; always found, since adjusted_pools
        # was filtered against this same dump via crush_rule_by_pool_name.
        num_pg = 0
        for p in pool_dump:
            if p['pool_name'] == pool:
                num_pg = p['pg_num']
                pool_id = p['pool']
                break

        # note that here we deliberately exclude any scrubbing pgs too
        # since scrubbing activities have significant impacts on performance
        num_pg_active_clean = 0
        for p in plan.pg_status.get('pgs_by_pool_state', []):
            pgs_pool_id = p['pool_id']
            if pgs_pool_id != pool_id:
                continue
            for s in p['pg_state_counts']:
                if s['state_name'] == 'active+clean':
                    num_pg_active_clean += s['count']
                    break
        # Charge non-clean PGs against the budget before optimizing.
        available = left - (num_pg - num_pg_active_clean)
        did = plan.osdmap.calc_pg_upmaps(inc, max_deviation, available, [pool])
        total_did += did
        left -= did
        if left <= 0:
            break
    self.log.info('prepared %d/%d changes' % (total_did, max_optimizations))
    if total_did == 0:
        return -errno.EALREADY, 'Unable to find further optimization, ' \
                                'or pool(s) pg_num is decreasing, ' \
                                'or distribution is already perfect'
    return 0, ''
3efd9988 FG |
1039 | |
def do_crush_compat(self, plan):
    """Optimize by iteratively adjusting compat weight-set weights.

    Repeatedly nudges per-OSD weight-set values toward their target
    share (by at most ``crush_compat_step`` per iteration), re-scoring
    after each step and keeping only steps that improve the score
    without exceeding the misplaced-objects ceiling.

    :param plan: Plan whose ``compat_ws`` / ``osd_weights`` receive the
        winning weights
    :return: ``(0, '')`` on improvement, else ``(-errno, detail)``
    """
    self.log.info('do_crush_compat')
    max_iterations = self.get_module_option('crush_compat_max_iterations')
    if max_iterations < 1:
        return -errno.EINVAL, '"crush_compat_max_iterations" must be >= 1'
    step = self.get_module_option('crush_compat_step')
    if step <= 0 or step >= 1.0:
        return -errno.EINVAL, '"crush_compat_step" must be in (0, 1)'
    max_misplaced = self.get_ceph_option('target_max_misplaced_ratio')
    min_pg_per_osd = 2  # roots with fewer PGs per OSD are skipped

    ms = plan.initial
    osdmap = ms.osdmap
    crush = osdmap.get_crush()
    pe = self.calc_eval(ms, plan.pools)
    min_score_to_optimize = self.get_module_option('min_score')
    if pe.score <= min_score_to_optimize:
        if pe.score == 0:
            detail = 'Distribution is already perfect'
        else:
            detail = 'score %f <= min_score %f, will not optimize' \
                     % (pe.score, min_score_to_optimize)
        self.log.info(detail)
        return -errno.EALREADY, detail

    # get current osd reweights
    orig_osd_weight = { a['osd']: a['weight']
                        for a in ms.osdmap_dump.get('osds',[]) }
    # NOTE(review): appears unused within this method.
    reweighted_osds = [ a for a,b in six.iteritems(orig_osd_weight)
                        if b < 1.0 and b > 0.0 ]

    # get current compat weight-set weights
    orig_ws = self.get_compat_weight_set_weights(ms)
    if not orig_ws:
        return -errno.EAGAIN, 'compat weight-set not available'
    # keep only actual OSDs (non-negative CRUSH ids)
    orig_ws = { a: b for a, b in six.iteritems(orig_ws) if a >= 0 }

    # Make sure roots don't overlap their devices.  If so, we
    # can't proceed.
    roots = list(pe.target_by_root.keys())
    self.log.debug('roots %s', roots)
    visited = {}
    overlap = {}
    root_ids = {}  # NOTE(review): appears unused within this method.
    for root, wm in six.iteritems(pe.target_by_root):
        for osd in wm:
            if osd in visited:
                if osd not in overlap:
                    overlap[osd] = [ visited[osd] ]
                overlap[osd].append(root)
            visited[osd] = root
    if len(overlap) > 0:
        detail = 'Some osds belong to multiple subtrees: %s' % \
                 overlap
        self.log.error(detail)
        return -errno.EOPNOTSUPP, detail

    # rebalance by pgs, objects, or bytes
    metrics = self.get_module_option('crush_compat_metrics').split(',')
    key = metrics[0] # balancing using the first score metric
    if key not in ['pgs', 'bytes', 'objects']:
        self.log.warn("Invalid crush_compat balancing key %s. Using 'pgs'." % key)
        key = 'pgs'

    # go
    best_ws = copy.deepcopy(orig_ws)         # best weight-set so far
    best_ow = copy.deepcopy(orig_osd_weight) # best reweights so far
    best_pe = pe
    left = max_iterations
    bad_steps = 0
    next_ws = copy.deepcopy(best_ws)
    next_ow = copy.deepcopy(best_ow)
    while left > 0:
        # adjust
        self.log.debug('best_ws %s' % best_ws)
        random.shuffle(roots)
        for root in roots:
            pools = best_pe.root_pools[root]
            osds = len(best_pe.target_by_root[root])
            min_pgs = osds * min_pg_per_osd
            if best_pe.total_by_root[root][key] < min_pgs:
                self.log.info('Skipping root %s (pools %s), total pgs %d '
                              '< minimum %d (%d per osd)',
                              root, pools,
                              best_pe.total_by_root[root][key],
                              min_pgs, min_pg_per_osd)
                continue
            self.log.info('Balancing root %s (pools %s) by %s' %
                          (root, pools, key))
            target = best_pe.target_by_root[root]
            actual = best_pe.actual_by_root[root][key]
            # Most-deviant OSDs first.
            queue = sorted(actual.keys(),
                           key=lambda osd: -abs(target[osd] - actual[osd]))
            for osd in queue:
                if orig_osd_weight[osd] == 0:
                    self.log.debug('skipping out osd.%d', osd)
                else:
                    deviation = target[osd] - actual[osd]
                    if deviation == 0:
                        break
                    self.log.debug('osd.%d deviation %f', osd, deviation)
                    weight = best_ws[osd]
                    ow = orig_osd_weight[osd]
                    if actual[osd] > 0:
                        calc_weight = target[osd] / actual[osd] * weight * ow
                    else:
                        # for newly created osds, reset calc_weight at target value
                        # this way weight-set will end up absorbing *step* of its
                        # target (final) value at the very beginning and slowly catch up later.
                        # note that if this turns out causing too many misplaced
                        # pgs, then we'll reduce step and retry
                        calc_weight = target[osd]
                    # Blend current and calculated weight by 'step'.
                    new_weight = weight * (1.0 - step) + calc_weight * step
                    self.log.debug('Reweight osd.%d %f -> %f', osd, weight,
                                   new_weight)
                    next_ws[osd] = new_weight
                    if ow < 1.0:
                        # Also phase reweighted-down OSDs back toward 1.0.
                        new_ow = min(1.0, max(step + (1.0 - step) * ow,
                                              ow + .005))
                        self.log.debug('Reweight osd.%d reweight %f -> %f',
                                       osd, ow, new_ow)
                        next_ow[osd] = new_ow

            # normalize weights under this root
            root_weight = crush.get_item_weight(pe.root_ids[root])
            root_sum = sum(b for a,b in six.iteritems(next_ws)
                           if a in target.keys())
            if root_sum > 0 and root_weight > 0:
                factor = root_sum / root_weight
                self.log.debug('normalizing root %s %d, weight %f, '
                               'ws sum %f, factor %f',
                               root, pe.root_ids[root], root_weight,
                               root_sum, factor)
                for osd in actual.keys():
                    next_ws[osd] = next_ws[osd] / factor

        # recalc
        plan.compat_ws = copy.deepcopy(next_ws)
        next_ms = plan.final_state()
        next_pe = self.calc_eval(next_ms, plan.pools)
        next_misplaced = next_ms.calc_misplaced_from(ms)
        self.log.debug('Step result score %f -> %f, misplacing %f',
                       best_pe.score, next_pe.score, next_misplaced)

        if next_misplaced > max_misplaced:
            # Step would move too much data: either stop (if we already
            # improved) or retry with a smaller step from the best state.
            if best_pe.score < pe.score:
                self.log.debug('Step misplaced %f > max %f, stopping',
                               next_misplaced, max_misplaced)
                break
            step /= 2.0
            next_ws = copy.deepcopy(best_ws)
            next_ow = copy.deepcopy(best_ow)
            self.log.debug('Step misplaced %f > max %f, reducing step to %f',
                           next_misplaced, max_misplaced, step)
        else:
            if next_pe.score > best_pe.score * 1.0001:
                # Score regressed; tolerate a few random restarts before
                # shrinking the step.
                bad_steps += 1
                if bad_steps < 5 and random.randint(0, 100) < 70:
                    self.log.debug('Score got worse, taking another step')
                else:
                    step /= 2.0
                    next_ws = copy.deepcopy(best_ws)
                    next_ow = copy.deepcopy(best_ow)
                    self.log.debug('Score got worse, trying smaller step %f',
                                   step)
            else:
                # Improvement (or negligible change): adopt as new best.
                bad_steps = 0
                best_pe = next_pe
                best_ws = copy.deepcopy(next_ws)
                best_ow = copy.deepcopy(next_ow)
                if best_pe.score == 0:
                    break
        left -= 1

    # allow a small regression if we are phasing out osd weights
    fudge = 0
    if best_ow != orig_osd_weight:
        fudge = .001

    if best_pe.score < pe.score + fudge:
        self.log.info('Success, score %f -> %f', pe.score, best_pe.score)
        plan.compat_ws = best_ws
        for osd, w in six.iteritems(best_ow):
            if w != orig_osd_weight[osd]:
                self.log.debug('osd.%d reweight %f', osd, w)
                plan.osd_weights[osd] = w
        return 0, ''
    else:
        self.log.info('Failed to find further optimization, score %f',
                      pe.score)
        plan.compat_ws = {}
        return -errno.EDOM, 'Unable to find further optimization, ' \
                            'change balancer mode and retry might help'
1233 | ||
def get_compat_weight_set_weights(self, ms):
    """Return the compat weight-set weight for every CRUSH item.

    If the crush map has no default choose_args yet, creates the compat
    weight-set via a mon command and re-dumps the crush map; otherwise
    uses the dump already cached in *ms*.

    :param ms: MappingState providing ``crush_dump``
    :return: dict of item id -> weight-set weight, or None if the mon
        commands failed
    :raises RuntimeError: on an unparseable crush dump or a weight-set
        inconsistent with its bucket
    """
    if not CRUSHMap.have_default_choose_args(ms.crush_dump):
        # enable compat weight-set first
        self.log.debug('ceph osd crush weight-set create-compat')
        result = CommandResult('')
        self.send_command(result, 'mon', '', json.dumps({
            'prefix': 'osd crush weight-set create-compat',
            'format': 'json',
        }), '')
        r, outb, outs = result.wait()
        if r != 0:
            self.log.error('Error creating compat weight-set')
            return

        # re-dump the crush map so it includes the new weight-set
        result = CommandResult('')
        self.send_command(result, 'mon', '', json.dumps({
            'prefix': 'osd crush dump',
            'format': 'json',
        }), '')
        r, outb, outs = result.wait()
        if r != 0:
            self.log.error('Error dumping crush map')
            return
        try:
            crushmap = json.loads(outb)
        # was a bare 'except:', which would also swallow SystemExit and
        # KeyboardInterrupt; json.loads failures raise ValueError
        # (json.JSONDecodeError is a subclass).
        except ValueError:
            raise RuntimeError('unable to parse crush map')
    else:
        crushmap = ms.crush_dump

    raw = CRUSHMap.get_default_choose_args(crushmap)
    weight_set = {}
    for b in raw:
        # locate the bucket this choose_args entry refers to
        bucket = None
        for t in crushmap['buckets']:
            if t['id'] == b['bucket_id']:
                bucket = t
                break
        if not bucket:
            raise RuntimeError('could not find bucket %s' % b['bucket_id'])
        self.log.debug('bucket items %s' % bucket['items'])
        self.log.debug('weight set %s' % b['weight_set'][0])
        if len(bucket['items']) != len(b['weight_set'][0]):
            raise RuntimeError('weight-set size does not match bucket items')
        # positional pairing: weight_set[0][pos] belongs to items[pos]
        for pos in range(len(bucket['items'])):
            weight_set[bucket['items'][pos]['id']] = b['weight_set'][0][pos]

    self.log.debug('weight_set weights %s' % weight_set)
    return weight_set
1283 | ||
def do_crush(self):
    """Stub for a future raw-CRUSH optimizer; currently only logs."""
    msg = 'do_crush (not yet implemented)'
    self.log.info(msg)
1286 | ||
def do_osd_weight(self):
    """Stub for a future osd-reweight optimizer; currently only logs."""
    msg = 'do_osd_weight (not yet implemented)'
    self.log.info(msg)
1289 | ||
def execute(self, plan):
    """Apply *plan* to the cluster by issuing mon commands.

    Sends (in order): compat weight-set creation if needed, weight-set
    reweights, osd reweightn, then the upmap additions/removals from the
    plan's incremental, and finally waits for all queued commands.

    :param plan: the Plan to execute
    :return: ``(0, '')`` on success, or the first failing command's
        ``(r, outs)``
    """
    self.log.info('Executing plan %s' % plan.name)

    commands = []  # queued CommandResults, awaited at the end

    # compat weight-set
    if len(plan.compat_ws) and \
       not CRUSHMap.have_default_choose_args(plan.initial.crush_dump):
        self.log.debug('ceph osd crush weight-set create-compat')
        result = CommandResult('')
        self.send_command(result, 'mon', '', json.dumps({
            'prefix': 'osd crush weight-set create-compat',
            'format': 'json',
        }), '')
        # this one is awaited immediately: later reweights depend on it
        r, outb, outs = result.wait()
        if r != 0:
            self.log.error('Error creating compat weight-set')
            return r, outs

    for osd, weight in six.iteritems(plan.compat_ws):
        self.log.info('ceph osd crush weight-set reweight-compat osd.%d %f',
                      osd, weight)
        result = CommandResult('')
        self.send_command(result, 'mon', '', json.dumps({
            'prefix': 'osd crush weight-set reweight-compat',
            'format': 'json',
            'item': 'osd.%d' % osd,
            'weight': [weight],
        }), '')
        commands.append(result)

    # new_weight
    reweightn = {}
    for osd, weight in six.iteritems(plan.osd_weights):
        # reweightn expects weights as 16.16 fixed-point integers
        reweightn[str(osd)] = str(int(weight * float(0x10000)))
    if len(reweightn):
        self.log.info('ceph osd reweightn %s', reweightn)
        result = CommandResult('')
        self.send_command(result, 'mon', '', json.dumps({
            'prefix': 'osd reweightn',
            'format': 'json',
            'weights': json.dumps(reweightn),
        }), '')
        commands.append(result)

    # upmap
    incdump = plan.inc.dump()
    for item in incdump.get('new_pg_upmap', []):
        self.log.info('ceph osd pg-upmap %s mappings %s', item['pgid'],
                      item['osds'])
        result = CommandResult('foo')
        self.send_command(result, 'mon', '', json.dumps({
            'prefix': 'osd pg-upmap',
            'format': 'json',
            'pgid': item['pgid'],
            'id': item['osds'],
        }), 'foo')
        commands.append(result)

    for pgid in incdump.get('old_pg_upmap', []):
        self.log.info('ceph osd rm-pg-upmap %s', pgid)
        result = CommandResult('foo')
        self.send_command(result, 'mon', '', json.dumps({
            'prefix': 'osd rm-pg-upmap',
            'format': 'json',
            'pgid': pgid,
        }), 'foo')
        commands.append(result)

    for item in incdump.get('new_pg_upmap_items', []):
        self.log.info('ceph osd pg-upmap-items %s mappings %s', item['pgid'],
                      item['mappings'])
        # flatten [{'from': a, 'to': b}, ...] into [a, b, ...]
        osdlist = []
        for m in item['mappings']:
            osdlist += [m['from'], m['to']]
        result = CommandResult('foo')
        self.send_command(result, 'mon', '', json.dumps({
            'prefix': 'osd pg-upmap-items',
            'format': 'json',
            'pgid': item['pgid'],
            'id': osdlist,
        }), 'foo')
        commands.append(result)

    for pgid in incdump.get('old_pg_upmap_items', []):
        self.log.info('ceph osd rm-pg-upmap-items %s', pgid)
        result = CommandResult('foo')
        self.send_command(result, 'mon', '', json.dumps({
            'prefix': 'osd rm-pg-upmap-items',
            'format': 'json',
            'pgid': pgid,
        }), 'foo')
        commands.append(result)

    # wait for commands
    self.log.debug('commands %s' % commands)
    for result in commands:
        r, outb, outs = result.wait()
        if r != 0:
            self.log.error('execute error: r = %d, detail = %s' % (r, outs))
            return r, outs
    self.log.debug('done')
    return 0, ''
eafe8130 TL |
1393 | |
def gather_telemetry(self):
    """Report the balancer's on/off state and mode for telemetry."""
    return dict(active=self.active, mode=self.mode)