1 | """ |
2 | Automatically scale pg_num based on how much data is stored in each pool. | |
3 | """ | |
4 | ||
5 | import errno | |
6 | import json | |
7 | import mgr_util | |
8 | import threading | |
9 | import uuid | |
81eedcae | 10 | from six import itervalues, iteritems |
11fdf7f2 | 11 | from collections import defaultdict |
eafe8130 | 12 | from prettytable import PrettyTable, PLAIN_COLUMNS |

from mgr_module import MgrModule

"""
Some terminology is made up for the purposes of this module:

 - "raw pgs": pg count after applying replication, i.e. the real resource
   consumption of a pool.
 - "grow/shrink" - increase/decrease the pg_num in a pool
 - "crush subtree" - non-overlapping domains in crush hierarchy: used as
   units of resource management.
"""

INTERVAL = 5

PG_NUM_MIN = 4  # unless specified on a per-pool basis

def nearest_power_of_two(n):
    v = int(n)

    v -= 1
    v |= v >> 1
    v |= v >> 2
    v |= v >> 4
    v |= v >> 8
    v |= v >> 16

    # High bound power of two
    v += 1
    # Low bound power of two
    x = v >> 1

    return x if (v - n) > (n - x) else v
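
# A quick sanity check of the rounding behaviour (values worked out from the
# bit-twiddling above, not taken from the original source); note that exact
# midpoints round up:
#
#   nearest_power_of_two(5)  == 4    # 5 is closer to 4 than to 8
#   nearest_power_of_two(3)  == 4    # midpoint between 2 and 4 -> rounds up
#   nearest_power_of_two(12) == 16   # midpoint between 8 and 16 -> rounds up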


class PgAutoscaler(MgrModule):
    """
    PG autoscaler.
    """
    COMMANDS = [
        {
            "cmd": "osd pool autoscale-status",
            "desc": "report on pool pg_num sizing recommendation and intent",
            "perm": "r"
        },
    ]

    NATIVE_OPTIONS = [
        'mon_target_pg_per_osd',
        'mon_max_pg_per_osd',
    ]

    MODULE_OPTIONS = [
        {
            'name': 'sleep_interval',
            'default': str(60),
        },
    ]
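
    # Both option sets are (re)loaded by config_notify() below. An operator
    # would typically tune the module option with something like this
    # (illustrative invocation, not from the original source):
    #   ceph config set mgr mgr/pg_autoscaler/sleep_interval 60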

    def __init__(self, *args, **kwargs):
        super(PgAutoscaler, self).__init__(*args, **kwargs)
        self._shutdown = threading.Event()

        # So much of what we do peeks at the osdmap that it's easiest
        # to just keep a copy of the pythonized version.
        self._osd_map = None

    def config_notify(self):
        for opt in self.NATIVE_OPTIONS:
            setattr(self,
                    opt,
                    self.get_ceph_option(opt))
            self.log.debug(' native option %s = %s', opt, getattr(self, opt))
        for opt in self.MODULE_OPTIONS:
            setattr(self,
                    opt['name'],
                    self.get_module_option(opt['name']) or opt['default'])
            self.log.debug(' mgr option %s = %s',
                           opt['name'], getattr(self, opt['name']))

    def handle_command(self, inbuf, cmd):
        if cmd['prefix'] == "osd pool autoscale-status":
            retval = self._command_autoscale_status(cmd)
        else:
            assert False  # ceph-mgr should never pass us unknown cmds
        return retval

    def _command_autoscale_status(self, cmd):
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools_by_name()
        ps, root_map, pool_root = self._get_pool_status(osdmap, pools)

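        # Machine-readable output can be requested from the CLI, e.g.
        # (illustrative): ceph osd pool autoscale-status --format json-pretty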
        if cmd.get('format') == 'json' or cmd.get('format') == 'json-pretty':
            return 0, json.dumps(ps, indent=2), ''
        else:
            table = PrettyTable(['POOL', 'SIZE', 'TARGET SIZE',
                                 'RATE', 'RAW CAPACITY',
                                 'RATIO', 'TARGET RATIO',
                                 'BIAS',
                                 'PG_NUM',
                                 # 'IDEAL',
                                 'NEW PG_NUM', 'AUTOSCALE'],
                                border=False)
            table.left_padding_width = 0
            table.right_padding_width = 1
            table.align['POOL'] = 'l'
            table.align['SIZE'] = 'r'
            table.align['TARGET SIZE'] = 'r'
            table.align['RATE'] = 'r'
            table.align['RAW CAPACITY'] = 'r'
            table.align['RATIO'] = 'r'
            table.align['TARGET RATIO'] = 'r'
            table.align['BIAS'] = 'r'
            table.align['PG_NUM'] = 'r'
            # table.align['IDEAL'] = 'r'
            table.align['NEW PG_NUM'] = 'r'
            table.align['AUTOSCALE'] = 'l'
            for p in ps:
                if p['would_adjust']:
                    final = str(p['pg_num_final'])
                else:
                    final = ''
                if p['target_bytes'] > 0:
                    ts = mgr_util.format_bytes(p['target_bytes'], 6)
                else:
                    ts = ''
                if p['target_ratio'] > 0.0:
                    tr = '%.4f' % p['target_ratio']
                else:
                    tr = ''
                table.add_row([
                    p['pool_name'],
                    mgr_util.format_bytes(p['logical_used'], 6),
                    ts,
                    p['raw_used_rate'],
                    mgr_util.format_bytes(p['subtree_capacity'], 6),
                    '%.4f' % p['capacity_ratio'],
                    tr,
                    p['bias'],
                    p['pg_num_target'],
                    # p['pg_num_ideal'],
                    final,
                    p['pg_autoscale_mode'],
                ])
            return 0, table.get_string(), ''

    def serve(self):
        self.config_notify()
        while not self._shutdown.is_set():
            self._maybe_adjust()
            self._shutdown.wait(timeout=int(self.sleep_interval))

    def get_subtree_resource_status(self, osdmap, crush):
        """
        For each CRUSH subtree of interest (i.e. the roots under which
        we have pools), calculate the current resource usages and targets,
        such as how many PGs there are, vs. how many PGs we would
        like there to be.
        """
        result = {}
        pool_root = {}
        roots = []

        class CrushSubtreeResourceStatus(object):
            def __init__(self):
                self.root_ids = []
                self.osds = set()
                self.osd_count = None  # Number of OSDs
                self.pg_target = None  # Ideal full-capacity PG count?
                self.pg_current = 0  # How many PGs already?
                self.capacity = None  # Total capacity of OSDs in subtree
                self.pool_ids = []
                self.pool_names = []

        # identify subtrees (note that they may overlap!)
        for pool_id, pool in osdmap.get_pools().items():
            cr_name = crush.get_rule_by_id(pool['crush_rule'])['rule_name']
            root_id = int(crush.get_rule_root(cr_name))
            pool_root[pool_id] = root_id
            osds = set(crush.get_osds_under(root_id))

            # do we intersect an existing root?
            s = None
            for prev in itervalues(result):
                if osds & prev.osds:
                    s = prev
                    break
            if not s:
                s = CrushSubtreeResourceStatus()
                roots.append(s)
            result[root_id] = s
            s.root_ids.append(root_id)
            s.osds |= osds
            s.pool_ids.append(int(pool_id))
            s.pool_names.append(pool['pool_name'])
            s.pg_current += pool['pg_num_target'] * pool['size']

        # finish subtrees
        all_stats = self.get('osd_stats')
        for s in roots:
            s.osd_count = len(s.osds)
            s.pg_target = s.osd_count * int(self.mon_target_pg_per_osd)
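            # e.g. a subtree of 10 OSDs with the (Nautilus-era) default
            # mon_target_pg_per_osd of 100 gets a budget of ~1000 raw PGs --
            # illustrative numbers; the real value is whatever
            # config_notify() loaded.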

            capacity = 0.0
            for osd_stats in all_stats['osd_stats']:
                if osd_stats['osd'] in s.osds:
                    # Intentionally do not apply the OSD's reweight to
                    # this, because we want to calculate PG counts based
                    # on the physical storage available, not how it is
                    # reweighted right now.
                    capacity += osd_stats['kb'] * 1024

            s.capacity = capacity

            self.log.debug('root_ids %s pools %s with %d osds, pg_target %d',
                           s.root_ids,
                           s.pool_ids,
                           s.osd_count,
                           s.pg_target)

        return result, pool_root

    def _get_pool_status(
            self,
            osdmap,
            pools,
            threshold=3.0,
    ):
        assert threshold >= 2.0

        crush_map = osdmap.get_crush()

        root_map, pool_root = self.get_subtree_resource_status(osdmap, crush_map)

        df = self.get('df')
        pool_stats = dict([(p['id'], p['stats']) for p in df['pools']])

        ret = []

        # iterate over all pools to determine how they should be sized
        for pool_name, p in iteritems(pools):
            pool_id = p['pool']
            if pool_id not in pool_stats:
                # race with pool deletion; skip
                continue

            # FIXME: we assume there is only one take per pool, but that
            # may not be true.
            cr_name = crush_map.get_rule_by_id(p['crush_rule'])['rule_name']
            root_id = int(crush_map.get_rule_root(cr_name))
            pool_root[pool_name] = root_id

            capacity = root_map[root_id].capacity
            if capacity == 0:
                self.log.debug('skipping empty subtree %s', cr_name)
                continue

            raw_used_rate = osdmap.pool_raw_used_rate(pool_id)

            pool_logical_used = pool_stats[pool_id]['stored']
            bias = p['options'].get('pg_autoscale_bias', 1.0)
            target_bytes = p['options'].get('target_size_bytes', 0)

            # What proportion of space are we using?
            actual_raw_used = pool_logical_used * raw_used_rate
            actual_capacity_ratio = float(actual_raw_used) / capacity

            pool_raw_used = max(pool_logical_used, target_bytes) * raw_used_rate
            capacity_ratio = float(pool_raw_used) / capacity

            target_ratio = p['options'].get('target_size_ratio', 0.0)
            final_ratio = max(capacity_ratio, target_ratio)

            # So what proportion of pg allowance should we be using?
            pool_pg_target = (final_ratio * root_map[root_id].pg_target) / raw_used_rate * bias

            final_pg_target = max(p['options'].get('pg_num_min', PG_NUM_MIN),
                                  nearest_power_of_two(pool_pg_target))
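            # Worked example with illustrative numbers (not from the original
            # source): a pool occupying 25% of its subtree (final_ratio 0.25)
            # under a 1000-PG budget, with 3x replication (raw_used_rate 3)
            # and bias 1.0, wants 0.25 * 1000 / 3 ~= 83 PGs, which
            # nearest_power_of_two() quantizes down to 64.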

            self.log.info("Pool '{0}' root_id {1} using {2} of space, bias {3}, "
                          "pg target {4} quantized to {5} (current {6})".format(
                              p['pool_name'],
                              root_id,
                              final_ratio,
                              bias,
                              pool_pg_target,
                              final_pg_target,
                              p['pg_num_target']
                          ))

            adjust = False
            if (final_pg_target > p['pg_num_target'] * threshold or
                    final_pg_target <= p['pg_num_target'] / threshold) and \
                    final_ratio >= 0.0 and \
                    final_ratio <= 1.0:
                adjust = True
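            # i.e. only propose a change when the ideal count is off by at
            # least `threshold` (3x by default); presumably this hysteresis
            # avoids churning pg_num (and the resulting data movement) over
            # small estimation changes.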

            ret.append({
                'pool_id': pool_id,
                'pool_name': p['pool_name'],
                'crush_root_id': root_id,
                'pg_autoscale_mode': p['pg_autoscale_mode'],
                'pg_num_target': p['pg_num_target'],
                'logical_used': pool_logical_used,
                'target_bytes': target_bytes,
                'raw_used_rate': raw_used_rate,
                'subtree_capacity': capacity,
                'actual_raw_used': actual_raw_used,
                'raw_used': pool_raw_used,
                'actual_capacity_ratio': actual_capacity_ratio,
                'capacity_ratio': capacity_ratio,
                'target_ratio': target_ratio,
                'pg_num_ideal': int(pool_pg_target),
                'pg_num_final': final_pg_target,
                'would_adjust': adjust,
                'bias': p.get('options', {}).get('pg_autoscale_bias', 1.0),
            })

        return (ret, root_map, pool_root)

    def _maybe_adjust(self):
        self.log.info('_maybe_adjust')
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools_by_name()
        ps, root_map, pool_root = self._get_pool_status(osdmap, pools)

        # For any pool in 'warn' mode, set a health message and then drop
        # the pool from consideration.
        too_few = []
        too_many = []
        health_checks = {}

        total_ratio = dict([(r, 0.0) for r in iter(root_map)])
        total_target_ratio = dict([(r, 0.0) for r in iter(root_map)])
        target_ratio_pools = dict([(r, []) for r in iter(root_map)])

        total_bytes = dict([(r, 0) for r in iter(root_map)])
        total_target_bytes = dict([(r, 0.0) for r in iter(root_map)])
        target_bytes_pools = dict([(r, []) for r in iter(root_map)])

        for p in ps:
            total_ratio[p['crush_root_id']] += max(p['actual_capacity_ratio'],
                                                   p['target_ratio'])
            if p['target_ratio'] > 0:
                total_target_ratio[p['crush_root_id']] += p['target_ratio']
                target_ratio_pools[p['crush_root_id']].append(p['pool_name'])
            total_bytes[p['crush_root_id']] += max(
                p['actual_raw_used'],
                p['target_bytes'] * p['raw_used_rate'])
            if p['target_bytes'] > 0:
                total_target_bytes[p['crush_root_id']] += p['target_bytes'] * p['raw_used_rate']
                target_bytes_pools[p['crush_root_id']].append(p['pool_name'])
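            # Note both byte sums are in raw terms: e.g. target_size_bytes of
            # 100 TiB on a 3x-replicated pool counts as 300 TiB against the
            # subtree's raw capacity (illustrative figures).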
            if not p['would_adjust']:
                continue
            if p['pg_autoscale_mode'] == 'warn':
                msg = 'Pool %s has %d placement groups, should have %d' % (
                    p['pool_name'],
                    p['pg_num_target'],
                    p['pg_num_final'])
                if p['pg_num_final'] > p['pg_num_target']:
                    too_few.append(msg)
                else:
                    too_many.append(msg)

            if p['pg_autoscale_mode'] == 'on':
                # Note that setting pg_num actually sets pg_num_target (see
                # OSDMonitor.cc)
                r = self.mon_command({
                    'prefix': 'osd pool set',
                    'pool': p['pool_name'],
                    'var': 'pg_num',
                    'val': str(p['pg_num_final'])
                })
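                # Equivalent to an operator running, e.g. (illustrative):
                #   ceph osd pool set <pool> pg_num <n>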

                if r[0] != 0:
                    # FIXME: this is a serious and unexpected thing,
                    # we should expose it as a cluster log error once
                    # the hook for doing that from ceph-mgr modules is
                    # in.
                    self.log.error("pg_num adjustment on {0} to {1} failed: {2}"
                                   .format(p['pool_name'],
                                           p['pg_num_final'], r))

        if too_few:
            summary = "{0} pools have too few placement groups".format(
                len(too_few))
            health_checks['POOL_TOO_FEW_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'detail': too_few
            }
        if too_many:
            summary = "{0} pools have too many placement groups".format(
                len(too_many))
            health_checks['POOL_TOO_MANY_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'detail': too_many
            }

        too_much_target_ratio = []
        for root_id, total in iteritems(total_ratio):
            total_target = total_target_ratio[root_id]
            if total > 1.0:
                too_much_target_ratio.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'target_size_ratio %.03f on pools %s' % (
                        root_map[root_id].pool_names,
                        total,
                        total_target,
                        target_ratio_pools[root_id]
                    )
                )
            elif total_target > 1.0:
                too_much_target_ratio.append(
                    'Pools %s have collective target_size_ratio %.03f > 1.0' % (
                        root_map[root_id].pool_names,
                        total_target
                    )
                )
        if too_much_target_ratio:
            health_checks['POOL_TARGET_SIZE_RATIO_OVERCOMMITTED'] = {
                'severity': 'warning',
                'summary': "%d subtrees have overcommitted pool target_size_ratio" % len(too_much_target_ratio),
                'detail': too_much_target_ratio,
            }

        too_much_target_bytes = []
        for root_id, total in iteritems(total_bytes):
            total_target = total_target_bytes[root_id]
            if total > root_map[root_id].capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'target_size_bytes %s on pools %s' % (
                        root_map[root_id].pool_names,
                        total / root_map[root_id].capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                        target_bytes_pools[root_id]
                    )
                )
            elif total_target > root_map[root_id].capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'collective target_size_bytes of %s' % (
                        root_map[root_id].pool_names,
                        # in this branch only total_target exceeds capacity,
                        # so report the overcommit factor it implies
                        total_target / root_map[root_id].capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                    )
                )
        if too_much_target_bytes:
            health_checks['POOL_TARGET_SIZE_BYTES_OVERCOMMITTED'] = {
                'severity': 'warning',
                'summary': "%d subtrees have overcommitted pool target_size_bytes" % len(too_much_target_bytes),
                'detail': too_much_target_bytes,
            }

        self.set_health_checks(health_checks)