"""
Automatically scale pg_num based on how much data is stored in each pool.
"""

import json
import mgr_util
import threading
from typing import Any, Dict, List, Optional, Set, Tuple, TYPE_CHECKING, Union
import uuid
from prettytable import PrettyTable
from mgr_module import HealthChecksT, CLIReadCommand, CLIWriteCommand, CRUSHMap, MgrModule, Option, OSDMap

"""
Some terminology is made up for the purposes of this module:

- "raw pgs": pg count after applying replication, i.e. the real resource
  consumption of a pool.
- "grow/shrink" - increase/decrease the pg_num in a pool
- "crush subtree" - non-overlapping domains in crush hierarchy: used as
  units of resource management.
"""

INTERVAL = 5

PG_NUM_MIN = 32  # unless specified on a per-pool basis

if TYPE_CHECKING:
    import sys
    if sys.version_info >= (3, 8):
        from typing import Literal
    else:
        from typing_extensions import Literal

    PassT = Literal['first', 'second', 'third']


def nearest_power_of_two(n: int) -> int:
    v = int(n)

    v -= 1
    v |= v >> 1
    v |= v >> 2
    v |= v >> 4
    v |= v >> 8
    v |= v >> 16

    # High bound power of two
    v += 1

    # Low bound power of two
    x = v >> 1

    return x if (v - n) > (n - x) else v
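
# A quick sanity sketch of the rounding (hand-checked values):
#   nearest_power_of_two(200) == 256   # 256 - 200 = 56 beats 200 - 128 = 72
#   nearest_power_of_two(130) == 128   # 130 - 128 = 2 beats 256 - 130 = 126
#   nearest_power_of_two(64) == 64     # exact powers of two pass through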


def effective_target_ratio(target_ratio: float,
                           total_target_ratio: float,
                           total_target_bytes: int,
                           capacity: int) -> float:
    """
    Returns the target ratio after normalizing for ratios across pools and
    adjusting for capacity reserved by pools that have target_size_bytes set.
    """
    target_ratio = float(target_ratio)
    if total_target_ratio:
        target_ratio = target_ratio / total_target_ratio

    if total_target_bytes and capacity:
        fraction_available = 1.0 - min(1.0, float(total_target_bytes) / capacity)
        target_ratio *= fraction_available

    return target_ratio
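
# Illustrative example (hypothetical numbers): two pools with
# target_size_ratio 2.0 and 6.0 normalize to 0.25 and 0.75; if other pools'
# target_size_bytes already reserve 20% of the subtree's capacity, both are
# further scaled by 0.8, to 0.20 and 0.60.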


class PgAdjustmentProgress(object):
    """
    Keeps the initial and target pg_num values
    """

    def __init__(self, pool_id: int, pg_num: int, pg_num_target: int) -> None:
        self.ev_id = str(uuid.uuid4())
        self.pool_id = pool_id
        self.reset(pg_num, pg_num_target)

    def reset(self, pg_num: int, pg_num_target: int) -> None:
        self.pg_num = pg_num
        self.pg_num_target = pg_num_target

    def update(self, module: MgrModule, progress: float) -> None:
        desc = 'increasing' if self.pg_num < self.pg_num_target else 'decreasing'
        module.remote('progress', 'update', self.ev_id,
                      ev_msg="PG autoscaler %s pool %d PGs from %d to %d" %
                      (desc, self.pool_id, self.pg_num, self.pg_num_target),
                      ev_progress=progress,
                      refs=[("pool", self.pool_id)])

11fdf7f2 | 97 | |
522d829b TL |
98 | class CrushSubtreeResourceStatus: |
99 | def __init__(self) -> None: | |
100 | self.root_ids: List[int] = [] | |
101 | self.osds: Set[int] = set() | |
102 | self.osd_count: Optional[int] = None # Number of OSDs | |
103 | self.pg_target: Optional[int] = None # Ideal full-capacity PG count? | |
104 | self.pg_current = 0 # How many PGs already? | |
105 | self.pg_left = 0 | |
106 | self.capacity: Optional[int] = None # Total capacity of OSDs in subtree | |
107 | self.pool_ids: List[int] = [] | |
108 | self.pool_names: List[str] = [] | |
109 | self.pool_count: Optional[int] = None | |
110 | self.pool_used = 0 | |
111 | self.total_target_ratio = 0.0 | |
112 | self.total_target_bytes = 0 # including replication / EC overhead | |
113 | ||

class PgAutoscaler(MgrModule):
    """
    PG autoscaler.
    """
    NATIVE_OPTIONS = [
        'mon_target_pg_per_osd',
        'mon_max_pg_per_osd',
    ]

    MODULE_OPTIONS = [
        Option(
            name='sleep_interval',
            type='secs',
            default=60),

        Option(
            name='threshold',
            type='float',
            desc='scaling threshold',
            long_desc=('The factor by which the `NEW PG_NUM` must vary from the current '
                       '`PG_NUM` before being accepted. Cannot be less than 1.0'),
            default=3.0,
            min=1.0),
        Option(
            name='noautoscale',
            type='bool',
            desc='global autoscale flag',
            long_desc=('Option to turn on/off the autoscaler for all pools'),
            default=False),
    ]
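
    # The options above are read back in config_notify(); assuming the
    # standard mgr module option syntax, they can be tuned at runtime with
    # e.g.:
    #   ceph config set mgr mgr/pg_autoscaler/sleep_interval 120
    #   ceph config set mgr mgr/pg_autoscaler/threshold 2.0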

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super(PgAutoscaler, self).__init__(*args, **kwargs)
        self._shutdown = threading.Event()
        self._event: Dict[int, PgAdjustmentProgress] = {}

        # So much of what we do peeks at the osdmap that it's easiest
        # to just keep a copy of the pythonized version.
        self._osd_map = None
        if TYPE_CHECKING:
            self.sleep_interval = 60
            self.mon_target_pg_per_osd = 0
            self.threshold = 3.0
            self.noautoscale = False

    def config_notify(self) -> None:
        for opt in self.NATIVE_OPTIONS:
            setattr(self,
                    opt,
                    self.get_ceph_option(opt))
            self.log.debug(' native option %s = %s', opt, getattr(self, opt))
        for opt in self.MODULE_OPTIONS:
            setattr(self,
                    opt['name'],
                    self.get_module_option(opt['name']))
            self.log.debug(' mgr option %s = %s',
                           opt['name'], getattr(self, opt['name']))

    @CLIReadCommand('osd pool autoscale-status')
    def _command_autoscale_status(self, format: str = 'plain') -> Tuple[int, str, str]:
        """
        report on pool pg_num sizing recommendation and intent
        """
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools_by_name()
        ps, root_map = self._get_pool_status(osdmap, pools)

        if format in ('json', 'json-pretty'):
            return 0, json.dumps(ps, indent=4, sort_keys=True), ''
        else:
            table = PrettyTable(['POOL', 'SIZE', 'TARGET SIZE',
                                 'RATE', 'RAW CAPACITY',
                                 'RATIO', 'TARGET RATIO',
                                 'EFFECTIVE RATIO',
                                 'BIAS',
                                 'PG_NUM',
                                 # 'IDEAL',
                                 'NEW PG_NUM', 'AUTOSCALE',
                                 'BULK'],
                                border=False)
            table.left_padding_width = 0
            table.right_padding_width = 2
            table.align['POOL'] = 'l'
            table.align['SIZE'] = 'r'
            table.align['TARGET SIZE'] = 'r'
            table.align['RATE'] = 'r'
            table.align['RAW CAPACITY'] = 'r'
            table.align['RATIO'] = 'r'
            table.align['TARGET RATIO'] = 'r'
            table.align['EFFECTIVE RATIO'] = 'r'
            table.align['BIAS'] = 'r'
            table.align['PG_NUM'] = 'r'
            # table.align['IDEAL'] = 'r'
            table.align['NEW PG_NUM'] = 'r'
            table.align['AUTOSCALE'] = 'l'
            table.align['BULK'] = 'l'
            for p in ps:
                if p['would_adjust']:
                    final = str(p['pg_num_final'])
                else:
                    final = ''
                if p['target_bytes'] > 0:
                    ts = mgr_util.format_bytes(p['target_bytes'], 6)
                else:
                    ts = ''
                if p['target_ratio'] > 0.0:
                    tr = '%.4f' % p['target_ratio']
                else:
                    tr = ''
                if p['effective_target_ratio'] > 0.0:
                    etr = '%.4f' % p['effective_target_ratio']
                else:
                    etr = ''
                table.add_row([
                    p['pool_name'],
                    mgr_util.format_bytes(p['logical_used'], 6),
                    ts,
                    p['raw_used_rate'],
                    mgr_util.format_bytes(p['subtree_capacity'], 6),
                    '%.4f' % p['capacity_ratio'],
                    tr,
                    etr,
                    p['bias'],
                    p['pg_num_target'],
                    # p['pg_num_ideal'],
                    final,
                    p['pg_autoscale_mode'],
                    str(p['bulk'])
                ])
            return 0, table.get_string(), ''

    @CLIWriteCommand("osd pool set threshold")
    def set_scaling_threshold(self, num: float) -> Tuple[int, str, str]:
        """
        Set the autoscaler threshold, i.e. the factor by which the new
        PG_NUM must vary from the existing PG_NUM before it is accepted.
        """
        if num < 1.0:
            return 22, "", "threshold cannot be set less than 1.0"  # 22 = EINVAL
        self.set_module_option("threshold", num)
        return 0, "threshold updated", ""
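
    # Example (hypothetical values): with the default threshold of 3.0, a
    # pool whose pg_num_target is 32 is only adjusted once its ideal pg count
    # rises above 96 or drops below ~10.7; "ceph osd pool set threshold 2.0"
    # would narrow that band to 64 and 16.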

    def complete_all_progress_events(self) -> None:
        for pool_id in list(self._event):
            ev = self._event[pool_id]
            self.remote('progress', 'complete', ev.ev_id)
            del self._event[pool_id]

    def set_autoscale_mode_all_pools(self, status: str) -> None:
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools_by_name()
        for pool_name, _ in pools.items():
            self.mon_command({
                'prefix': 'osd pool set',
                'pool': pool_name,
                'var': 'pg_autoscale_mode',
                'val': status
            })

    @CLIWriteCommand("osd pool get noautoscale")
    def get_noautoscale(self) -> Tuple[int, str, str]:
        """
        Get the noautoscale flag, which reports whether the autoscaler
        is turned on or off for all pools (including pools created
        in the future).
        """

        if self.noautoscale is None:
            raise TypeError("noautoscale cannot be None")
        elif self.noautoscale:
            return 0, "", "noautoscale is on"
        else:
            return 0, "", "noautoscale is off"

    @CLIWriteCommand("osd pool unset noautoscale")
    def unset_noautoscale(self) -> Tuple[int, str, str]:
        """
        Unset the noautoscale flag so all pools will
        have autoscale enabled (including newly created
        pools in the future).
        """
        if not self.noautoscale:
            return 0, "", "noautoscale is already unset!"
        else:
            self.set_module_option("noautoscale", False)
            self.mon_command({
                'prefix': 'config set',
                'who': 'global',
                'name': 'osd_pool_default_pg_autoscale_mode',
                'value': 'on'
            })
            self.set_autoscale_mode_all_pools("on")
        return 0, "", "noautoscale is unset, all pools now have autoscale on"

    @CLIWriteCommand("osd pool set noautoscale")
    def set_noautoscale(self) -> Tuple[int, str, str]:
        """
        Set the noautoscale flag for all pools (including
        newly created pools in the future)
        and complete all ongoing progress events
        regarding PG autoscaling.
        """
        if self.noautoscale:
            return 0, "", "noautoscale is already set!"
        else:
            self.set_module_option("noautoscale", True)
            self.mon_command({
                'prefix': 'config set',
                'who': 'global',
                'name': 'osd_pool_default_pg_autoscale_mode',
                'value': 'off'
            })
            self.set_autoscale_mode_all_pools("off")
        self.complete_all_progress_events()
        return 0, "", "noautoscale is set, all pools now have autoscale off"

    def serve(self) -> None:
        self.config_notify()
        while not self._shutdown.is_set():
            self._maybe_adjust()
            self._update_progress_events()
            self._shutdown.wait(timeout=self.sleep_interval)

    def shutdown(self) -> None:
        self.log.info('Stopping pg_autoscaler')
        self._shutdown.set()

    def identify_subtrees_and_overlaps(self,
                                       osdmap: OSDMap,
                                       crush: CRUSHMap,
                                       result: Dict[int, CrushSubtreeResourceStatus],
                                       overlapped_roots: Set[int],
                                       roots: List[CrushSubtreeResourceStatus]) -> \
            Tuple[List[CrushSubtreeResourceStatus],
                  Set[int]]:

        # We identify subtrees and overlapping roots from the osdmap.
        for pool_id, pool in osdmap.get_pools().items():
            crush_rule = crush.get_rule_by_id(pool['crush_rule'])
            assert crush_rule is not None
            cr_name = crush_rule['rule_name']
            root_id = crush.get_rule_root(cr_name)
            assert root_id is not None
            osds = set(crush.get_osds_under(root_id))

            # Are there overlapping roots?
            s = None
            for prev_root_id, prev in result.items():
                if osds & prev.osds:
                    s = prev
                    if prev_root_id != root_id:
                        overlapped_roots.add(prev_root_id)
                        overlapped_roots.add(root_id)
                        self.log.error('pool %d has overlapping roots: %s',
                                       pool_id, overlapped_roots)
                    break
            if not s:
                s = CrushSubtreeResourceStatus()
                roots.append(s)
            result[root_id] = s
            s.root_ids.append(root_id)
            s.osds |= osds
            s.pool_ids.append(pool_id)
            s.pool_names.append(pool['pool_name'])
            s.pg_current += pool['pg_num_target'] * pool['size']
            target_ratio = pool['options'].get('target_size_ratio', 0.0)
            if target_ratio:
                s.total_target_ratio += target_ratio
            else:
                target_bytes = pool['options'].get('target_size_bytes', 0)
                if target_bytes:
                    s.total_target_bytes += target_bytes * osdmap.pool_raw_used_rate(pool_id)
        return roots, overlapped_roots
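
    # Overlap illustration (hypothetical topology): if one pool's rule is
    # rooted at "default" and another's at a bucket underneath it, their OSD
    # sets intersect, so both root ids end up in overlapped_roots and the
    # affected pools are later skipped by _get_pool_pg_targets(), since their
    # capacity cannot be partitioned cleanly between subtrees.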

    def get_subtree_resource_status(self,
                                    osdmap: OSDMap,
                                    crush: CRUSHMap) -> Tuple[Dict[int, CrushSubtreeResourceStatus],
                                                              Set[int]]:
        """
        For each CRUSH subtree of interest (i.e. the roots under which
        we have pools), calculate the current resource usages and targets,
        such as how many PGs there are, vs. how many PGs we would
        like there to be.
        """
        result: Dict[int, CrushSubtreeResourceStatus] = {}
        roots: List[CrushSubtreeResourceStatus] = []
        overlapped_roots: Set[int] = set()
        # identify subtrees and overlapping roots
        roots, overlapped_roots = self.identify_subtrees_and_overlaps(
            osdmap, crush, result, overlapped_roots, roots)
        # finish subtrees
        all_stats = self.get('osd_stats')
        for s in roots:
            assert s.osds is not None
            s.osd_count = len(s.osds)
            s.pg_target = s.osd_count * self.mon_target_pg_per_osd
            s.pg_left = s.pg_target
            s.pool_count = len(s.pool_ids)
            capacity = 0
            for osd_stats in all_stats['osd_stats']:
                if osd_stats['osd'] in s.osds:
                    # Intentionally do not apply the OSD's reweight to
                    # this, because we want to calculate PG counts based
                    # on the physical storage available, not how it is
                    # reweighted right now.
                    capacity += osd_stats['kb'] * 1024

            s.capacity = capacity
            self.log.debug('root_ids %s pools %s with %d osds, pg_target %d',
                           s.root_ids,
                           s.pool_ids,
                           s.osd_count,
                           s.pg_target)

        return result, overlapped_roots
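
    # For instance, a subtree of 10 OSDs with mon_target_pg_per_osd = 100
    # (the usual default) starts out with pg_target = pg_left = 1000 raw PGs
    # for all pools under that root to share.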

    def _calc_final_pg_target(
            self,
            p: Dict[str, Any],
            pool_name: str,
            root_map: Dict[int, CrushSubtreeResourceStatus],
            root_id: int,
            capacity_ratio: float,
            bias: float,
            even_pools: Dict[str, Dict[str, Any]],
            bulk_pools: Dict[str, Dict[str, Any]],
            func_pass: 'PassT',
            bulk: bool,
    ) -> Union[Tuple[float, int, int], Tuple[None, None, None]]:
        """
        `func_pass` determines the behaviour of the autoscaler: in the
        first pass we size the non-bulk pools by their capacity ratio;
        in the second pass we size the bulk pools whose used_ratio
        exceeds their even share (1 / pool_count); in the third pass the
        remaining (even) pools split what is left of the root's PG budget
        evenly.
        """
        if func_pass == 'first':
            # first pass to deal with small pools (no bulk flag)
            # calculating final_pg_target based on capacity ratio
            # we also keep track of bulk_pools to be used in second pass
            if not bulk:
                final_ratio = capacity_ratio
                pg_left = root_map[root_id].pg_left
                assert pg_left is not None
                used_pg = final_ratio * pg_left
                root_map[root_id].pg_left -= int(used_pg)
                root_map[root_id].pool_used += 1
                pool_pg_target = used_pg / p['size'] * bias
            else:
                bulk_pools[pool_name] = p
                return None, None, None

        elif func_pass == 'second':
            # second pass we calculate the final_pg_target
            # for pools that have used_ratio > even_ratio
            # and we keep track of even pools to be used in third pass
            pool_count = root_map[root_id].pool_count
            assert pool_count is not None
            even_ratio = 1 / (pool_count - root_map[root_id].pool_used)
            used_ratio = capacity_ratio

            if used_ratio > even_ratio:
                root_map[root_id].pool_used += 1
            else:
                even_pools[pool_name] = p
                return None, None, None

            final_ratio = max(used_ratio, even_ratio)
            pg_left = root_map[root_id].pg_left
            assert pg_left is not None
            used_pg = final_ratio * pg_left
            root_map[root_id].pg_left -= int(used_pg)
            pool_pg_target = used_pg / p['size'] * bias

        else:
            # third pass we just split the pg_left across all even_pools
            pool_count = root_map[root_id].pool_count
            assert pool_count is not None
            final_ratio = 1 / (pool_count - root_map[root_id].pool_used)
            pool_pg_target = (final_ratio * root_map[root_id].pg_left) / p['size'] * bias

        min_pg = p.get('options', {}).get('pg_num_min', PG_NUM_MIN)
        max_pg = p.get('options', {}).get('pg_num_max')
        final_pg_target = max(min_pg, nearest_power_of_two(pool_pg_target))
        if max_pg and max_pg < final_pg_target:
            final_pg_target = max_pg
        self.log.info("Pool '{0}' root_id {1} using {2} of space, bias {3}, "
                      "pg target {4} quantized to {5} (current {6})".format(
                          p['pool_name'],
                          root_id,
                          capacity_ratio,
                          bias,
                          pool_pg_target,
                          final_pg_target,
                          p['pg_num_target']
                      ))
        return final_ratio, pool_pg_target, final_pg_target
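
    # Worked example (made-up numbers): a root with pg_left = 1000 raw PGs and
    # three size-3, bias-1.0 pools: A (non-bulk, capacity_ratio 0.1),
    # B (bulk, 0.7), C (bulk, 0.2).
    #   first pass:  A takes 0.1 * 1000 = 100 raw PGs -> 100 / 3 ~ 33 -> 32;
    #                pg_left = 900, pool_used = 1; B and C go to bulk_pools.
    #   second pass: even_ratio = 1 / (3 - 1) = 0.5; B exceeds it and takes
    #                0.7 * 900 = 630 -> 210 -> 256; C is deferred to even_pools.
    #   third pass:  C gets 1 / (3 - 2) of the remaining 270 -> 90 -> 64,
    #                all subject to pg_num_min / pg_num_max clamping.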

    def _get_pool_pg_targets(
            self,
            osdmap: OSDMap,
            pools: Dict[str, Dict[str, Any]],
            crush_map: CRUSHMap,
            root_map: Dict[int, CrushSubtreeResourceStatus],
            pool_stats: Dict[int, Dict[str, int]],
            ret: List[Dict[str, Any]],
            threshold: float,
            func_pass: 'PassT',
            overlapped_roots: Set[int],
    ) -> Tuple[List[Dict[str, Any]], Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]]]:
        """
        Calculates the final_pg_target of each pool and determines if it
        needs scaling; this depends on the profile of the autoscaler. For
        scale-down, we start out with a full complement of pgs and only
        decrease them when other pools need more pgs due to increased usage.
        For scale-up, we start out with the minimal amount of pgs and only
        scale when there is an increase in usage.
        """
        even_pools: Dict[str, Dict[str, Any]] = {}
        bulk_pools: Dict[str, Dict[str, Any]] = {}
        for pool_name, p in pools.items():
            pool_id = p['pool']
            if pool_id not in pool_stats:
                # race with pool deletion; skip
                continue

            # FIXME: we assume there is only one take per pool, but that
            # may not be true.
            crush_rule = crush_map.get_rule_by_id(p['crush_rule'])
            assert crush_rule is not None
            cr_name = crush_rule['rule_name']
            root_id = crush_map.get_rule_root(cr_name)
            assert root_id is not None
            if root_id in overlapped_roots:
                # skip pools with overlapping roots
                self.log.warn("pool %d contains an overlapping root %d"
                              "... skipping scaling", pool_id, root_id)
                continue
            capacity = root_map[root_id].capacity
            assert capacity is not None
            if capacity == 0:
                self.log.debug('skipping empty subtree %s', cr_name)
                continue

            raw_used_rate = osdmap.pool_raw_used_rate(pool_id)

            pool_logical_used = pool_stats[pool_id]['stored']
            bias = p['options'].get('pg_autoscale_bias', 1.0)
            target_bytes = 0
            # ratio takes precedence if both are set
            if p['options'].get('target_size_ratio', 0.0) == 0.0:
                target_bytes = p['options'].get('target_size_bytes', 0)

            # What proportion of space are we using?
            actual_raw_used = pool_logical_used * raw_used_rate
            actual_capacity_ratio = float(actual_raw_used) / capacity

            pool_raw_used = max(pool_logical_used, target_bytes) * raw_used_rate
            capacity_ratio = float(pool_raw_used) / capacity

            self.log.info("effective_target_ratio {0} {1} {2} {3}".format(
                p['options'].get('target_size_ratio', 0.0),
                root_map[root_id].total_target_ratio,
                root_map[root_id].total_target_bytes,
                capacity))

            target_ratio = effective_target_ratio(p['options'].get('target_size_ratio', 0.0),
                                                  root_map[root_id].total_target_ratio,
                                                  root_map[root_id].total_target_bytes,
                                                  capacity)

            # determine if the pool is a bulk pool
            bulk = False
            flags = p['flags_names'].split(",")
            if "bulk" in flags:
                bulk = True

            capacity_ratio = max(capacity_ratio, target_ratio)
            final_ratio, pool_pg_target, final_pg_target = self._calc_final_pg_target(
                p, pool_name, root_map, root_id,
                capacity_ratio, bias, even_pools,
                bulk_pools, func_pass, bulk)

            if final_ratio is None:
                continue

            adjust = False
            if (final_pg_target > p['pg_num_target'] * threshold or
                final_pg_target < p['pg_num_target'] / threshold) and \
                    final_ratio >= 0.0 and \
                    final_ratio <= 1.0:
                adjust = True

            assert pool_pg_target is not None
            ret.append({
                'pool_id': pool_id,
                'pool_name': p['pool_name'],
                'crush_root_id': root_id,
                'pg_autoscale_mode': p['pg_autoscale_mode'],
                'pg_num_target': p['pg_num_target'],
                'logical_used': pool_logical_used,
                'target_bytes': target_bytes,
                'raw_used_rate': raw_used_rate,
                'subtree_capacity': capacity,
                'actual_raw_used': actual_raw_used,
                'raw_used': pool_raw_used,
                'actual_capacity_ratio': actual_capacity_ratio,
                'capacity_ratio': capacity_ratio,
                'target_ratio': p['options'].get('target_size_ratio', 0.0),
                'effective_target_ratio': target_ratio,
                'pg_num_ideal': int(pool_pg_target),
                'pg_num_final': final_pg_target,
                'would_adjust': adjust,
                'bias': p.get('options', {}).get('pg_autoscale_bias', 1.0),
                'bulk': bulk,
            })

        return ret, bulk_pools, even_pools
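
    # Note: a pool counts as "bulk" when its flags_names include "bulk"
    # (e.g. set via "ceph osd pool set <pool> bulk true"), which routes it
    # through the second and third passes above instead of the first.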

    def _get_pool_status(
            self,
            osdmap: OSDMap,
            pools: Dict[str, Dict[str, Any]],
    ) -> Tuple[List[Dict[str, Any]],
               Dict[int, CrushSubtreeResourceStatus]]:
        threshold = self.threshold
        assert threshold >= 1.0

        crush_map = osdmap.get_crush()
        root_map, overlapped_roots = self.get_subtree_resource_status(osdmap, crush_map)
        df = self.get('df')
        pool_stats = dict([(p['id'], p['stats']) for p in df['pools']])

        ret: List[Dict[str, Any]] = []

        # Iterate over all pools to determine how they should be sized.
        # The first call of _get_pool_pg_targets() finds/adjusts pools that use
        # more capacity than the even_ratio of other pools, and adjusts those
        # first. The second call works on the bulk_pools collected by the first
        # call, and the third call iterates over the even_pools collected by
        # the second call, giving them 1 / pool_count of the remaining pgs.

        ret, bulk_pools, _ = self._get_pool_pg_targets(osdmap, pools, crush_map, root_map,
                                                       pool_stats, ret, threshold, 'first', overlapped_roots)

        ret, _, even_pools = self._get_pool_pg_targets(osdmap, bulk_pools, crush_map, root_map,
                                                       pool_stats, ret, threshold, 'second', overlapped_roots)

        ret, _, _ = self._get_pool_pg_targets(osdmap, even_pools, crush_map, root_map,
                                              pool_stats, ret, threshold, 'third', overlapped_roots)

        return (ret, root_map)

    def _update_progress_events(self) -> None:
        if self.noautoscale:
            return
        osdmap = self.get_osdmap()
        pools = osdmap.get_pools()
        for pool_id in list(self._event):
            ev = self._event[pool_id]
            pool_data = pools.get(pool_id)
            if pool_data is None or pool_data['pg_num'] == pool_data['pg_num_target'] or ev.pg_num == ev.pg_num_target:
                # pool is gone or we've reached our target
                self.remote('progress', 'complete', ev.ev_id)
                del self._event[pool_id]
                continue
            ev.update(self, (ev.pg_num - pool_data['pg_num']) / (ev.pg_num - ev.pg_num_target))
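
        # Progress illustration: an event created at pg_num 32 with target 64
        # reports (32 - 48) / (32 - 64) = 0.5 once the pool's current pg_num
        # reaches 48.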

    def _maybe_adjust(self) -> None:
        if self.noautoscale:
            return
        self.log.info('_maybe_adjust')
        osdmap = self.get_osdmap()
        if osdmap.get_require_osd_release() < 'nautilus':
            return
        pools = osdmap.get_pools_by_name()
        self.log.debug("pool: {0}".format(json.dumps(pools, indent=4,
                                                     sort_keys=True)))
        ps, root_map = self._get_pool_status(osdmap, pools)

        # Anyone in 'warn', set the health message for them and then
        # drop them from consideration.
        too_few = []
        too_many = []
        bytes_and_ratio = []
        health_checks: Dict[str, Dict[str, Union[int, str, List[str]]]] = {}

        total_bytes = dict([(r, 0) for r in iter(root_map)])
        total_target_bytes = dict([(r, 0.0) for r in iter(root_map)])
        target_bytes_pools: Dict[int, List[str]] = dict([(r, []) for r in iter(root_map)])

        for p in ps:
            pool_id = p['pool_id']
            pool_opts = pools[p['pool_name']]['options']
            if pool_opts.get('target_size_ratio', 0) > 0 and pool_opts.get('target_size_bytes', 0) > 0:
                bytes_and_ratio.append(
                    'Pool %s has target_size_bytes and target_size_ratio set' % p['pool_name'])
            total_bytes[p['crush_root_id']] += max(
                p['actual_raw_used'],
                p['target_bytes'] * p['raw_used_rate'])
            if p['target_bytes'] > 0:
                total_target_bytes[p['crush_root_id']] += p['target_bytes'] * p['raw_used_rate']
                target_bytes_pools[p['crush_root_id']].append(p['pool_name'])
            if not p['would_adjust']:
                continue
            if p['pg_autoscale_mode'] == 'warn':
                msg = 'Pool %s has %d placement groups, should have %d' % (
                    p['pool_name'],
                    p['pg_num_target'],
                    p['pg_num_final'])
                if p['pg_num_final'] > p['pg_num_target']:
                    too_few.append(msg)
                else:
                    too_many.append(msg)

            if p['pg_autoscale_mode'] == 'on':
                # Note that setting pg_num actually sets pg_num_target (see
                # OSDMonitor.cc)
                r = self.mon_command({
                    'prefix': 'osd pool set',
                    'pool': p['pool_name'],
                    'var': 'pg_num',
                    'val': str(p['pg_num_final'])
                })

                # create new event or update existing one to reflect
                # progress from current state to the new pg_num_target
                pool_data = pools[p['pool_name']]
                pg_num = pool_data['pg_num']
                new_target = p['pg_num_final']
                if pool_id in self._event:
                    self._event[pool_id].reset(pg_num, new_target)
                else:
                    self._event[pool_id] = PgAdjustmentProgress(pool_id, pg_num, new_target)
                self._event[pool_id].update(self, 0.0)

                if r[0] != 0:
                    # FIXME: this is a serious and unexpected thing,
                    # we should expose it as a cluster log error once
                    # the hook for doing that from ceph-mgr modules is
                    # in.
                    self.log.error("pg_num adjustment on {0} to {1} failed: {2}"
                                   .format(p['pool_name'],
                                           p['pg_num_final'], r))

        if too_few:
            summary = "{0} pools have too few placement groups".format(
                len(too_few))
            health_checks['POOL_TOO_FEW_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'count': len(too_few),
                'detail': too_few
            }
        if too_many:
            summary = "{0} pools have too many placement groups".format(
                len(too_many))
            health_checks['POOL_TOO_MANY_PGS'] = {
                'severity': 'warning',
                'summary': summary,
                'count': len(too_many),
                'detail': too_many
            }

        too_much_target_bytes = []
        for root_id, total in total_bytes.items():
            total_target = int(total_target_bytes[root_id])
            capacity = root_map[root_id].capacity
            assert capacity is not None
            if total_target > 0 and total > capacity and capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'target_size_bytes %s on pools %s' % (
                        root_map[root_id].pool_names,
                        total / capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                        target_bytes_pools[root_id]
                    )
                )
            elif total_target > capacity and capacity:
                too_much_target_bytes.append(
                    'Pools %s overcommit available storage by %.03fx due to '
                    'collective target_size_bytes of %s' % (
                        root_map[root_id].pool_names,
                        total / capacity,
                        mgr_util.format_bytes(total_target, 5, colored=False),
                    )
                )
        if too_much_target_bytes:
            health_checks['POOL_TARGET_SIZE_BYTES_OVERCOMMITTED'] = {
                'severity': 'warning',
                'summary': "%d subtrees have overcommitted pool target_size_bytes" % len(too_much_target_bytes),
                'count': len(too_much_target_bytes),
                'detail': too_much_target_bytes,
            }

        if bytes_and_ratio:
            health_checks['POOL_HAS_TARGET_SIZE_BYTES_AND_RATIO'] = {
                'severity': 'warning',
                'summary': "%d pools have both target_size_bytes and target_size_ratio set" % len(bytes_and_ratio),
                'count': len(bytes_and_ratio),
                'detail': bytes_and_ratio,
            }

        self.set_health_checks(health_checks)