]> git.proxmox.com Git - ceph.git/blob - ceph/src/crush/CrushWrapper.cc
update sources to v12.1.4
[ceph.git] / ceph / src / crush / CrushWrapper.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "osd/osd_types.h"
5 #include "common/debug.h"
6 #include "common/Formatter.h"
7 #include "common/errno.h"
8 #include "common/TextTable.h"
9 #include "include/stringify.h"
10
11 #include "CrushWrapper.h"
12 #include "CrushTreeDumper.h"
13
14 #define dout_subsys ceph_subsys_crush
15
16 bool CrushWrapper::has_legacy_rulesets() const
17 {
18 for (unsigned i=0; i<crush->max_rules; i++) {
19 crush_rule *r = crush->rules[i];
20 if (r &&
21 r->mask.ruleset != i) {
22 return true;
23 }
24 }
25 return false;
26 }
27
28 int CrushWrapper::renumber_rules_by_ruleset()
29 {
30 int max_ruleset = 0;
31 for (unsigned i=0; i<crush->max_rules; i++) {
32 crush_rule *r = crush->rules[i];
33 if (r && r->mask.ruleset >= max_ruleset) {
34 max_ruleset = r->mask.ruleset + 1;
35 }
36 }
37 struct crush_rule **newrules =
38 (crush_rule**)calloc(1, max_ruleset * sizeof(crush_rule*));
39 for (unsigned i=0; i<crush->max_rules; i++) {
40 crush_rule *r = crush->rules[i];
41 if (!r)
42 continue;
43 if (newrules[r->mask.ruleset]) {
44 // collision, we can't do it.
45 free(newrules);
46 return -EINVAL;
47 }
48 newrules[r->mask.ruleset] = r;
49 }
50
51 // success, swap!
52 free(crush->rules);
53 crush->rules = newrules;
54 crush->max_rules = max_ruleset;
55 return 0;
56 }
57
58 bool CrushWrapper::has_multirule_rulesets() const
59 {
60 for (unsigned i=0; i<crush->max_rules; i++) {
61 crush_rule *r = crush->rules[i];
62 if (!r)
63 continue;
64 for (unsigned j=i+1; j<crush->max_rules; j++) {
65 crush_rule *s = crush->rules[j];
66 if (!s)
67 continue;
68 if (r->mask.ruleset == s->mask.ruleset)
69 return true;
70 }
71 }
72 return false;
73 }
74
75 bool CrushWrapper::has_non_straw2_buckets() const
76 {
77 for (int i=0; i<crush->max_buckets; ++i) {
78 crush_bucket *b = crush->buckets[i];
79 if (!b)
80 continue;
81 if (b->alg != CRUSH_BUCKET_STRAW2)
82 return true;
83 }
84 return false;
85 }
86
87 bool CrushWrapper::has_v2_rules() const
88 {
89 for (unsigned i=0; i<crush->max_rules; i++) {
90 if (is_v2_rule(i)) {
91 return true;
92 }
93 }
94 return false;
95 }
96
97 bool CrushWrapper::is_v2_rule(unsigned ruleid) const
98 {
99 // check rule for use of indep or new SET_* rule steps
100 if (ruleid >= crush->max_rules)
101 return false;
102 crush_rule *r = crush->rules[ruleid];
103 if (!r)
104 return false;
105 for (unsigned j=0; j<r->len; j++) {
106 if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP ||
107 r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP ||
108 r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES ||
109 r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES) {
110 return true;
111 }
112 }
113 return false;
114 }
115
116 bool CrushWrapper::has_v3_rules() const
117 {
118 for (unsigned i=0; i<crush->max_rules; i++) {
119 if (is_v3_rule(i)) {
120 return true;
121 }
122 }
123 return false;
124 }
125
126 bool CrushWrapper::is_v3_rule(unsigned ruleid) const
127 {
128 // check rule for use of SET_CHOOSELEAF_VARY_R step
129 if (ruleid >= crush->max_rules)
130 return false;
131 crush_rule *r = crush->rules[ruleid];
132 if (!r)
133 return false;
134 for (unsigned j=0; j<r->len; j++) {
135 if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R) {
136 return true;
137 }
138 }
139 return false;
140 }
141
142 bool CrushWrapper::has_v4_buckets() const
143 {
144 for (int i=0; i<crush->max_buckets; ++i) {
145 crush_bucket *b = crush->buckets[i];
146 if (!b)
147 continue;
148 if (b->alg == CRUSH_BUCKET_STRAW2)
149 return true;
150 }
151 return false;
152 }
153
154 bool CrushWrapper::has_v5_rules() const
155 {
156 for (unsigned i=0; i<crush->max_rules; i++) {
157 if (is_v5_rule(i)) {
158 return true;
159 }
160 }
161 return false;
162 }
163
164 bool CrushWrapper::is_v5_rule(unsigned ruleid) const
165 {
166 // check rule for use of SET_CHOOSELEAF_STABLE step
167 if (ruleid >= crush->max_rules)
168 return false;
169 crush_rule *r = crush->rules[ruleid];
170 if (!r)
171 return false;
172 for (unsigned j=0; j<r->len; j++) {
173 if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_STABLE) {
174 return true;
175 }
176 }
177 return false;
178 }
179
180 bool CrushWrapper::has_choose_args() const
181 {
182 return !choose_args.empty();
183 }
184
185 bool CrushWrapper::has_incompat_choose_args() const
186 {
187 if (choose_args.empty())
188 return false;
189 if (choose_args.size() > 1)
190 return true;
191 if (choose_args.begin()->first != DEFAULT_CHOOSE_ARGS)
192 return true;
193 crush_choose_arg_map arg_map = choose_args.begin()->second;
194 for (__u32 i = 0; i < arg_map.size; i++) {
195 crush_choose_arg *arg = &arg_map.args[i];
196 if (arg->weight_set_size == 0 &&
197 arg->ids_size == 0)
198 continue;
199 if (arg->weight_set_size != 1)
200 return true;
201 if (arg->ids_size != 0)
202 return true;
203 }
204 return false;
205 }
206
207 int CrushWrapper::split_id_class(int i, int *idout, int *classout) const
208 {
209 if (!item_exists(i))
210 return -EINVAL;
211 string name = get_item_name(i);
212 size_t pos = name.find("~");
213 if (pos == string::npos) {
214 *idout = i;
215 *classout = -1;
216 return 0;
217 }
218 string name_no_class = name.substr(0, pos);
219 if (!name_exists(name_no_class))
220 return -ENOENT;
221 string class_name = name.substr(pos + 1);
222 if (!class_exists(class_name))
223 return -ENOENT;
224 *idout = get_item_id(name_no_class);
225 *classout = get_class_id(class_name);
226 return 0;
227 }
228
229 int CrushWrapper::can_rename_item(const string& srcname,
230 const string& dstname,
231 ostream *ss) const
232 {
233 if (name_exists(srcname)) {
234 if (name_exists(dstname)) {
235 *ss << "dstname = '" << dstname << "' already exists";
236 return -EEXIST;
237 }
238 if (is_valid_crush_name(dstname)) {
239 return 0;
240 } else {
241 *ss << "dstname = '" << dstname << "' does not match [-_.0-9a-zA-Z]+";
242 return -EINVAL;
243 }
244 } else {
245 if (name_exists(dstname)) {
246 *ss << "srcname = '" << srcname << "' does not exist "
247 << "and dstname = '" << dstname << "' already exists";
248 return -EALREADY;
249 } else {
250 *ss << "srcname = '" << srcname << "' does not exist";
251 return -ENOENT;
252 }
253 }
254 }
255
256 int CrushWrapper::rename_item(const string& srcname,
257 const string& dstname,
258 ostream *ss)
259 {
260 int ret = can_rename_item(srcname, dstname, ss);
261 if (ret < 0)
262 return ret;
263 int oldid = get_item_id(srcname);
264 return set_item_name(oldid, dstname);
265 }
266
267 int CrushWrapper::can_rename_bucket(const string& srcname,
268 const string& dstname,
269 ostream *ss) const
270 {
271 int ret = can_rename_item(srcname, dstname, ss);
272 if (ret)
273 return ret;
274 int srcid = get_item_id(srcname);
275 if (srcid >= 0) {
276 *ss << "srcname = '" << srcname << "' is not a bucket "
277 << "because its id = " << srcid << " is >= 0";
278 return -ENOTDIR;
279 }
280 return 0;
281 }
282
283 int CrushWrapper::rename_bucket(const string& srcname,
284 const string& dstname,
285 ostream *ss)
286 {
287 int ret = can_rename_bucket(srcname, dstname, ss);
288 if (ret < 0)
289 return ret;
290 int oldid = get_item_id(srcname);
291 return set_item_name(oldid, dstname);
292 }
293
294 void CrushWrapper::find_takes(set<int>& roots) const
295 {
296 for (unsigned i=0; i<crush->max_rules; i++) {
297 crush_rule *r = crush->rules[i];
298 if (!r)
299 continue;
300 for (unsigned j=0; j<r->len; j++) {
301 if (r->steps[j].op == CRUSH_RULE_TAKE)
302 roots.insert(r->steps[j].arg1);
303 }
304 }
305 }
306
307 void CrushWrapper::find_roots(set<int>& roots) const
308 {
309 for (int i = 0; i < crush->max_buckets; i++) {
310 if (!crush->buckets[i])
311 continue;
312 crush_bucket *b = crush->buckets[i];
313 if (!_search_item_exists(b->id))
314 roots.insert(b->id);
315 }
316 }
317
318 bool CrushWrapper::subtree_contains(int root, int item) const
319 {
320 if (root == item)
321 return true;
322
323 if (root >= 0)
324 return false; // root is a leaf
325
326 const crush_bucket *b = get_bucket(root);
327 if (IS_ERR(b))
328 return false;
329
330 for (unsigned j=0; j<b->size; j++) {
331 if (subtree_contains(b->items[j], item))
332 return true;
333 }
334 return false;
335 }
336
337 bool CrushWrapper::_maybe_remove_last_instance(CephContext *cct, int item, bool unlink_only)
338 {
339 // last instance?
340 if (_search_item_exists(item)) {
341 return false;
342 }
343 if (item < 0 && _bucket_is_in_use(item)) {
344 return false;
345 }
346
347 if (item < 0 && !unlink_only) {
348 crush_bucket *t = get_bucket(item);
349 ldout(cct, 5) << "_maybe_remove_last_instance removing bucket " << item << dendl;
350 crush_remove_bucket(crush, t);
351 if (class_bucket.count(item) != 0)
352 class_bucket.erase(item);
353 class_remove_item(item);
354 }
355 if ((item >= 0 || !unlink_only) && name_map.count(item)) {
356 ldout(cct, 5) << "_maybe_remove_last_instance removing name for item " << item << dendl;
357 name_map.erase(item);
358 have_rmaps = false;
359 if (item >= 0 && !unlink_only) {
360 class_remove_item(item);
361 }
362 }
363 rebuild_roots_with_classes();
364 return true;
365 }
366
367 int CrushWrapper::remove_root(int item)
368 {
369 crush_bucket *b = get_bucket(item);
370 if (IS_ERR(b)) {
371 // should be idempotent
372 // e.g.: we use 'crush link' to link same host into
373 // different roots, which as a result can cause different
374 // shadow trees reference same hosts too. This means
375 // we may need to destory the same buckets(hosts, racks, etc.)
376 // multiple times during rebuilding all shadow trees.
377 return 0;
378 }
379
380 for (unsigned n = 0; n < b->size; n++) {
381 if (b->items[n] >= 0)
382 continue;
383 int r = remove_root(b->items[n]);
384 if (r < 0)
385 return r;
386 }
387
388 crush_remove_bucket(crush, b);
389 if (name_map.count(item) != 0) {
390 name_map.erase(item);
391 have_rmaps = false;
392 }
393 if (class_bucket.count(item) != 0)
394 class_bucket.erase(item);
395 class_remove_item(item);
396 return 0;
397 }
398
399 int CrushWrapper::remove_item(CephContext *cct, int item, bool unlink_only)
400 {
401 ldout(cct, 5) << "remove_item " << item
402 << (unlink_only ? " unlink_only":"") << dendl;
403
404 int ret = -ENOENT;
405
406 if (item < 0 && !unlink_only) {
407 crush_bucket *t = get_bucket(item);
408 if (IS_ERR(t)) {
409 ldout(cct, 1) << "remove_item bucket " << item << " does not exist"
410 << dendl;
411 return -ENOENT;
412 }
413
414 if (t->size) {
415 ldout(cct, 1) << "remove_item bucket " << item << " has " << t->size
416 << " items, not empty" << dendl;
417 return -ENOTEMPTY;
418 }
419 if (_bucket_is_in_use(item)) {
420 return -EBUSY;
421 }
422 }
423
424 for (int i = 0; i < crush->max_buckets; i++) {
425 if (!crush->buckets[i])
426 continue;
427 crush_bucket *b = crush->buckets[i];
428
429 for (unsigned i=0; i<b->size; ++i) {
430 int id = b->items[i];
431 if (id == item) {
432 ldout(cct, 5) << "remove_item removing item " << item
433 << " from bucket " << b->id << dendl;
434 for (auto& p : choose_args) {
435 // weight down each weight-set to 0 before we remove the item
436 vector<int> weightv(get_choose_args_positions(p.second), 0);
437 choose_args_adjust_item_weight(cct, p.second, item, weightv, nullptr);
438 }
439 bucket_remove_item(b, item);
440 adjust_item_weight(cct, b->id, b->weight);
441 ret = 0;
442 }
443 }
444 }
445
446 if (_maybe_remove_last_instance(cct, item, unlink_only))
447 ret = 0;
448
449 return ret;
450 }
451
452 bool CrushWrapper::_search_item_exists(int item) const
453 {
454 for (int i = 0; i < crush->max_buckets; i++) {
455 if (!crush->buckets[i])
456 continue;
457 crush_bucket *b = crush->buckets[i];
458 for (unsigned j=0; j<b->size; ++j) {
459 if (b->items[j] == item)
460 return true;
461 }
462 }
463 return false;
464 }
465
466 bool CrushWrapper::_bucket_is_in_use(int item)
467 {
468 for (auto &i : class_bucket)
469 for (auto &j : i.second)
470 if (j.second == item)
471 return true;
472 for (unsigned i = 0; i < crush->max_rules; ++i) {
473 crush_rule *r = crush->rules[i];
474 if (!r)
475 continue;
476 for (unsigned j = 0; j < r->len; ++j) {
477 if (r->steps[j].op == CRUSH_RULE_TAKE) {
478 int step_item = r->steps[j].arg1;
479 int original_item;
480 int c;
481 int res = split_id_class(step_item, &original_item, &c);
482 if (res < 0)
483 return false;
484 if (step_item == item || original_item == item)
485 return true;
486 }
487 }
488 }
489 return false;
490 }
491
492 int CrushWrapper::_remove_item_under(
493 CephContext *cct, int item, int ancestor, bool unlink_only)
494 {
495 ldout(cct, 5) << "_remove_item_under " << item << " under " << ancestor
496 << (unlink_only ? " unlink_only":"") << dendl;
497
498 if (ancestor >= 0) {
499 return -EINVAL;
500 }
501
502 if (!bucket_exists(ancestor))
503 return -EINVAL;
504
505 int ret = -ENOENT;
506
507 crush_bucket *b = get_bucket(ancestor);
508 for (unsigned i=0; i<b->size; ++i) {
509 int id = b->items[i];
510 if (id == item) {
511 ldout(cct, 5) << "_remove_item_under removing item " << item
512 << " from bucket " << b->id << dendl;
513 for (auto& p : choose_args) {
514 // weight down each weight-set to 0 before we remove the item
515 vector<int> weightv(get_choose_args_positions(p.second), 0);
516 _choose_args_adjust_item_weight_in_bucket(
517 cct, p.second, b->id, item, weightv, nullptr);
518 }
519 bucket_remove_item(b, item);
520 adjust_item_weight(cct, b->id, b->weight);
521 ret = 0;
522 } else if (id < 0) {
523 int r = remove_item_under(cct, item, id, unlink_only);
524 if (r == 0)
525 ret = 0;
526 }
527 }
528 return ret;
529 }
530
531 int CrushWrapper::remove_item_under(
532 CephContext *cct, int item, int ancestor, bool unlink_only)
533 {
534 ldout(cct, 5) << "remove_item_under " << item << " under " << ancestor
535 << (unlink_only ? " unlink_only":"") << dendl;
536
537 if (!unlink_only && _bucket_is_in_use(item)) {
538 return -EBUSY;
539 }
540
541 int ret = _remove_item_under(cct, item, ancestor, unlink_only);
542 if (ret < 0)
543 return ret;
544
545 if (item < 0 && !unlink_only) {
546 crush_bucket *t = get_bucket(item);
547 if (IS_ERR(t)) {
548 ldout(cct, 1) << "remove_item_under bucket " << item
549 << " does not exist" << dendl;
550 return -ENOENT;
551 }
552
553 if (t->size) {
554 ldout(cct, 1) << "remove_item_under bucket " << item << " has " << t->size
555 << " items, not empty" << dendl;
556 return -ENOTEMPTY;
557 }
558 }
559
560 if (_maybe_remove_last_instance(cct, item, unlink_only))
561 ret = 0;
562
563 return ret;
564 }
565
566 int CrushWrapper::get_common_ancestor_distance(CephContext *cct, int id,
567 const std::multimap<string,string>& loc)
568 {
569 ldout(cct, 5) << __func__ << " " << id << " " << loc << dendl;
570 if (!item_exists(id))
571 return -ENOENT;
572 map<string,string> id_loc = get_full_location(id);
573 ldout(cct, 20) << " id is at " << id_loc << dendl;
574
575 for (map<int,string>::const_iterator p = type_map.begin();
576 p != type_map.end();
577 ++p) {
578 map<string,string>::iterator ip = id_loc.find(p->second);
579 if (ip == id_loc.end())
580 continue;
581 for (std::multimap<string,string>::const_iterator q = loc.find(p->second);
582 q != loc.end();
583 ++q) {
584 if (q->first != p->second)
585 break;
586 if (q->second == ip->second)
587 return p->first;
588 }
589 }
590 return -ERANGE;
591 }
592
593 int CrushWrapper::parse_loc_map(const std::vector<string>& args,
594 std::map<string,string> *ploc)
595 {
596 ploc->clear();
597 for (unsigned i = 0; i < args.size(); ++i) {
598 const char *s = args[i].c_str();
599 const char *pos = strchr(s, '=');
600 if (!pos)
601 return -EINVAL;
602 string key(s, 0, pos-s);
603 string value(pos+1);
604 if (value.length())
605 (*ploc)[key] = value;
606 else
607 return -EINVAL;
608 }
609 return 0;
610 }
611
612 int CrushWrapper::parse_loc_multimap(const std::vector<string>& args,
613 std::multimap<string,string> *ploc)
614 {
615 ploc->clear();
616 for (unsigned i = 0; i < args.size(); ++i) {
617 const char *s = args[i].c_str();
618 const char *pos = strchr(s, '=');
619 if (!pos)
620 return -EINVAL;
621 string key(s, 0, pos-s);
622 string value(pos+1);
623 if (value.length())
624 ploc->insert(make_pair(key, value));
625 else
626 return -EINVAL;
627 }
628 return 0;
629 }
630
631 bool CrushWrapper::check_item_loc(CephContext *cct, int item, const map<string,string>& loc,
632 int *weight)
633 {
634 ldout(cct, 5) << "check_item_loc item " << item << " loc " << loc << dendl;
635
636 for (map<int,string>::const_iterator p = type_map.begin(); p != type_map.end(); ++p) {
637 // ignore device
638 if (p->first == 0)
639 continue;
640
641 // ignore types that aren't specified in loc
642 map<string,string>::const_iterator q = loc.find(p->second);
643 if (q == loc.end()) {
644 ldout(cct, 2) << "warning: did not specify location for '" << p->second << "' level (levels are "
645 << type_map << ")" << dendl;
646 continue;
647 }
648
649 if (!name_exists(q->second)) {
650 ldout(cct, 5) << "check_item_loc bucket " << q->second << " dne" << dendl;
651 return false;
652 }
653
654 int id = get_item_id(q->second);
655 if (id >= 0) {
656 ldout(cct, 5) << "check_item_loc requested " << q->second << " for type " << p->second
657 << " is a device, not bucket" << dendl;
658 return false;
659 }
660
661 assert(bucket_exists(id));
662 crush_bucket *b = get_bucket(id);
663
664 // see if item exists in this bucket
665 for (unsigned j=0; j<b->size; j++) {
666 if (b->items[j] == item) {
667 ldout(cct, 2) << "check_item_loc " << item << " exists in bucket " << b->id << dendl;
668 if (weight)
669 *weight = crush_get_bucket_item_weight(b, j);
670 return true;
671 }
672 }
673 return false;
674 }
675
676 ldout(cct, 1) << "check_item_loc item " << item << " loc " << loc << dendl;
677 return false;
678 }
679
680 map<string, string> CrushWrapper::get_full_location(int id)
681 {
682 vector<pair<string, string> > full_location_ordered;
683 map<string,string> full_location;
684
685 get_full_location_ordered(id, full_location_ordered);
686
687 std::copy(full_location_ordered.begin(),
688 full_location_ordered.end(),
689 std::inserter(full_location, full_location.begin()));
690
691 return full_location;
692 }
693
694 int CrushWrapper::get_full_location_ordered(int id, vector<pair<string, string> >& path)
695 {
696 if (!item_exists(id))
697 return -ENOENT;
698 int cur = id;
699 int ret;
700 while (true) {
701 pair<string, string> parent_coord = get_immediate_parent(cur, &ret);
702 if (ret != 0)
703 break;
704 path.push_back(parent_coord);
705 cur = get_item_id(parent_coord.second);
706 }
707 return 0;
708 }
709
710 string CrushWrapper::get_full_location_ordered_string(int id)
711 {
712 vector<pair<string, string> > full_location_ordered;
713 string full_location;
714 get_full_location_ordered(id, full_location_ordered);
715 reverse(begin(full_location_ordered), end(full_location_ordered));
716 for(auto i = full_location_ordered.begin(); i != full_location_ordered.end(); i++) {
717 full_location = full_location + i->first + "=" + i->second;
718 if (i != full_location_ordered.end() - 1) {
719 full_location = full_location + ",";
720 }
721 }
722 return full_location;
723 }
724
725 map<int, string> CrushWrapper::get_parent_hierarchy(int id)
726 {
727 map<int,string> parent_hierarchy;
728 pair<string, string> parent_coord = get_immediate_parent(id);
729 int parent_id;
730
731 // get the integer type for id and create a counter from there
732 int type_counter = get_bucket_type(id);
733
734 // if we get a negative type then we can assume that we have an OSD
735 // change behavior in get_item_type FIXME
736 if (type_counter < 0)
737 type_counter = 0;
738
739 // read the type map and get the name of the type with the largest ID
740 int high_type = 0;
741 for (map<int, string>::iterator it = type_map.begin(); it != type_map.end(); ++it){
742 if ( (*it).first > high_type )
743 high_type = (*it).first;
744 }
745
746 parent_id = get_item_id(parent_coord.second);
747
748 while (type_counter < high_type) {
749 type_counter++;
750 parent_hierarchy[ type_counter ] = parent_coord.first;
751
752 if (type_counter < high_type){
753 // get the coordinate information for the next parent
754 parent_coord = get_immediate_parent(parent_id);
755 parent_id = get_item_id(parent_coord.second);
756 }
757 }
758
759 return parent_hierarchy;
760 }
761
762 int CrushWrapper::get_children(int id, list<int> *children)
763 {
764 // leaf?
765 if (id >= 0) {
766 return 0;
767 }
768
769 crush_bucket *b = get_bucket(id);
770 if (IS_ERR(b)) {
771 return -ENOENT;
772 }
773
774 for (unsigned n=0; n<b->size; n++) {
775 children->push_back(b->items[n]);
776 }
777 return b->size;
778 }
779
780 int CrushWrapper::_get_leaves(int id, list<int> *leaves)
781 {
782 assert(leaves);
783
784 // Already leaf?
785 if (id >= 0) {
786 leaves->push_back(id);
787 return 0;
788 }
789
790 crush_bucket *b = get_bucket(id);
791 if (IS_ERR(b)) {
792 return -ENOENT;
793 }
794
795 for (unsigned n = 0; n < b->size; n++) {
796 if (b->items[n] >= 0) {
797 leaves->push_back(b->items[n]);
798 } else {
799 // is a bucket, do recursive call
800 int r = _get_leaves(b->items[n], leaves);
801 if (r < 0) {
802 return r;
803 }
804 }
805 }
806
807 return 0; // all is well
808 }
809
810 int CrushWrapper::get_leaves(const string &name, set<int> *leaves)
811 {
812 assert(leaves);
813 leaves->clear();
814
815 if (!name_exists(name)) {
816 return -ENOENT;
817 }
818
819 int id = get_item_id(name);
820 if (id >= 0) {
821 // already leaf
822 leaves->insert(id);
823 return 0;
824 }
825
826 list<int> unordered;
827 int r = _get_leaves(id, &unordered);
828 if (r < 0) {
829 return r;
830 }
831
832 for (auto &p : unordered) {
833 leaves->insert(p);
834 }
835
836 return 0;
837 }
838
839 int CrushWrapper::insert_item(
840 CephContext *cct, int item, float weight, string name,
841 const map<string,string>& loc) // typename -> bucketname
842 {
843 ldout(cct, 5) << "insert_item item " << item << " weight " << weight
844 << " name " << name << " loc " << loc << dendl;
845
846 if (!is_valid_crush_name(name))
847 return -EINVAL;
848
849 if (!is_valid_crush_loc(cct, loc))
850 return -EINVAL;
851
852 int r = validate_weightf(weight);
853 if (r < 0) {
854 return r;
855 }
856
857 if (name_exists(name)) {
858 if (get_item_id(name) != item) {
859 ldout(cct, 10) << "device name '" << name << "' already exists as id "
860 << get_item_id(name) << dendl;
861 return -EEXIST;
862 }
863 } else {
864 set_item_name(item, name);
865 }
866
867 int cur = item;
868
869 // create locations if locations don't exist and add child in
870 // location with 0 weight the more detail in the insert_item method
871 // declaration in CrushWrapper.h
872 for (auto p = type_map.begin(); p != type_map.end(); ++p) {
873 // ignore device type
874 if (p->first == 0)
875 continue;
876
877 // skip types that are unspecified
878 map<string,string>::const_iterator q = loc.find(p->second);
879 if (q == loc.end()) {
880 ldout(cct, 2) << "warning: did not specify location for '"
881 << p->second << "' level (levels are "
882 << type_map << ")" << dendl;
883 continue;
884 }
885
886 if (!name_exists(q->second)) {
887 ldout(cct, 5) << "insert_item creating bucket " << q->second << dendl;
888 int empty = 0, newid;
889 int r = add_bucket(0, 0,
890 CRUSH_HASH_DEFAULT, p->first, 1, &cur, &empty, &newid);
891 if (r < 0) {
892 ldout(cct, 1) << "add_bucket failure error: " << cpp_strerror(r)
893 << dendl;
894 return r;
895 }
896 set_item_name(newid, q->second);
897
898 cur = newid;
899 continue;
900 }
901
902 // add to an existing bucket
903 int id = get_item_id(q->second);
904 if (!bucket_exists(id)) {
905 ldout(cct, 1) << "insert_item doesn't have bucket " << id << dendl;
906 return -EINVAL;
907 }
908
909 // check that we aren't creating a cycle.
910 if (subtree_contains(id, cur)) {
911 ldout(cct, 1) << "insert_item item " << cur << " already exists beneath "
912 << id << dendl;
913 return -EINVAL;
914 }
915
916 // we have done sanity check above
917 crush_bucket *b = get_bucket(id);
918
919 if (p->first != b->type) {
920 ldout(cct, 1) << "insert_item existing bucket has type "
921 << "'" << type_map[b->type] << "' != "
922 << "'" << type_map[p->first] << "'" << dendl;
923 return -EINVAL;
924 }
925
926 // are we forming a loop?
927 if (subtree_contains(cur, b->id)) {
928 ldout(cct, 1) << "insert_item " << cur << " already contains " << b->id
929 << "; cannot form loop" << dendl;
930 return -ELOOP;
931 }
932
933 ldout(cct, 5) << "insert_item adding " << cur << " weight " << weight
934 << " to bucket " << id << dendl;
935 int r = bucket_add_item(b, cur, 0);
936 assert (!r);
937 break;
938 }
939
940 // adjust the item's weight in location
941 if (adjust_item_weightf_in_loc(cct, item, weight, loc) > 0) {
942 if (item >= crush->max_devices) {
943 crush->max_devices = item + 1;
944 ldout(cct, 5) << "insert_item max_devices now " << crush->max_devices
945 << dendl;
946 }
947 r = rebuild_roots_with_classes();
948 if (r < 0) {
949 ldout(cct, 0) << __func__ << " unable to rebuild roots with classes: "
950 << cpp_strerror(r) << dendl;
951 return r;
952 }
953 return 0;
954 }
955
956 ldout(cct, 1) << "error: didn't find anywhere to add item " << item
957 << " in " << loc << dendl;
958 return -EINVAL;
959 }
960
961
962 int CrushWrapper::move_bucket(
963 CephContext *cct, int id, const map<string,string>& loc)
964 {
965 // sorry this only works for buckets
966 if (id >= 0)
967 return -EINVAL;
968
969 if (!item_exists(id))
970 return -ENOENT;
971
972 // get the name of the bucket we are trying to move for later
973 string id_name = get_item_name(id);
974
975 // detach the bucket
976 int bucket_weight = detach_bucket(cct, id);
977
978 // insert the bucket back into the hierarchy
979 return insert_item(cct, id, bucket_weight / (float)0x10000, id_name, loc);
980 }
981
982 int CrushWrapper::detach_bucket(CephContext *cct, int item)
983 {
984 if (!crush)
985 return (-EINVAL);
986
987 if (item >= 0)
988 return (-EINVAL);
989
990 // check that the bucket that we want to detach exists
991 assert(bucket_exists(item));
992
993 // get the bucket's weight
994 crush_bucket *b = get_bucket(item);
995 unsigned bucket_weight = b->weight;
996
997 // get where the bucket is located
998 pair<string, string> bucket_location = get_immediate_parent(item);
999
1000 // get the id of the parent bucket
1001 int parent_id = get_item_id(bucket_location.second);
1002
1003 // get the parent bucket
1004 crush_bucket *parent_bucket = get_bucket(parent_id);
1005
1006 if (!IS_ERR(parent_bucket)) {
1007 // zero out the bucket weight
1008 bucket_adjust_item_weight(cct, parent_bucket, item, 0);
1009 adjust_item_weight(cct, parent_bucket->id, parent_bucket->weight);
1010 for (auto& p : choose_args) {
1011 // weight down each weight-set to 0 before we remove the item
1012 vector<int> weightv(get_choose_args_positions(p.second), 0);
1013 choose_args_adjust_item_weight(cct, p.second, item, weightv, nullptr);
1014 }
1015
1016 // remove the bucket from the parent
1017 bucket_remove_item(parent_bucket, item);
1018 } else if (PTR_ERR(parent_bucket) != -ENOENT) {
1019 return PTR_ERR(parent_bucket);
1020 }
1021
1022 // check that we're happy
1023 int test_weight = 0;
1024 map<string,string> test_location;
1025 test_location[ bucket_location.first ] = (bucket_location.second);
1026
1027 bool successful_detach = !(check_item_loc(cct, item, test_location,
1028 &test_weight));
1029 assert(successful_detach);
1030 assert(test_weight == 0);
1031
1032 return bucket_weight;
1033 }
1034
1035 int CrushWrapper::swap_bucket(CephContext *cct, int src, int dst)
1036 {
1037 if (src >= 0 || dst >= 0)
1038 return -EINVAL;
1039 if (!item_exists(src) || !item_exists(dst))
1040 return -EINVAL;
1041 crush_bucket *a = get_bucket(src);
1042 crush_bucket *b = get_bucket(dst);
1043 unsigned aw = a->weight;
1044 unsigned bw = b->weight;
1045
1046 // swap weights
1047 adjust_item_weight(cct, a->id, bw);
1048 adjust_item_weight(cct, b->id, aw);
1049
1050 // swap items
1051 map<int,unsigned> tmp;
1052 unsigned as = a->size;
1053 unsigned bs = b->size;
1054 for (unsigned i = 0; i < as; ++i) {
1055 int item = a->items[0];
1056 int itemw = crush_get_bucket_item_weight(a, 0);
1057 tmp[item] = itemw;
1058 bucket_remove_item(a, item);
1059 }
1060 assert(a->size == 0);
1061 assert(b->size == bs);
1062 for (unsigned i = 0; i < bs; ++i) {
1063 int item = b->items[0];
1064 int itemw = crush_get_bucket_item_weight(b, 0);
1065 bucket_remove_item(b, item);
1066 bucket_add_item(a, item, itemw);
1067 }
1068 assert(a->size == bs);
1069 assert(b->size == 0);
1070 for (auto t : tmp) {
1071 bucket_add_item(b, t.first, t.second);
1072 }
1073 assert(a->size == bs);
1074 assert(b->size == as);
1075
1076 // swap names
1077 swap_names(src, dst);
1078 return 0;
1079 }
1080
1081 int CrushWrapper::link_bucket(
1082 CephContext *cct, int id, const map<string,string>& loc)
1083 {
1084 // sorry this only works for buckets
1085 if (id >= 0)
1086 return -EINVAL;
1087
1088 if (!item_exists(id))
1089 return -ENOENT;
1090
1091 // get the name of the bucket we are trying to move for later
1092 string id_name = get_item_name(id);
1093
1094 crush_bucket *b = get_bucket(id);
1095 unsigned bucket_weight = b->weight;
1096
1097 return insert_item(cct, id, bucket_weight / (float)0x10000, id_name, loc);
1098 }
1099
1100 int CrushWrapper::create_or_move_item(
1101 CephContext *cct, int item, float weight, string name,
1102 const map<string,string>& loc) // typename -> bucketname
1103 {
1104 int ret = 0;
1105 int old_iweight;
1106
1107 if (!is_valid_crush_name(name))
1108 return -EINVAL;
1109
1110 if (check_item_loc(cct, item, loc, &old_iweight)) {
1111 ldout(cct, 5) << "create_or_move_item " << item << " already at " << loc
1112 << dendl;
1113 } else {
1114 if (_search_item_exists(item)) {
1115 weight = get_item_weightf(item);
1116 ldout(cct, 10) << "create_or_move_item " << item
1117 << " exists with weight " << weight << dendl;
1118 remove_item(cct, item, true);
1119 }
1120 ldout(cct, 5) << "create_or_move_item adding " << item
1121 << " weight " << weight
1122 << " at " << loc << dendl;
1123 ret = insert_item(cct, item, weight, name, loc);
1124 if (ret == 0)
1125 ret = 1; // changed
1126 }
1127 return ret;
1128 }
1129
1130 int CrushWrapper::update_item(
1131 CephContext *cct, int item, float weight, string name,
1132 const map<string,string>& loc) // typename -> bucketname
1133 {
1134 ldout(cct, 5) << "update_item item " << item << " weight " << weight
1135 << " name " << name << " loc " << loc << dendl;
1136 int ret = 0;
1137
1138 if (!is_valid_crush_name(name))
1139 return -EINVAL;
1140
1141 if (!is_valid_crush_loc(cct, loc))
1142 return -EINVAL;
1143
1144 ret = validate_weightf(weight);
1145 if (ret < 0) {
1146 return ret;
1147 }
1148
1149 // compare quantized (fixed-point integer) weights!
1150 int iweight = (int)(weight * (float)0x10000);
1151 int old_iweight;
1152 if (check_item_loc(cct, item, loc, &old_iweight)) {
1153 ldout(cct, 5) << "update_item " << item << " already at " << loc << dendl;
1154 if (old_iweight != iweight) {
1155 ldout(cct, 5) << "update_item " << item << " adjusting weight "
1156 << ((float)old_iweight/(float)0x10000) << " -> " << weight
1157 << dendl;
1158 adjust_item_weight_in_loc(cct, item, iweight, loc);
1159 ret = 1;
1160 }
1161 if (get_item_name(item) != name) {
1162 ldout(cct, 5) << "update_item setting " << item << " name to " << name
1163 << dendl;
1164 set_item_name(item, name);
1165 ret = 1;
1166 }
1167 } else {
1168 if (item_exists(item)) {
1169 remove_item(cct, item, true);
1170 }
1171 ldout(cct, 5) << "update_item adding " << item << " weight " << weight
1172 << " at " << loc << dendl;
1173 ret = insert_item(cct, item, weight, name, loc);
1174 if (ret == 0)
1175 ret = 1; // changed
1176 }
1177 return ret;
1178 }
1179
1180 int CrushWrapper::get_item_weight(int id) const
1181 {
1182 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1183 crush_bucket *b = crush->buckets[bidx];
1184 if (b == NULL)
1185 continue;
1186 if (b->id == id)
1187 return b->weight;
1188 for (unsigned i = 0; i < b->size; i++)
1189 if (b->items[i] == id)
1190 return crush_get_bucket_item_weight(b, i);
1191 }
1192 return -ENOENT;
1193 }
1194
1195 int CrushWrapper::get_item_weight_in_loc(int id, const map<string,string> &loc)
1196 {
1197 for (map<string,string>::const_iterator l = loc.begin(); l != loc.end(); ++l) {
1198
1199 int bid = get_item_id(l->second);
1200 if (!bucket_exists(bid))
1201 continue;
1202 crush_bucket *b = get_bucket(bid);
1203 for (unsigned int i = 0; i < b->size; i++) {
1204 if (b->items[i] == id) {
1205 return crush_get_bucket_item_weight(b, i);
1206 }
1207 }
1208 }
1209 return -ENOENT;
1210 }
1211
1212 int CrushWrapper::adjust_item_weight(CephContext *cct, int id, int weight)
1213 {
1214 ldout(cct, 5) << "adjust_item_weight " << id << " weight " << weight << dendl;
1215 int changed = 0;
1216 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1217 crush_bucket *b = crush->buckets[bidx];
1218 if (b == 0)
1219 continue;
1220 for (unsigned i = 0; i < b->size; i++) {
1221 if (b->items[i] == id) {
1222 int diff = bucket_adjust_item_weight(cct, b, id, weight);
1223 ldout(cct, 5) << "adjust_item_weight " << id << " diff " << diff
1224 << " in bucket " << bidx << dendl;
1225 adjust_item_weight(cct, -1 - bidx, b->weight);
1226 changed++;
1227 }
1228 }
1229 }
1230 if (!changed)
1231 return -ENOENT;
1232 return changed;
1233 }
1234
1235 int CrushWrapper::adjust_item_weight_in_loc(CephContext *cct, int id, int weight, const map<string,string>& loc)
1236 {
1237 ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " weight " << weight
1238 << " in " << loc << dendl;
1239 int changed = 0;
1240
1241 for (auto l = loc.begin(); l != loc.end(); ++l) {
1242 int bid = get_item_id(l->second);
1243 if (!bucket_exists(bid))
1244 continue;
1245 crush_bucket *b = get_bucket(bid);
1246 for (unsigned int i = 0; i < b->size; i++) {
1247 if (b->items[i] == id) {
1248 int diff = bucket_adjust_item_weight(cct, b, id, weight);
1249 ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " diff " << diff
1250 << " in bucket " << bid << dendl;
1251 adjust_item_weight(cct, bid, b->weight);
1252 changed++;
1253 }
1254 }
1255 }
1256 if (!changed)
1257 return -ENOENT;
1258 return changed;
1259 }
1260
1261 int CrushWrapper::adjust_subtree_weight(CephContext *cct, int id, int weight)
1262 {
1263 ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl;
1264 crush_bucket *b = get_bucket(id);
1265 if (IS_ERR(b))
1266 return PTR_ERR(b);
1267 int changed = 0;
1268 list<crush_bucket*> q;
1269 q.push_back(b);
1270 while (!q.empty()) {
1271 b = q.front();
1272 q.pop_front();
1273 int local_changed = 0;
1274 for (unsigned i=0; i<b->size; ++i) {
1275 int n = b->items[i];
1276 if (n >= 0) {
1277 bucket_adjust_item_weight(cct, b, n, weight);
1278 ++changed;
1279 ++local_changed;
1280 } else {
1281 crush_bucket *sub = get_bucket(n);
1282 if (IS_ERR(sub))
1283 continue;
1284 q.push_back(sub);
1285 }
1286 }
1287 if (local_changed) {
1288 adjust_item_weight(cct, b->id, b->weight);
1289 }
1290 }
1291 return changed;
1292 }
1293
1294 bool CrushWrapper::check_item_present(int id) const
1295 {
1296 bool found = false;
1297
1298 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1299 crush_bucket *b = crush->buckets[bidx];
1300 if (b == 0)
1301 continue;
1302 for (unsigned i = 0; i < b->size; i++)
1303 if (b->items[i] == id)
1304 found = true;
1305 }
1306 return found;
1307 }
1308
1309
1310 pair<string,string> CrushWrapper::get_immediate_parent(int id, int *_ret)
1311 {
1312
1313 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1314 crush_bucket *b = crush->buckets[bidx];
1315 if (b == 0)
1316 continue;
1317 if (is_shadow_item(b->id))
1318 continue;
1319 for (unsigned i = 0; i < b->size; i++)
1320 if (b->items[i] == id) {
1321 string parent_id = name_map[b->id];
1322 string parent_bucket_type = type_map[b->type];
1323 if (_ret)
1324 *_ret = 0;
1325 return make_pair(parent_bucket_type, parent_id);
1326 }
1327 }
1328
1329 if (_ret)
1330 *_ret = -ENOENT;
1331
1332 return pair<string, string>();
1333 }
1334
1335 int CrushWrapper::get_immediate_parent_id(int id, int *parent) const
1336 {
1337 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
1338 crush_bucket *b = crush->buckets[bidx];
1339 if (b == 0)
1340 continue;
1341 if (is_shadow_item(b->id))
1342 continue;
1343 for (unsigned i = 0; i < b->size; i++) {
1344 if (b->items[i] == id) {
1345 *parent = b->id;
1346 return 0;
1347 }
1348 }
1349 }
1350 return -ENOENT;
1351 }
1352
1353 int CrushWrapper::get_parent_of_type(int item, int type) const
1354 {
1355 do {
1356 int r = get_immediate_parent_id(item, &item);
1357 if (r < 0) {
1358 return 0;
1359 }
1360 } while (get_bucket_type(item) != type);
1361 return item;
1362 }
1363
1364 int CrushWrapper::rename_class(const string& srcname, const string& dstname)
1365 {
1366 auto i = class_rname.find(srcname);
1367 if (i == class_rname.end())
1368 return -ENOENT;
1369 auto j = class_rname.find(dstname);
1370 if (j != class_rname.end())
1371 return -EEXIST;
1372
1373 int class_id = i->second;
1374 assert(class_name.count(class_id));
1375 // rename any shadow buckets of old class name
1376 for (auto &it: class_map) {
1377 if (it.first < 0 && it.second == class_id) {
1378 string old_name = get_item_name(it.first);
1379 size_t pos = old_name.find("~");
1380 assert(pos != string::npos);
1381 string name_no_class = old_name.substr(0, pos);
1382 string old_class_name = old_name.substr(pos + 1);
1383 assert(old_class_name == srcname);
1384 string new_name = name_no_class + "~" + dstname;
1385 // we do not use set_item_name
1386 // because the name is intentionally invalid
1387 name_map[it.first] = new_name;
1388 have_rmaps = false;
1389 }
1390 }
1391
1392 // rename class
1393 class_rname.erase(srcname);
1394 class_name.erase(class_id);
1395 class_rname[dstname] = class_id;
1396 class_name[class_id] = dstname;
1397 return 0;
1398 }
1399
1400 int CrushWrapper::populate_classes(
1401 const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket)
1402 {
1403 // build set of previous used shadow ids
1404 set<int32_t> used_ids;
1405 for (auto& p : old_class_bucket) {
1406 for (auto& q : p.second) {
1407 used_ids.insert(q.second);
1408 }
1409 }
1410 // accumulate weight values for each carg and bucket as we go. because it is
1411 // depth first, we will have the nested bucket weights we need when we
1412 // finish constructing the containing buckets.
1413 map<int,map<int,vector<int>>> cmap_item_weight; // cargs -> bno -> weights
1414 set<int> roots;
1415 find_nonshadow_roots(roots);
1416 for (auto &r : roots) {
1417 if (r >= 0)
1418 continue;
1419 for (auto &c : class_name) {
1420 int clone;
1421 int res = device_class_clone(r, c.first, old_class_bucket, used_ids,
1422 &clone, &cmap_item_weight);
1423 if (res < 0)
1424 return res;
1425 }
1426 }
1427 return 0;
1428 }
1429
1430 int CrushWrapper::trim_roots_with_class()
1431 {
1432 set<int> roots;
1433 find_shadow_roots(roots);
1434 for (auto &r : roots) {
1435 if (r >= 0)
1436 continue;
1437 int res = remove_root(r);
1438 if (res)
1439 return res;
1440 }
1441 // there is no need to reweight because we only remove from the
1442 // root and down
1443 return 0;
1444 }
1445
1446 int32_t CrushWrapper::_alloc_class_id() const {
1447 if (class_name.empty()) {
1448 return 0;
1449 }
1450 int32_t class_id = class_name.rbegin()->first + 1;
1451 if (class_id >= 0) {
1452 return class_id;
1453 }
1454 // wrapped, pick a random start and do exhaustive search
1455 uint32_t upperlimit = numeric_limits<int32_t>::max();
1456 upperlimit++;
1457 class_id = rand() % upperlimit;
1458 const auto start = class_id;
1459 do {
1460 if (!class_name.count(class_id)) {
1461 return class_id;
1462 } else {
1463 class_id++;
1464 if (class_id < 0) {
1465 class_id = 0;
1466 }
1467 }
1468 } while (class_id != start);
1469 assert(0 == "no available class id");
1470 }
1471
1472 void CrushWrapper::reweight(CephContext *cct)
1473 {
1474 set<int> roots;
1475 find_roots(roots);
1476 for (set<int>::iterator p = roots.begin(); p != roots.end(); ++p) {
1477 if (*p >= 0)
1478 continue;
1479 crush_bucket *b = get_bucket(*p);
1480 ldout(cct, 5) << "reweight bucket " << *p << dendl;
1481 int r = crush_reweight_bucket(crush, b);
1482 assert(r == 0);
1483 }
1484 }
1485
1486 int CrushWrapper::add_simple_rule_at(
1487 string name, string root_name,
1488 string failure_domain_name,
1489 string device_class,
1490 string mode, int rule_type,
1491 int rno,
1492 ostream *err)
1493 {
1494 if (rule_exists(name)) {
1495 if (err)
1496 *err << "rule " << name << " exists";
1497 return -EEXIST;
1498 }
1499 if (rno >= 0) {
1500 if (rule_exists(rno)) {
1501 if (err)
1502 *err << "rule with ruleno " << rno << " exists";
1503 return -EEXIST;
1504 }
1505 if (ruleset_exists(rno)) {
1506 if (err)
1507 *err << "ruleset " << rno << " exists";
1508 return -EEXIST;
1509 }
1510 } else {
1511 for (rno = 0; rno < get_max_rules(); rno++) {
1512 if (!rule_exists(rno) && !ruleset_exists(rno))
1513 break;
1514 }
1515 }
1516 if (!name_exists(root_name)) {
1517 if (err)
1518 *err << "root item " << root_name << " does not exist";
1519 return -ENOENT;
1520 }
1521 int root = get_item_id(root_name);
1522 int type = 0;
1523 if (failure_domain_name.length()) {
1524 type = get_type_id(failure_domain_name);
1525 if (type < 0) {
1526 if (err)
1527 *err << "unknown type " << failure_domain_name;
1528 return -EINVAL;
1529 }
1530 }
1531 if (device_class.size()) {
1532 if (!class_exists(device_class)) {
1533 if (err)
1534 *err << "device class " << device_class << " does not exist";
1535 return -EINVAL;
1536 }
1537 int c = get_class_id(device_class);
1538 if (class_bucket.count(root) == 0 ||
1539 class_bucket[root].count(c) == 0) {
1540 if (err)
1541 *err << "root " << root_name << " has no devices with class "
1542 << device_class;
1543 return -EINVAL;
1544 }
1545 root = class_bucket[root][c];
1546 }
1547 if (mode != "firstn" && mode != "indep") {
1548 if (err)
1549 *err << "unknown mode " << mode;
1550 return -EINVAL;
1551 }
1552
1553 int steps = 3;
1554 if (mode == "indep")
1555 steps = 5;
1556 int min_rep = mode == "firstn" ? 1 : 3;
1557 int max_rep = mode == "firstn" ? 10 : 20;
1558 //set the ruleset the same as rule_id(rno)
1559 crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_rep, max_rep);
1560 assert(rule);
1561 int step = 0;
1562 if (mode == "indep") {
1563 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
1564 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
1565 }
1566 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0);
1567 if (type)
1568 crush_rule_set_step(rule, step++,
1569 mode == "firstn" ? CRUSH_RULE_CHOOSELEAF_FIRSTN :
1570 CRUSH_RULE_CHOOSELEAF_INDEP,
1571 CRUSH_CHOOSE_N,
1572 type);
1573 else
1574 crush_rule_set_step(rule, step++,
1575 mode == "firstn" ? CRUSH_RULE_CHOOSE_FIRSTN :
1576 CRUSH_RULE_CHOOSE_INDEP,
1577 CRUSH_CHOOSE_N,
1578 0);
1579 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
1580
1581 int ret = crush_add_rule(crush, rule, rno);
1582 if(ret < 0) {
1583 *err << "failed to add rule " << rno << " because " << cpp_strerror(ret);
1584 return ret;
1585 }
1586 set_rule_name(rno, name);
1587 have_rmaps = false;
1588 return rno;
1589 }
1590
1591 int CrushWrapper::add_simple_rule(
1592 string name, string root_name,
1593 string failure_domain_name,
1594 string device_class,
1595 string mode, int rule_type,
1596 ostream *err)
1597 {
1598 return add_simple_rule_at(name, root_name, failure_domain_name, device_class,
1599 mode,
1600 rule_type, -1, err);
1601 }
1602
1603 int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap)
1604 {
1605 if (ruleno >= crush->max_rules)
1606 return -ENOENT;
1607 if (crush->rules[ruleno] == NULL)
1608 return -ENOENT;
1609 crush_rule *rule = crush->rules[ruleno];
1610
1611 // build a weight map for each TAKE in the rule, and then merge them
1612
1613 // FIXME: if there are multiple takes that place a different number of
1614 // objects we do not take that into account. (Also, note that doing this
1615 // right is also a function of the pool, since the crush rule
1616 // might choose 2 + choose 2 but pool size may only be 3.)
1617 for (unsigned i=0; i<rule->len; ++i) {
1618 map<int,float> m;
1619 float sum = 0;
1620 if (rule->steps[i].op == CRUSH_RULE_TAKE) {
1621 int n = rule->steps[i].arg1;
1622 if (n >= 0) {
1623 m[n] = 1.0;
1624 sum = 1.0;
1625 } else {
1626 list<int> q;
1627 q.push_back(n);
1628 //breadth first iterate the OSD tree
1629 while (!q.empty()) {
1630 int bno = q.front();
1631 q.pop_front();
1632 crush_bucket *b = crush->buckets[-1-bno];
1633 assert(b);
1634 for (unsigned j=0; j<b->size; ++j) {
1635 int item_id = b->items[j];
1636 if (item_id >= 0) { //it's an OSD
1637 float w = crush_get_bucket_item_weight(b, j);
1638 m[item_id] = w;
1639 sum += w;
1640 } else { //not an OSD, expand the child later
1641 q.push_back(item_id);
1642 }
1643 }
1644 }
1645 }
1646 }
1647 for (map<int,float>::iterator p = m.begin(); p != m.end(); ++p) {
1648 map<int,float>::iterator q = pmap->find(p->first);
1649 if (q == pmap->end()) {
1650 (*pmap)[p->first] = p->second / sum;
1651 } else {
1652 q->second += p->second / sum;
1653 }
1654 }
1655 }
1656
1657 return 0;
1658 }
1659
1660 int CrushWrapper::remove_rule(int ruleno)
1661 {
1662 if (ruleno >= (int)crush->max_rules)
1663 return -ENOENT;
1664 if (crush->rules[ruleno] == NULL)
1665 return -ENOENT;
1666 crush_destroy_rule(crush->rules[ruleno]);
1667 crush->rules[ruleno] = NULL;
1668 rule_name_map.erase(ruleno);
1669 have_rmaps = false;
1670 return 0;
1671 }
1672
1673 int CrushWrapper::bucket_adjust_item_weight(CephContext *cct, crush_bucket *bucket, int item, int weight)
1674 {
1675 if (cct->_conf->osd_crush_update_weight_set) {
1676 unsigned position;
1677 for (position = 0; position < bucket->size; position++)
1678 if (bucket->items[position] == item)
1679 break;
1680 assert(position != bucket->size);
1681 for (auto w : choose_args) {
1682 crush_choose_arg_map arg_map = w.second;
1683 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
1684 for (__u32 j = 0; j < arg->weight_set_size; j++) {
1685 crush_weight_set *weight_set = &arg->weight_set[j];
1686 weight_set->weights[position] = weight;
1687 }
1688 }
1689 }
1690 return crush_bucket_adjust_item_weight(crush, bucket, item, weight);
1691 }
1692
1693 int CrushWrapper::add_bucket(
1694 int bucketno, int alg, int hash, int type, int size,
1695 int *items, int *weights, int *idout)
1696 {
1697 if (alg == 0) {
1698 alg = get_default_bucket_alg();
1699 if (alg == 0)
1700 return -EINVAL;
1701 }
1702 crush_bucket *b = crush_make_bucket(crush, alg, hash, type, size, items,
1703 weights);
1704 assert(b);
1705 int r = crush_add_bucket(crush, bucketno, b, idout);
1706 for (auto& p : choose_args) {
1707 crush_choose_arg_map& cmap = p.second;
1708 if (cmap.args) {
1709 if ((int)cmap.size <= *idout) {
1710 cmap.args = (crush_choose_arg*)realloc(
1711 cmap.args,
1712 sizeof(crush_choose_arg) * (*idout + 1));
1713 memset(&cmap.args[cmap.size], 0,
1714 sizeof(crush_choose_arg) * (*idout + 1 - cmap.size));
1715 cmap.size = *idout + 1;
1716 }
1717 } else {
1718 cmap.args = (crush_choose_arg*)calloc(sizeof(crush_choose_arg),
1719 *idout + 1);
1720 cmap.size = *idout + 1;
1721 }
1722 if (size > 0) {
1723 int positions = get_choose_args_positions(cmap);
1724 crush_choose_arg& carg = cmap.args[*idout];
1725 carg.weight_set = (crush_weight_set*)calloc(sizeof(crush_weight_set),
1726 size);
1727 carg.weight_set_size = positions;
1728 for (int ppos = 0; ppos < positions; ++ppos) {
1729 carg.weight_set[ppos].weights = (__u32*)calloc(sizeof(__u32), size);
1730 carg.weight_set[ppos].size = size;
1731 for (int bpos = 0; bpos < size; ++bpos) {
1732 carg.weight_set[ppos].weights[bpos] = weights[bpos];
1733 }
1734 }
1735 }
1736 }
1737 return r;
1738 }
1739
1740 int CrushWrapper::bucket_add_item(crush_bucket *bucket, int item, int weight)
1741 {
1742 __u32 new_size = bucket->size + 1;
1743 int r = crush_bucket_add_item(crush, bucket, item, weight);
1744 if (r < 0) {
1745 return r;
1746 }
1747 for (auto w : choose_args) {
1748 crush_choose_arg_map arg_map = w.second;
1749 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
1750 for (__u32 j = 0; j < arg->weight_set_size; j++) {
1751 crush_weight_set *weight_set = &arg->weight_set[j];
1752 weight_set->weights = (__u32*)realloc(weight_set->weights,
1753 new_size * sizeof(__u32));
1754 assert(weight_set->size + 1 == new_size);
1755 weight_set->weights[weight_set->size] = weight;
1756 weight_set->size = new_size;
1757 }
1758 if (arg->ids_size) {
1759 arg->ids = (__s32 *)realloc(arg->ids, new_size * sizeof(__s32));
1760 assert(arg->ids_size + 1 == new_size);
1761 arg->ids[arg->ids_size] = item;
1762 arg->ids_size = new_size;
1763 }
1764 }
1765 return 0;
1766 }
1767
1768 int CrushWrapper::bucket_remove_item(crush_bucket *bucket, int item)
1769 {
1770 __u32 new_size = bucket->size - 1;
1771 unsigned position;
1772 for (position = 0; position < bucket->size; position++)
1773 if (bucket->items[position] == item)
1774 break;
1775 assert(position != bucket->size);
1776 int r = crush_bucket_remove_item(crush, bucket, item);
1777 if (r < 0) {
1778 return r;
1779 }
1780 for (auto w : choose_args) {
1781 crush_choose_arg_map arg_map = w.second;
1782 crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
1783 for (__u32 j = 0; j < arg->weight_set_size; j++) {
1784 crush_weight_set *weight_set = &arg->weight_set[j];
1785 assert(weight_set->size - 1 == new_size);
1786 for (__u32 k = position; k < new_size; k++)
1787 weight_set->weights[k] = weight_set->weights[k+1];
1788 if (new_size) {
1789 weight_set->weights = (__u32*)realloc(weight_set->weights,
1790 new_size * sizeof(__u32));
1791 } else {
1792 weight_set->weights = NULL;
1793 }
1794 weight_set->size = new_size;
1795 }
1796 if (arg->ids_size) {
1797 assert(arg->ids_size - 1 == new_size);
1798 for (__u32 k = position; k < new_size; k++)
1799 arg->ids[k] = arg->ids[k+1];
1800 if (new_size) {
1801 arg->ids = (__s32 *)realloc(arg->ids, new_size * sizeof(__s32));
1802 } else {
1803 arg->ids = NULL;
1804 }
1805 arg->ids_size = new_size;
1806 }
1807 }
1808 return 0;
1809 }
1810
1811 int CrushWrapper::update_device_class(int id,
1812 const string& class_name,
1813 const string& name,
1814 ostream *ss)
1815 {
1816 assert(item_exists(id));
1817 auto old_class_name = get_item_class(id);
1818 if (old_class_name && old_class_name != class_name) {
1819 *ss << "osd." << id << " has already bound to class '" << old_class_name
1820 << "', can not reset class to '" << class_name << "'; "
1821 << "use 'ceph osd crush rm-device-class <osd>' to "
1822 << "remove old class first";
1823 return -EBUSY;
1824 }
1825
1826 int class_id = get_or_create_class_id(class_name);
1827 if (id < 0) {
1828 *ss << name << " id " << id << " is negative";
1829 return -EINVAL;
1830 }
1831
1832 if (class_map.count(id) != 0 && class_map[id] == class_id) {
1833 *ss << name << " already set to class " << class_name;
1834 return 0;
1835 }
1836
1837 set_item_class(id, class_id);
1838
1839 int r = rebuild_roots_with_classes();
1840 if (r < 0)
1841 return r;
1842 return 1;
1843 }
1844
1845 int CrushWrapper::remove_device_class(CephContext *cct, int id, ostream *ss)
1846 {
1847 assert(ss);
1848 const char *name = get_item_name(id);
1849 if (!name) {
1850 *ss << "osd." << id << " does not have a name";
1851 return -ENOENT;
1852 }
1853
1854 const char *class_name = get_item_class(id);
1855 if (!class_name) {
1856 *ss << "osd." << id << " has not been bound to a specific class yet";
1857 return 0;
1858 }
1859 class_remove_item(id);
1860
1861 int r = rebuild_roots_with_classes();
1862 if (r < 0) {
1863 *ss << "unable to rebuild roots with class '" << class_name << "' "
1864 << "of osd." << id << ": " << cpp_strerror(r);
1865 return r;
1866 }
1867 return 0;
1868 }
1869
1870 int CrushWrapper::device_class_clone(
1871 int original_id, int device_class,
1872 const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket,
1873 const std::set<int32_t>& used_ids,
1874 int *clone,
1875 map<int,map<int,vector<int>>> *cmap_item_weight)
1876 {
1877 const char *item_name = get_item_name(original_id);
1878 if (item_name == NULL)
1879 return -ECHILD;
1880 const char *class_name = get_class_name(device_class);
1881 if (class_name == NULL)
1882 return -EBADF;
1883 string copy_name = item_name + string("~") + class_name;
1884 if (name_exists(copy_name)) {
1885 *clone = get_item_id(copy_name);
1886 return 0;
1887 }
1888
1889 crush_bucket *original = get_bucket(original_id);
1890 assert(!IS_ERR(original));
1891 crush_bucket *copy = crush_make_bucket(crush,
1892 original->alg,
1893 original->hash,
1894 original->type,
1895 0, NULL, NULL);
1896 assert(copy);
1897
1898 vector<unsigned> item_orig_pos; // new item pos -> orig item pos
1899 for (unsigned i = 0; i < original->size; i++) {
1900 int item = original->items[i];
1901 int weight = crush_get_bucket_item_weight(original, i);
1902 if (item >= 0) {
1903 if (class_map.count(item) != 0 && class_map[item] == device_class) {
1904 int res = crush_bucket_add_item(crush, copy, item, weight);
1905 if (res)
1906 return res;
1907 } else {
1908 continue;
1909 }
1910 } else {
1911 int child_copy_id;
1912 int res = device_class_clone(item, device_class, old_class_bucket,
1913 used_ids, &child_copy_id,
1914 cmap_item_weight);
1915 if (res < 0)
1916 return res;
1917 crush_bucket *child_copy = get_bucket(child_copy_id);
1918 assert(!IS_ERR(child_copy));
1919 res = crush_bucket_add_item(crush, copy, child_copy_id,
1920 child_copy->weight);
1921 if (res)
1922 return res;
1923 }
1924 item_orig_pos.push_back(i);
1925 }
1926 assert(item_orig_pos.size() == copy->size);
1927
1928 int bno = 0;
1929 if (old_class_bucket.count(original_id) &&
1930 old_class_bucket.at(original_id).count(device_class)) {
1931 bno = old_class_bucket.at(original_id).at(device_class);
1932 } else {
1933 // pick a new shadow bucket id that is not used by the current map
1934 // *or* any previous shadow buckets.
1935 bno = -1;
1936 while (((-1-bno) < crush->max_buckets && crush->buckets[-1-bno]) ||
1937 used_ids.count(bno)) {
1938 --bno;
1939 }
1940 }
1941 int res = crush_add_bucket(crush, bno, copy, clone);
1942 if (res)
1943 return res;
1944 assert(!bno || bno == *clone);
1945
1946 res = set_item_class(*clone, device_class);
1947 if (res < 0)
1948 return res;
1949
1950 // we do not use set_item_name because the name is intentionally invalid
1951 name_map[*clone] = copy_name;
1952 if (have_rmaps)
1953 name_rmap[copy_name] = *clone;
1954 class_bucket[original_id][device_class] = *clone;
1955
1956 // set up choose_args for the new bucket.
1957 for (auto& w : choose_args) {
1958 crush_choose_arg_map& cmap = w.second;
1959 if (-1-bno >= (int)cmap.size) {
1960 unsigned new_size = -1-bno + 1;
1961 cmap.args = (crush_choose_arg*)realloc(cmap.args,
1962 new_size * sizeof(cmap.args[0]));
1963 memset(cmap.args + cmap.size, 0,
1964 (new_size - cmap.size) * sizeof(cmap.args[0]));
1965 }
1966 auto& o = cmap.args[-1-original_id];
1967 auto& n = cmap.args[-1-bno];
1968 n.ids_size = 0; // FIXME: implement me someday
1969 n.weight_set_size = o.weight_set_size;
1970 n.weight_set = (crush_weight_set*)calloc(
1971 n.weight_set_size, sizeof(crush_weight_set));
1972 for (size_t s = 0; s < n.weight_set_size; ++s) {
1973 n.weight_set[s].size = copy->size;
1974 n.weight_set[s].weights = (__u32*)calloc(copy->size, sizeof(__u32));
1975 }
1976 for (size_t s = 0; s < n.weight_set_size; ++s) {
1977 vector<int> bucket_weights(n.weight_set_size);
1978 for (size_t i = 0; i < copy->size; ++i) {
1979 int item = copy->items[i];
1980 if (item >= 0) {
1981 n.weight_set[s].weights[i] = o.weight_set[s].weights[item_orig_pos[i]];
1982 } else {
1983 n.weight_set[s].weights[i] = (*cmap_item_weight)[w.first][item][s];
1984 }
1985 bucket_weights[s] += n.weight_set[s].weights[i];
1986 }
1987 (*cmap_item_weight)[w.first][bno] = bucket_weights;
1988 }
1989 }
1990 return 0;
1991 }
1992
1993 bool CrushWrapper::_class_is_dead(int class_id)
1994 {
1995 for (auto &p: class_map) {
1996 if (p.first >= 0 && p.second == class_id) {
1997 return false;
1998 }
1999 }
2000 for (unsigned i = 0; i < crush->max_rules; ++i) {
2001 crush_rule *r = crush->rules[i];
2002 if (!r)
2003 continue;
2004 for (unsigned j = 0; j < r->len; ++j) {
2005 if (r->steps[j].op == CRUSH_RULE_TAKE) {
2006 int root = r->steps[j].arg1;
2007 for (auto &p : class_bucket) {
2008 auto& q = p.second;
2009 if (q.count(class_id) && q[class_id] == root) {
2010 return false;
2011 }
2012 }
2013 }
2014 }
2015 }
2016 // no more referenced by any devices or crush rules
2017 return true;
2018 }
2019
2020 void CrushWrapper::cleanup_dead_classes()
2021 {
2022 auto p = class_name.begin();
2023 while (p != class_name.end()) {
2024 if (_class_is_dead(p->first)) {
2025 string n = p->second;
2026 ++p;
2027 remove_class_name(n);
2028 } else {
2029 ++p;
2030 }
2031 }
2032 }
2033
2034 int CrushWrapper::rebuild_roots_with_classes()
2035 {
2036 std::map<int32_t, map<int32_t, int32_t> > old_class_bucket = class_bucket;
2037 cleanup_dead_classes();
2038 int r = trim_roots_with_class();
2039 if (r < 0)
2040 return r;
2041 class_bucket.clear();
2042 return populate_classes(old_class_bucket);
2043 }
2044
2045 void CrushWrapper::encode(bufferlist& bl, uint64_t features) const
2046 {
2047 assert(crush);
2048
2049 __u32 magic = CRUSH_MAGIC;
2050 ::encode(magic, bl);
2051
2052 ::encode(crush->max_buckets, bl);
2053 ::encode(crush->max_rules, bl);
2054 ::encode(crush->max_devices, bl);
2055
2056 bool encode_compat_choose_args = false;
2057 crush_choose_arg_map arg_map;
2058 memset(&arg_map, '\0', sizeof(arg_map));
2059 if (has_choose_args() &&
2060 !HAVE_FEATURE(features, CRUSH_CHOOSE_ARGS)) {
2061 assert(!has_incompat_choose_args());
2062 encode_compat_choose_args = true;
2063 arg_map = choose_args.begin()->second;
2064 }
2065
2066 // buckets
2067 for (int i=0; i<crush->max_buckets; i++) {
2068 __u32 alg = 0;
2069 if (crush->buckets[i]) alg = crush->buckets[i]->alg;
2070 ::encode(alg, bl);
2071 if (!alg)
2072 continue;
2073
2074 ::encode(crush->buckets[i]->id, bl);
2075 ::encode(crush->buckets[i]->type, bl);
2076 ::encode(crush->buckets[i]->alg, bl);
2077 ::encode(crush->buckets[i]->hash, bl);
2078 ::encode(crush->buckets[i]->weight, bl);
2079 ::encode(crush->buckets[i]->size, bl);
2080 for (unsigned j=0; j<crush->buckets[i]->size; j++)
2081 ::encode(crush->buckets[i]->items[j], bl);
2082
2083 switch (crush->buckets[i]->alg) {
2084 case CRUSH_BUCKET_UNIFORM:
2085 ::encode((reinterpret_cast<crush_bucket_uniform*>(crush->buckets[i]))->item_weight, bl);
2086 break;
2087
2088 case CRUSH_BUCKET_LIST:
2089 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
2090 ::encode((reinterpret_cast<crush_bucket_list*>(crush->buckets[i]))->item_weights[j], bl);
2091 ::encode((reinterpret_cast<crush_bucket_list*>(crush->buckets[i]))->sum_weights[j], bl);
2092 }
2093 break;
2094
2095 case CRUSH_BUCKET_TREE:
2096 ::encode((reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->num_nodes, bl);
2097 for (unsigned j=0; j<(reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->num_nodes; j++)
2098 ::encode((reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->node_weights[j], bl);
2099 break;
2100
2101 case CRUSH_BUCKET_STRAW:
2102 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
2103 ::encode((reinterpret_cast<crush_bucket_straw*>(crush->buckets[i]))->item_weights[j], bl);
2104 ::encode((reinterpret_cast<crush_bucket_straw*>(crush->buckets[i]))->straws[j], bl);
2105 }
2106 break;
2107
2108 case CRUSH_BUCKET_STRAW2:
2109 {
2110 __u32 *weights;
2111 if (encode_compat_choose_args &&
2112 arg_map.args[i].weight_set_size > 0) {
2113 weights = arg_map.args[i].weight_set[0].weights;
2114 } else {
2115 weights = (reinterpret_cast<crush_bucket_straw2*>(crush->buckets[i]))->item_weights;
2116 }
2117 for (unsigned j=0; j<crush->buckets[i]->size; j++) {
2118 ::encode(weights[j], bl);
2119 }
2120 }
2121 break;
2122
2123 default:
2124 ceph_abort();
2125 break;
2126 }
2127 }
2128
2129 // rules
2130 for (unsigned i=0; i<crush->max_rules; i++) {
2131 __u32 yes = crush->rules[i] ? 1:0;
2132 ::encode(yes, bl);
2133 if (!yes)
2134 continue;
2135
2136 ::encode(crush->rules[i]->len, bl);
2137 ::encode(crush->rules[i]->mask, bl);
2138 for (unsigned j=0; j<crush->rules[i]->len; j++)
2139 ::encode(crush->rules[i]->steps[j], bl);
2140 }
2141
2142 // name info
2143 ::encode(type_map, bl);
2144 ::encode(name_map, bl);
2145 ::encode(rule_name_map, bl);
2146
2147 // tunables
2148 ::encode(crush->choose_local_tries, bl);
2149 ::encode(crush->choose_local_fallback_tries, bl);
2150 ::encode(crush->choose_total_tries, bl);
2151 ::encode(crush->chooseleaf_descend_once, bl);
2152 ::encode(crush->chooseleaf_vary_r, bl);
2153 ::encode(crush->straw_calc_version, bl);
2154 ::encode(crush->allowed_bucket_algs, bl);
2155 if (features & CEPH_FEATURE_CRUSH_TUNABLES5) {
2156 ::encode(crush->chooseleaf_stable, bl);
2157 }
2158
2159 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
2160 // device classes
2161 ::encode(class_map, bl);
2162 ::encode(class_name, bl);
2163 ::encode(class_bucket, bl);
2164
2165 // choose args
2166 __u32 size = (__u32)choose_args.size();
2167 ::encode(size, bl);
2168 for (auto c : choose_args) {
2169 ::encode(c.first, bl);
2170 crush_choose_arg_map arg_map = c.second;
2171 size = 0;
2172 for (__u32 i = 0; i < arg_map.size; i++) {
2173 crush_choose_arg *arg = &arg_map.args[i];
2174 if (arg->weight_set_size == 0 &&
2175 arg->ids_size == 0)
2176 continue;
2177 size++;
2178 }
2179 ::encode(size, bl);
2180 for (__u32 i = 0; i < arg_map.size; i++) {
2181 crush_choose_arg *arg = &arg_map.args[i];
2182 if (arg->weight_set_size == 0 &&
2183 arg->ids_size == 0)
2184 continue;
2185 ::encode(i, bl);
2186 ::encode(arg->weight_set_size, bl);
2187 for (__u32 j = 0; j < arg->weight_set_size; j++) {
2188 crush_weight_set *weight_set = &arg->weight_set[j];
2189 ::encode(weight_set->size, bl);
2190 for (__u32 k = 0; k < weight_set->size; k++)
2191 ::encode(weight_set->weights[k], bl);
2192 }
2193 ::encode(arg->ids_size, bl);
2194 for (__u32 j = 0; j < arg->ids_size; j++)
2195 ::encode(arg->ids[j], bl);
2196 }
2197 }
2198 }
2199 }
2200
2201 static void decode_32_or_64_string_map(map<int32_t,string>& m, bufferlist::iterator& blp)
2202 {
2203 m.clear();
2204 __u32 n;
2205 ::decode(n, blp);
2206 while (n--) {
2207 __s32 key;
2208 ::decode(key, blp);
2209
2210 __u32 strlen;
2211 ::decode(strlen, blp);
2212 if (strlen == 0) {
2213 // der, key was actually 64-bits!
2214 ::decode(strlen, blp);
2215 }
2216 ::decode_nohead(strlen, m[key], blp);
2217 }
2218 }
2219
2220 void CrushWrapper::decode(bufferlist::iterator& blp)
2221 {
2222 create();
2223
2224 __u32 magic;
2225 ::decode(magic, blp);
2226 if (magic != CRUSH_MAGIC)
2227 throw buffer::malformed_input("bad magic number");
2228
2229 ::decode(crush->max_buckets, blp);
2230 ::decode(crush->max_rules, blp);
2231 ::decode(crush->max_devices, blp);
2232
2233 // legacy tunables, unless we decode something newer
2234 set_tunables_legacy();
2235
2236 try {
2237 // buckets
2238 crush->buckets = (crush_bucket**)calloc(1, crush->max_buckets * sizeof(crush_bucket*));
2239 for (int i=0; i<crush->max_buckets; i++) {
2240 decode_crush_bucket(&crush->buckets[i], blp);
2241 }
2242
2243 // rules
2244 crush->rules = (crush_rule**)calloc(1, crush->max_rules * sizeof(crush_rule*));
2245 for (unsigned i = 0; i < crush->max_rules; ++i) {
2246 __u32 yes;
2247 ::decode(yes, blp);
2248 if (!yes) {
2249 crush->rules[i] = NULL;
2250 continue;
2251 }
2252
2253 __u32 len;
2254 ::decode(len, blp);
2255 crush->rules[i] = reinterpret_cast<crush_rule*>(calloc(1, crush_rule_size(len)));
2256 crush->rules[i]->len = len;
2257 ::decode(crush->rules[i]->mask, blp);
2258 for (unsigned j=0; j<crush->rules[i]->len; j++)
2259 ::decode(crush->rules[i]->steps[j], blp);
2260 }
2261
2262 // name info
2263 // NOTE: we had a bug where we were incoding int instead of int32, which means the
2264 // 'key' field for these maps may be either 32 or 64 bits, depending. tolerate
2265 // both by assuming the string is always non-empty.
2266 decode_32_or_64_string_map(type_map, blp);
2267 decode_32_or_64_string_map(name_map, blp);
2268 decode_32_or_64_string_map(rule_name_map, blp);
2269
2270 // tunables
2271 if (!blp.end()) {
2272 ::decode(crush->choose_local_tries, blp);
2273 ::decode(crush->choose_local_fallback_tries, blp);
2274 ::decode(crush->choose_total_tries, blp);
2275 }
2276 if (!blp.end()) {
2277 ::decode(crush->chooseleaf_descend_once, blp);
2278 }
2279 if (!blp.end()) {
2280 ::decode(crush->chooseleaf_vary_r, blp);
2281 }
2282 if (!blp.end()) {
2283 ::decode(crush->straw_calc_version, blp);
2284 }
2285 if (!blp.end()) {
2286 ::decode(crush->allowed_bucket_algs, blp);
2287 }
2288 if (!blp.end()) {
2289 ::decode(crush->chooseleaf_stable, blp);
2290 }
2291 if (!blp.end()) {
2292 ::decode(class_map, blp);
2293 ::decode(class_name, blp);
2294 for (auto &c : class_name)
2295 class_rname[c.second] = c.first;
2296 ::decode(class_bucket, blp);
2297 }
2298 if (!blp.end()) {
2299 __u32 choose_args_size;
2300 ::decode(choose_args_size, blp);
2301 for (__u32 i = 0; i < choose_args_size; i++) {
2302 uint64_t choose_args_index;
2303 ::decode(choose_args_index, blp);
2304 crush_choose_arg_map arg_map;
2305 arg_map.size = crush->max_buckets;
2306 arg_map.args = (crush_choose_arg*)calloc(
2307 arg_map.size, sizeof(crush_choose_arg));
2308 __u32 size;
2309 ::decode(size, blp);
2310 for (__u32 j = 0; j < size; j++) {
2311 __u32 bucket_index;
2312 ::decode(bucket_index, blp);
2313 assert(bucket_index < arg_map.size);
2314 crush_choose_arg *arg = &arg_map.args[bucket_index];
2315 ::decode(arg->weight_set_size, blp);
2316 if (arg->weight_set_size) {
2317 arg->weight_set = (crush_weight_set*)calloc(
2318 arg->weight_set_size, sizeof(crush_weight_set));
2319 for (__u32 k = 0; k < arg->weight_set_size; k++) {
2320 crush_weight_set *weight_set = &arg->weight_set[k];
2321 ::decode(weight_set->size, blp);
2322 weight_set->weights = (__u32*)calloc(
2323 weight_set->size, sizeof(__u32));
2324 for (__u32 l = 0; l < weight_set->size; l++)
2325 ::decode(weight_set->weights[l], blp);
2326 }
2327 }
2328 ::decode(arg->ids_size, blp);
2329 if (arg->ids_size) {
2330 assert(arg->ids_size == crush->buckets[bucket_index]->size);
2331 arg->ids = (__s32 *)calloc(arg->ids_size, sizeof(__s32));
2332 for (__u32 k = 0; k < arg->ids_size; k++)
2333 ::decode(arg->ids[k], blp);
2334 }
2335 }
2336 choose_args[choose_args_index] = arg_map;
2337 }
2338 }
2339 finalize();
2340 }
2341 catch (...) {
2342 crush_destroy(crush);
2343 throw;
2344 }
2345 }
2346
2347 void CrushWrapper::decode_crush_bucket(crush_bucket** bptr, bufferlist::iterator &blp)
2348 {
2349 __u32 alg;
2350 ::decode(alg, blp);
2351 if (!alg) {
2352 *bptr = NULL;
2353 return;
2354 }
2355
2356 int size = 0;
2357 switch (alg) {
2358 case CRUSH_BUCKET_UNIFORM:
2359 size = sizeof(crush_bucket_uniform);
2360 break;
2361 case CRUSH_BUCKET_LIST:
2362 size = sizeof(crush_bucket_list);
2363 break;
2364 case CRUSH_BUCKET_TREE:
2365 size = sizeof(crush_bucket_tree);
2366 break;
2367 case CRUSH_BUCKET_STRAW:
2368 size = sizeof(crush_bucket_straw);
2369 break;
2370 case CRUSH_BUCKET_STRAW2:
2371 size = sizeof(crush_bucket_straw2);
2372 break;
2373 default:
2374 {
2375 char str[128];
2376 snprintf(str, sizeof(str), "unsupported bucket algorithm: %d", alg);
2377 throw buffer::malformed_input(str);
2378 }
2379 }
2380 crush_bucket *bucket = reinterpret_cast<crush_bucket*>(calloc(1, size));
2381 *bptr = bucket;
2382
2383 ::decode(bucket->id, blp);
2384 ::decode(bucket->type, blp);
2385 ::decode(bucket->alg, blp);
2386 ::decode(bucket->hash, blp);
2387 ::decode(bucket->weight, blp);
2388 ::decode(bucket->size, blp);
2389
2390 bucket->items = (__s32*)calloc(1, bucket->size * sizeof(__s32));
2391 for (unsigned j = 0; j < bucket->size; ++j) {
2392 ::decode(bucket->items[j], blp);
2393 }
2394
2395 switch (bucket->alg) {
2396 case CRUSH_BUCKET_UNIFORM:
2397 ::decode((reinterpret_cast<crush_bucket_uniform*>(bucket))->item_weight, blp);
2398 break;
2399
2400 case CRUSH_BUCKET_LIST: {
2401 crush_bucket_list* cbl = reinterpret_cast<crush_bucket_list*>(bucket);
2402 cbl->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2403 cbl->sum_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2404
2405 for (unsigned j = 0; j < bucket->size; ++j) {
2406 ::decode(cbl->item_weights[j], blp);
2407 ::decode(cbl->sum_weights[j], blp);
2408 }
2409 break;
2410 }
2411
2412 case CRUSH_BUCKET_TREE: {
2413 crush_bucket_tree* cbt = reinterpret_cast<crush_bucket_tree*>(bucket);
2414 ::decode(cbt->num_nodes, blp);
2415 cbt->node_weights = (__u32*)calloc(1, cbt->num_nodes * sizeof(__u32));
2416 for (unsigned j=0; j<cbt->num_nodes; j++) {
2417 ::decode(cbt->node_weights[j], blp);
2418 }
2419 break;
2420 }
2421
2422 case CRUSH_BUCKET_STRAW: {
2423 crush_bucket_straw* cbs = reinterpret_cast<crush_bucket_straw*>(bucket);
2424 cbs->straws = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2425 cbs->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2426 for (unsigned j = 0; j < bucket->size; ++j) {
2427 ::decode(cbs->item_weights[j], blp);
2428 ::decode(cbs->straws[j], blp);
2429 }
2430 break;
2431 }
2432
2433 case CRUSH_BUCKET_STRAW2: {
2434 crush_bucket_straw2* cbs = reinterpret_cast<crush_bucket_straw2*>(bucket);
2435 cbs->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
2436 for (unsigned j = 0; j < bucket->size; ++j) {
2437 ::decode(cbs->item_weights[j], blp);
2438 }
2439 break;
2440 }
2441
2442 default:
2443 // We should have handled this case in the first switch statement
2444 ceph_abort();
2445 break;
2446 }
2447 }
2448
2449
2450 void CrushWrapper::dump(Formatter *f) const
2451 {
2452 f->open_array_section("devices");
2453 for (int i=0; i<get_max_devices(); i++) {
2454 f->open_object_section("device");
2455 f->dump_int("id", i);
2456 const char *n = get_item_name(i);
2457 if (n) {
2458 f->dump_string("name", n);
2459 } else {
2460 char name[20];
2461 sprintf(name, "device%d", i);
2462 f->dump_string("name", name);
2463 }
2464 const char *device_class = get_item_class(i);
2465 if (device_class != NULL)
2466 f->dump_string("class", device_class);
2467 f->close_section();
2468 }
2469 f->close_section();
2470
2471 f->open_array_section("types");
2472 int n = get_num_type_names();
2473 for (int i=0; n; i++) {
2474 const char *name = get_type_name(i);
2475 if (!name) {
2476 if (i == 0) {
2477 f->open_object_section("type");
2478 f->dump_int("type_id", 0);
2479 f->dump_string("name", "device");
2480 f->close_section();
2481 }
2482 continue;
2483 }
2484 n--;
2485 f->open_object_section("type");
2486 f->dump_int("type_id", i);
2487 f->dump_string("name", name);
2488 f->close_section();
2489 }
2490 f->close_section();
2491
2492 f->open_array_section("buckets");
2493 for (int bucket = -1; bucket > -1-get_max_buckets(); --bucket) {
2494 if (!bucket_exists(bucket))
2495 continue;
2496 f->open_object_section("bucket");
2497 f->dump_int("id", bucket);
2498 if (get_item_name(bucket))
2499 f->dump_string("name", get_item_name(bucket));
2500 f->dump_int("type_id", get_bucket_type(bucket));
2501 if (get_type_name(get_bucket_type(bucket)))
2502 f->dump_string("type_name", get_type_name(get_bucket_type(bucket)));
2503 f->dump_int("weight", get_bucket_weight(bucket));
2504 f->dump_string("alg", crush_bucket_alg_name(get_bucket_alg(bucket)));
2505 f->dump_string("hash", crush_hash_name(get_bucket_hash(bucket)));
2506 f->open_array_section("items");
2507 for (int j=0; j<get_bucket_size(bucket); j++) {
2508 f->open_object_section("item");
2509 f->dump_int("id", get_bucket_item(bucket, j));
2510 f->dump_int("weight", get_bucket_item_weight(bucket, j));
2511 f->dump_int("pos", j);
2512 f->close_section();
2513 }
2514 f->close_section();
2515 f->close_section();
2516 }
2517 f->close_section();
2518
2519 f->open_array_section("rules");
2520 dump_rules(f);
2521 f->close_section();
2522
2523 f->open_object_section("tunables");
2524 dump_tunables(f);
2525 f->close_section();
2526
2527 dump_choose_args(f);
2528 }
2529
2530 namespace {
2531 // depth first walker
2532 class TreeDumper {
2533 typedef CrushTreeDumper::Item Item;
2534 const CrushWrapper *crush;
2535 const CrushTreeDumper::name_map_t& weight_set_names;
2536 public:
2537 explicit TreeDumper(const CrushWrapper *crush,
2538 const CrushTreeDumper::name_map_t& wsnames)
2539 : crush(crush), weight_set_names(wsnames) {}
2540
2541 void dump(Formatter *f) {
2542 set<int> roots;
2543 crush->find_roots(roots);
2544 for (set<int>::iterator root = roots.begin(); root != roots.end(); ++root) {
2545 dump_item(Item(*root, 0, 0, crush->get_bucket_weightf(*root)), f);
2546 }
2547 }
2548
2549 private:
2550 void dump_item(const Item& qi, Formatter* f) {
2551 if (qi.is_bucket()) {
2552 f->open_object_section("bucket");
2553 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
2554 dump_bucket_children(qi, f);
2555 f->close_section();
2556 } else {
2557 f->open_object_section("device");
2558 CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
2559 f->close_section();
2560 }
2561 }
2562
2563 void dump_bucket_children(const Item& parent, Formatter* f) {
2564 f->open_array_section("items");
2565 const int max_pos = crush->get_bucket_size(parent.id);
2566 for (int pos = 0; pos < max_pos; pos++) {
2567 int id = crush->get_bucket_item(parent.id, pos);
2568 float weight = crush->get_bucket_item_weightf(parent.id, pos);
2569 dump_item(Item(id, parent.id, parent.depth + 1, weight), f);
2570 }
2571 f->close_section();
2572 }
2573 };
2574 }
2575
2576 void CrushWrapper::dump_tree(
2577 Formatter *f,
2578 const CrushTreeDumper::name_map_t& weight_set_names) const
2579 {
2580 assert(f);
2581 TreeDumper(this, weight_set_names).dump(f);
2582 }
2583
2584 void CrushWrapper::dump_tunables(Formatter *f) const
2585 {
2586 f->dump_int("choose_local_tries", get_choose_local_tries());
2587 f->dump_int("choose_local_fallback_tries", get_choose_local_fallback_tries());
2588 f->dump_int("choose_total_tries", get_choose_total_tries());
2589 f->dump_int("chooseleaf_descend_once", get_chooseleaf_descend_once());
2590 f->dump_int("chooseleaf_vary_r", get_chooseleaf_vary_r());
2591 f->dump_int("chooseleaf_stable", get_chooseleaf_stable());
2592 f->dump_int("straw_calc_version", get_straw_calc_version());
2593 f->dump_int("allowed_bucket_algs", get_allowed_bucket_algs());
2594
2595 // be helpful about it
2596 if (has_jewel_tunables())
2597 f->dump_string("profile", "jewel");
2598 else if (has_hammer_tunables())
2599 f->dump_string("profile", "hammer");
2600 else if (has_firefly_tunables())
2601 f->dump_string("profile", "firefly");
2602 else if (has_bobtail_tunables())
2603 f->dump_string("profile", "bobtail");
2604 else if (has_argonaut_tunables())
2605 f->dump_string("profile", "argonaut");
2606 else
2607 f->dump_string("profile", "unknown");
2608 f->dump_int("optimal_tunables", (int)has_optimal_tunables());
2609 f->dump_int("legacy_tunables", (int)has_legacy_tunables());
2610
2611 // be helpful about minimum version required
2612 f->dump_string("minimum_required_version", get_min_required_version());
2613
2614 f->dump_int("require_feature_tunables", (int)has_nondefault_tunables());
2615 f->dump_int("require_feature_tunables2", (int)has_nondefault_tunables2());
2616 f->dump_int("has_v2_rules", (int)has_v2_rules());
2617 f->dump_int("require_feature_tunables3", (int)has_nondefault_tunables3());
2618 f->dump_int("has_v3_rules", (int)has_v3_rules());
2619 f->dump_int("has_v4_buckets", (int)has_v4_buckets());
2620 f->dump_int("require_feature_tunables5", (int)has_nondefault_tunables5());
2621 f->dump_int("has_v5_rules", (int)has_v5_rules());
2622 }
2623
2624 void CrushWrapper::dump_choose_args(Formatter *f) const
2625 {
2626 f->open_object_section("choose_args");
2627 for (auto c : choose_args) {
2628 crush_choose_arg_map arg_map = c.second;
2629 f->open_array_section(stringify(c.first).c_str());
2630 for (__u32 i = 0; i < arg_map.size; i++) {
2631 crush_choose_arg *arg = &arg_map.args[i];
2632 if (arg->weight_set_size == 0 &&
2633 arg->ids_size == 0)
2634 continue;
2635 f->open_object_section("choose_args");
2636 int bucket_index = i;
2637 f->dump_int("bucket_id", -1-bucket_index);
2638 if (arg->weight_set_size > 0) {
2639 f->open_array_section("weight_set");
2640 for (__u32 j = 0; j < arg->weight_set_size; j++) {
2641 f->open_array_section("weights");
2642 __u32 *weights = arg->weight_set[j].weights;
2643 __u32 size = arg->weight_set[j].size;
2644 for (__u32 k = 0; k < size; k++) {
2645 f->dump_float("weight", (float)weights[k]/(float)0x10000);
2646 }
2647 f->close_section();
2648 }
2649 f->close_section();
2650 }
2651 if (arg->ids_size > 0) {
2652 f->open_array_section("ids");
2653 for (__u32 j = 0; j < arg->ids_size; j++)
2654 f->dump_int("id", arg->ids[j]);
2655 f->close_section();
2656 }
2657 f->close_section();
2658 }
2659 f->close_section();
2660 }
2661 f->close_section();
2662 }
2663
2664 void CrushWrapper::dump_rules(Formatter *f) const
2665 {
2666 for (int i=0; i<get_max_rules(); i++) {
2667 if (!rule_exists(i))
2668 continue;
2669 dump_rule(i, f);
2670 }
2671 }
2672
2673 void CrushWrapper::dump_rule(int ruleset, Formatter *f) const
2674 {
2675 f->open_object_section("rule");
2676 f->dump_int("rule_id", ruleset);
2677 if (get_rule_name(ruleset))
2678 f->dump_string("rule_name", get_rule_name(ruleset));
2679 f->dump_int("ruleset", get_rule_mask_ruleset(ruleset));
2680 f->dump_int("type", get_rule_mask_type(ruleset));
2681 f->dump_int("min_size", get_rule_mask_min_size(ruleset));
2682 f->dump_int("max_size", get_rule_mask_max_size(ruleset));
2683 f->open_array_section("steps");
2684 for (int j=0; j<get_rule_len(ruleset); j++) {
2685 f->open_object_section("step");
2686 switch (get_rule_op(ruleset, j)) {
2687 case CRUSH_RULE_NOOP:
2688 f->dump_string("op", "noop");
2689 break;
2690 case CRUSH_RULE_TAKE:
2691 f->dump_string("op", "take");
2692 {
2693 int item = get_rule_arg1(ruleset, j);
2694 f->dump_int("item", item);
2695
2696 const char *name = get_item_name(item);
2697 f->dump_string("item_name", name ? name : "");
2698 }
2699 break;
2700 case CRUSH_RULE_EMIT:
2701 f->dump_string("op", "emit");
2702 break;
2703 case CRUSH_RULE_CHOOSE_FIRSTN:
2704 f->dump_string("op", "choose_firstn");
2705 f->dump_int("num", get_rule_arg1(ruleset, j));
2706 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
2707 break;
2708 case CRUSH_RULE_CHOOSE_INDEP:
2709 f->dump_string("op", "choose_indep");
2710 f->dump_int("num", get_rule_arg1(ruleset, j));
2711 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
2712 break;
2713 case CRUSH_RULE_CHOOSELEAF_FIRSTN:
2714 f->dump_string("op", "chooseleaf_firstn");
2715 f->dump_int("num", get_rule_arg1(ruleset, j));
2716 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
2717 break;
2718 case CRUSH_RULE_CHOOSELEAF_INDEP:
2719 f->dump_string("op", "chooseleaf_indep");
2720 f->dump_int("num", get_rule_arg1(ruleset, j));
2721 f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j)));
2722 break;
2723 case CRUSH_RULE_SET_CHOOSE_TRIES:
2724 f->dump_string("op", "set_choose_tries");
2725 f->dump_int("num", get_rule_arg1(ruleset, j));
2726 break;
2727 case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
2728 f->dump_string("op", "set_chooseleaf_tries");
2729 f->dump_int("num", get_rule_arg1(ruleset, j));
2730 break;
2731 default:
2732 f->dump_int("opcode", get_rule_op(ruleset, j));
2733 f->dump_int("arg1", get_rule_arg1(ruleset, j));
2734 f->dump_int("arg2", get_rule_arg2(ruleset, j));
2735 }
2736 f->close_section();
2737 }
2738 f->close_section();
2739 f->close_section();
2740 }
2741
2742 void CrushWrapper::list_rules(Formatter *f) const
2743 {
2744 for (int rule = 0; rule < get_max_rules(); rule++) {
2745 if (!rule_exists(rule))
2746 continue;
2747 f->dump_string("name", get_rule_name(rule));
2748 }
2749 }
2750
2751 void CrushWrapper::list_rules(ostream *ss) const
2752 {
2753 for (int rule = 0; rule < get_max_rules(); rule++) {
2754 if (!rule_exists(rule))
2755 continue;
2756 *ss << get_rule_name(rule) << "\n";
2757 }
2758 }
2759
2760 class CrushTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
2761 public:
2762 typedef CrushTreeDumper::Dumper<TextTable> Parent;
2763
2764 explicit CrushTreePlainDumper(const CrushWrapper *crush,
2765 const CrushTreeDumper::name_map_t& wsnames)
2766 : Parent(crush, wsnames) {}
2767 explicit CrushTreePlainDumper(const CrushWrapper *crush,
2768 const CrushTreeDumper::name_map_t& wsnames,
2769 bool show_shadow)
2770 : Parent(crush, wsnames, show_shadow) {}
2771
2772
2773 void dump(TextTable *tbl) {
2774 tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
2775 tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
2776 tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
2777 for (auto& p : crush->choose_args) {
2778 if (p.first == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
2779 tbl->define_column("(compat)", TextTable::LEFT, TextTable::RIGHT);
2780 } else {
2781 string name;
2782 auto q = weight_set_names.find(p.first);
2783 name = q != weight_set_names.end() ? q->second :
2784 stringify(p.first);
2785 tbl->define_column(name.c_str(), TextTable::LEFT, TextTable::RIGHT);
2786 }
2787 }
2788 tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
2789 Parent::dump(tbl);
2790 }
2791
2792 protected:
2793 void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
2794 const char *c = crush->get_item_class(qi.id);
2795 if (!c)
2796 c = "";
2797 *tbl << qi.id
2798 << c
2799 << weightf_t(qi.weight);
2800 for (auto& p : crush->choose_args) {
2801 if (qi.parent < 0) {
2802 const crush_choose_arg_map cmap = crush->choose_args_get(p.first);
2803 int bidx = -1 - qi.parent;
2804 const crush_bucket *b = crush->get_bucket(qi.parent);
2805 if (b &&
2806 bidx < (int)cmap.size &&
2807 cmap.args[bidx].weight_set &&
2808 cmap.args[bidx].weight_set_size >= 1) {
2809 int pos;
2810 for (pos = 0;
2811 pos < (int)cmap.args[bidx].weight_set[0].size &&
2812 b->items[pos] != qi.id;
2813 ++pos) ;
2814 *tbl << weightf_t((float)cmap.args[bidx].weight_set[0].weights[pos] /
2815 (float)0x10000);
2816 continue;
2817 }
2818 }
2819 *tbl << "";
2820 }
2821 ostringstream ss;
2822 for (int k=0; k < qi.depth; k++) {
2823 ss << " ";
2824 }
2825 if (qi.is_bucket()) {
2826 ss << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
2827 << crush->get_item_name(qi.id);
2828 } else {
2829 ss << "osd." << qi.id;
2830 }
2831 *tbl << ss.str();
2832 *tbl << TextTable::endrow;
2833 }
2834 };
2835
2836
2837 class CrushTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
2838 public:
2839 typedef CrushTreeDumper::FormattingDumper Parent;
2840
2841 explicit CrushTreeFormattingDumper(
2842 const CrushWrapper *crush,
2843 const CrushTreeDumper::name_map_t& wsnames)
2844 : Parent(crush, wsnames) {}
2845
2846 explicit CrushTreeFormattingDumper(
2847 const CrushWrapper *crush,
2848 const CrushTreeDumper::name_map_t& wsnames,
2849 bool show_shadow)
2850 : Parent(crush, wsnames, show_shadow) {}
2851
2852 void dump(Formatter *f) {
2853 f->open_array_section("nodes");
2854 Parent::dump(f);
2855 f->close_section();
2856 f->open_array_section("stray");
2857 f->close_section();
2858 }
2859 };
2860
2861
2862 void CrushWrapper::dump_tree(
2863 ostream *out,
2864 Formatter *f,
2865 const CrushTreeDumper::name_map_t& weight_set_names,
2866 bool show_shadow) const
2867 {
2868 if (out) {
2869 TextTable tbl;
2870 CrushTreePlainDumper(this, weight_set_names, show_shadow).dump(&tbl);
2871 *out << tbl;
2872 }
2873 if (f) {
2874 CrushTreeFormattingDumper(this, weight_set_names, show_shadow).dump(f);
2875 }
2876 }
2877
2878 void CrushWrapper::generate_test_instances(list<CrushWrapper*>& o)
2879 {
2880 o.push_back(new CrushWrapper);
2881 // fixme
2882 }
2883
2884 /**
2885 * Determine the default CRUSH ruleset ID to be used with
2886 * newly created replicated pools.
2887 *
2888 * @returns a ruleset ID (>=0) or -1 if no suitable ruleset found
2889 */
2890 int CrushWrapper::get_osd_pool_default_crush_replicated_ruleset(CephContext *cct)
2891 {
2892 int crush_ruleset = cct->_conf->osd_pool_default_crush_rule;
2893 if (crush_ruleset < 0) {
2894 crush_ruleset = find_first_ruleset(pg_pool_t::TYPE_REPLICATED);
2895 } else if (!ruleset_exists(crush_ruleset)) {
2896 crush_ruleset = -1; // match find_first_ruleset() retval
2897 }
2898 return crush_ruleset;
2899 }
2900
2901 bool CrushWrapper::is_valid_crush_name(const string& s)
2902 {
2903 if (s.empty())
2904 return false;
2905 for (string::const_iterator p = s.begin(); p != s.end(); ++p) {
2906 if (!(*p == '-') &&
2907 !(*p == '_') &&
2908 !(*p == '.') &&
2909 !(*p >= '0' && *p <= '9') &&
2910 !(*p >= 'A' && *p <= 'Z') &&
2911 !(*p >= 'a' && *p <= 'z'))
2912 return false;
2913 }
2914 return true;
2915 }
2916
2917 bool CrushWrapper::is_valid_crush_loc(CephContext *cct,
2918 const map<string,string>& loc)
2919 {
2920 for (map<string,string>::const_iterator l = loc.begin(); l != loc.end(); ++l) {
2921 if (!is_valid_crush_name(l->first) ||
2922 !is_valid_crush_name(l->second)) {
2923 ldout(cct, 1) << "loc["
2924 << l->first << "] = '"
2925 << l->second << "' not a valid crush name ([A-Za-z0-9_-.]+)"
2926 << dendl;
2927 return false;
2928 }
2929 }
2930 return true;
2931 }
2932
2933 int CrushWrapper::_choose_type_stack(
2934 CephContext *cct,
2935 const vector<pair<int,int>>& stack,
2936 const set<int>& overfull,
2937 const vector<int>& underfull,
2938 const vector<int>& orig,
2939 vector<int>::const_iterator& i,
2940 set<int>& used,
2941 vector<int> *pw) const
2942 {
2943 vector<int> w = *pw;
2944 vector<int> o;
2945
2946 ldout(cct, 10) << __func__ << " stack " << stack
2947 << " orig " << orig
2948 << " at " << *i
2949 << " pw " << *pw
2950 << dendl;
2951
2952 vector<int> cumulative_fanout(stack.size());
2953 int f = 1;
2954 for (int j = (int)stack.size() - 1; j >= 0; --j) {
2955 cumulative_fanout[j] = f;
2956 f *= stack[j].second;
2957 }
2958 ldout(cct, 10) << __func__ << " cumulative_fanout " << cumulative_fanout
2959 << dendl;
2960
2961 // identify underful targets for each intermediate level.
2962 // this serves two purposes:
2963 // 1. we can tell when we are selecting a bucket that does not have any underfull
2964 // devices beneath it. that means that if the current input includes an overfull
2965 // device, we won't be able to find an underfull device with this parent to
2966 // swap for it.
2967 // 2. when we decide we should reject a bucket due to the above, this list gives us
2968 // a list of peers to consider that *do* have underfull devices available.. (we
2969 // are careful to pick one that has the same parent.)
2970 vector<set<int>> underfull_buckets; // level -> set of buckets with >0 underfull item(s)
2971 underfull_buckets.resize(stack.size() - 1);
2972 for (auto osd : underfull) {
2973 int item = osd;
2974 for (int j = (int)stack.size() - 2; j >= 0; --j) {
2975 int type = stack[j].first;
2976 item = get_parent_of_type(item, type);
2977 ldout(cct, 10) << __func__ << " underfull " << osd << " type " << type
2978 << " is " << item << dendl;
2979 underfull_buckets[j].insert(item);
2980 }
2981 }
2982 ldout(cct, 20) << __func__ << " underfull_buckets " << underfull_buckets << dendl;
2983
2984 for (unsigned j = 0; j < stack.size(); ++j) {
2985 int type = stack[j].first;
2986 int fanout = stack[j].second;
2987 int cum_fanout = cumulative_fanout[j];
2988 ldout(cct, 10) << " level " << j << ": type " << type << " fanout " << fanout
2989 << " cumulative " << cum_fanout
2990 << " w " << w << dendl;
2991 vector<int> o;
2992 auto tmpi = i;
2993 for (auto from : w) {
2994 ldout(cct, 10) << " from " << from << dendl;
2995 // identify leaves under each choice. we use this to check whether any of these
2996 // leaves are overfull. (if so, we need to make sure there are underfull candidates
2997 // to swap for them.)
2998 vector<set<int>> leaves;
2999 leaves.resize(fanout);
3000 for (int pos = 0; pos < fanout; ++pos) {
3001 if (type > 0) {
3002 // non-leaf
3003 int item = get_parent_of_type(*tmpi, type);
3004 o.push_back(item);
3005 int n = cum_fanout;
3006 while (n-- && tmpi != orig.end()) {
3007 leaves[pos].insert(*tmpi++);
3008 }
3009 ldout(cct, 10) << __func__ << " from " << *tmpi << " got " << item
3010 << " of type " << type << " over leaves " << leaves[pos] << dendl;
3011 } else {
3012 // leaf
3013 bool replaced = false;
3014 if (overfull.count(*i)) {
3015 for (auto item : underfull) {
3016 ldout(cct, 10) << __func__ << " pos " << pos
3017 << " was " << *i << " considering " << item
3018 << dendl;
3019 if (used.count(item)) {
3020 ldout(cct, 20) << __func__ << " in used " << used << dendl;
3021 continue;
3022 }
3023 if (!subtree_contains(from, item)) {
3024 ldout(cct, 20) << __func__ << " not in subtree " << from << dendl;
3025 continue;
3026 }
3027 if (std::find(orig.begin(), orig.end(), item) != orig.end()) {
3028 ldout(cct, 20) << __func__ << " in orig " << orig << dendl;
3029 continue;
3030 }
3031 o.push_back(item);
3032 used.insert(item);
3033 ldout(cct, 10) << __func__ << " pos " << pos << " replace "
3034 << *i << " -> " << item << dendl;
3035 replaced = true;
3036 ++i;
3037 break;
3038 }
3039 }
3040 if (!replaced) {
3041 ldout(cct, 10) << __func__ << " pos " << pos << " keep " << *i
3042 << dendl;
3043 o.push_back(*i);
3044 ++i;
3045 }
3046 if (i == orig.end()) {
3047 ldout(cct, 10) << __func__ << " end of orig, break 1" << dendl;
3048 break;
3049 }
3050 }
3051 }
3052 if (j + 1 < stack.size()) {
3053 // check if any buckets have overfull leaves but no underfull candidates
3054 for (int pos = 0; pos < fanout; ++pos) {
3055 if (underfull_buckets[j].count(o[pos]) == 0) {
3056 // are any leaves overfull?
3057 bool any_overfull = false;
3058 for (auto osd : leaves[pos]) {
3059 if (overfull.count(osd)) {
3060 any_overfull = true;
3061 }
3062 }
3063 if (any_overfull) {
3064 ldout(cct, 10) << " bucket " << o[pos] << " has no underfull targets and "
3065 << ">0 leaves " << leaves[pos] << " is overfull; alts "
3066 << underfull_buckets[j]
3067 << dendl;
3068 for (auto alt : underfull_buckets[j]) {
3069 if (std::find(o.begin(), o.end(), alt) == o.end()) {
3070 // see if alt has the same parent
3071 if (j == 0 ||
3072 get_parent_of_type(o[pos], stack[j-1].first) ==
3073 get_parent_of_type(alt, stack[j-1].first)) {
3074 if (j)
3075 ldout(cct, 10) << " replacing " << o[pos]
3076 << " (which has no underfull leaves) with " << alt
3077 << " (same parent "
3078 << get_parent_of_type(alt, stack[j-1].first) << " type "
3079 << type << ")" << dendl;
3080 else
3081 ldout(cct, 10) << " replacing " << o[pos]
3082 << " (which has no underfull leaves) with " << alt
3083 << " (first level)" << dendl;
3084 o[pos] = alt;
3085 break;
3086 } else {
3087 ldout(cct, 30) << " alt " << alt << " for " << o[pos]
3088 << " has different parent, skipping" << dendl;
3089 }
3090 }
3091 }
3092 }
3093 }
3094 }
3095 }
3096 if (i == orig.end()) {
3097 ldout(cct, 10) << __func__ << " end of orig, break 2" << dendl;
3098 break;
3099 }
3100 }
3101 ldout(cct, 10) << __func__ << " w <- " << o << " was " << w << dendl;
3102 w.swap(o);
3103 }
3104 *pw = w;
3105 return 0;
3106 }
3107
3108 int CrushWrapper::try_remap_rule(
3109 CephContext *cct,
3110 int ruleno,
3111 int maxout,
3112 const set<int>& overfull,
3113 const vector<int>& underfull,
3114 const vector<int>& orig,
3115 vector<int> *out) const
3116 {
3117 const crush_map *map = crush;
3118 const crush_rule *rule = get_rule(ruleno);
3119 assert(rule);
3120
3121 ldout(cct, 10) << __func__ << " ruleno " << ruleno
3122 << " numrep " << maxout << " overfull " << overfull
3123 << " underfull " << underfull << " orig " << orig
3124 << dendl;
3125 vector<int> w; // working set
3126 out->clear();
3127
3128 auto i = orig.begin();
3129 set<int> used;
3130
3131 vector<pair<int,int>> type_stack; // (type, fan-out)
3132
3133 for (unsigned step = 0; step < rule->len; ++step) {
3134 const crush_rule_step *curstep = &rule->steps[step];
3135 ldout(cct, 10) << __func__ << " step " << step << " w " << w << dendl;
3136 switch (curstep->op) {
3137 case CRUSH_RULE_TAKE:
3138 if ((curstep->arg1 >= 0 && curstep->arg1 < map->max_devices) ||
3139 (-1-curstep->arg1 >= 0 && -1-curstep->arg1 < map->max_buckets &&
3140 map->buckets[-1-curstep->arg1])) {
3141 w.clear();
3142 w.push_back(curstep->arg1);
3143 ldout(cct, 10) << __func__ << " take " << w << dendl;
3144 } else {
3145 ldout(cct, 1) << " bad take value " << curstep->arg1 << dendl;
3146 }
3147 break;
3148
3149 case CRUSH_RULE_CHOOSELEAF_FIRSTN:
3150 case CRUSH_RULE_CHOOSELEAF_INDEP:
3151 {
3152 int numrep = curstep->arg1;
3153 int type = curstep->arg2;
3154 if (numrep <= 0)
3155 numrep += maxout;
3156 type_stack.push_back(make_pair(type, numrep));
3157 type_stack.push_back(make_pair(0, 1));
3158 int r = _choose_type_stack(cct, type_stack, overfull, underfull, orig,
3159 i, used, &w);
3160 if (r < 0)
3161 return r;
3162 type_stack.clear();
3163 }
3164 break;
3165
3166 case CRUSH_RULE_CHOOSE_FIRSTN:
3167 case CRUSH_RULE_CHOOSE_INDEP:
3168 {
3169 int numrep = curstep->arg1;
3170 int type = curstep->arg2;
3171 if (numrep <= 0)
3172 numrep += maxout;
3173 type_stack.push_back(make_pair(type, numrep));
3174 }
3175 break;
3176
3177 case CRUSH_RULE_EMIT:
3178 ldout(cct, 10) << " emit " << w << dendl;
3179 if (!type_stack.empty()) {
3180 int r = _choose_type_stack(cct, type_stack, overfull, underfull, orig,
3181 i, used, &w);
3182 if (r < 0)
3183 return r;
3184 type_stack.clear();
3185 }
3186 for (auto item : w) {
3187 out->push_back(item);
3188 }
3189 w.clear();
3190 break;
3191
3192 default:
3193 // ignore
3194 break;
3195 }
3196 }
3197
3198 return 0;
3199 }
3200
3201
3202 int CrushWrapper::_choose_args_adjust_item_weight_in_bucket(
3203 CephContext *cct,
3204 crush_choose_arg_map cmap,
3205 int bucketid,
3206 int id,
3207 const vector<int>& weight,
3208 ostream *ss)
3209 {
3210 int changed = 0;
3211 int bidx = -1 - bucketid;
3212 crush_bucket *b = crush->buckets[bidx];
3213 if (bidx >= (int)cmap.size) {
3214 if (ss)
3215 *ss << "no weight-set for bucket " << b->id;
3216 ldout(cct, 10) << __func__ << " no crush_choose_arg for bucket " << b->id
3217 << dendl;
3218 return 0;
3219 }
3220 crush_choose_arg *carg = &cmap.args[bidx];
3221 if (carg->weight_set == NULL) {
3222 if (ss)
3223 *ss << "no weight-set for bucket " << b->id;
3224 ldout(cct, 10) << __func__ << " no weight_set for bucket " << b->id
3225 << dendl;
3226 return 0;
3227 }
3228 if (carg->weight_set_size != weight.size()) {
3229 if (ss)
3230 *ss << "weight_set_size != " << weight.size() << " for bucket " << b->id;
3231 ldout(cct, 10) << __func__ << " weight_set_size != " << weight.size()
3232 << " for bucket " << b->id << dendl;
3233 return 0;
3234 }
3235 for (unsigned i = 0; i < b->size; i++) {
3236 if (b->items[i] == id) {
3237 for (unsigned j = 0; j < weight.size(); ++j) {
3238 carg->weight_set[j].weights[i] = weight[j];
3239 }
3240 ldout(cct, 5) << __func__ << " set " << id << " to " << weight
3241 << " in bucket " << b->id << dendl;
3242 changed++;
3243 }
3244 }
3245 if (changed) {
3246 vector<int> bucket_weight(weight.size(), 0);
3247 for (unsigned i = 0; i < b->size; i++) {
3248 for (unsigned j = 0; j < weight.size(); ++j) {
3249 bucket_weight[j] += carg->weight_set[j].weights[i];
3250 }
3251 }
3252 choose_args_adjust_item_weight(cct, cmap, b->id, bucket_weight, nullptr);
3253 }
3254 return changed;
3255 }
3256
3257 int CrushWrapper::choose_args_adjust_item_weight(
3258 CephContext *cct,
3259 crush_choose_arg_map cmap,
3260 int id,
3261 const vector<int>& weight,
3262 ostream *ss)
3263 {
3264 ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl;
3265 int changed = 0;
3266 for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
3267 crush_bucket *b = crush->buckets[bidx];
3268 if (b == nullptr) {
3269 continue;
3270 }
3271 changed += _choose_args_adjust_item_weight_in_bucket(
3272 cct, cmap, b->id, id, weight, ss);
3273 }
3274 if (!changed) {
3275 if (ss)
3276 *ss << "item " << id << " not found in crush map";
3277 return -ENOENT;
3278 }
3279 return changed;
3280 }