1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "osd/osd_types.h"
5 #include "common/debug.h"
6 #include "common/Formatter.h"
7 #include "common/errno.h"
8 #include "common/TextTable.h"
9 #include "include/stringify.h"
11 #include "CrushWrapper.h"
12 #include "CrushTreeDumper.h"
14 #define dout_subsys ceph_subsys_crush
16 bool CrushWrapper::has_legacy_rulesets() const
18 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
19 crush_rule
*r
= crush
->rules
[i
];
21 r
->mask
.ruleset
!= i
) {
28 int CrushWrapper::renumber_rules_by_ruleset()
31 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
32 crush_rule
*r
= crush
->rules
[i
];
33 if (r
&& r
->mask
.ruleset
>= max_ruleset
) {
34 max_ruleset
= r
->mask
.ruleset
+ 1;
37 struct crush_rule
**newrules
=
38 (crush_rule
**)calloc(1, max_ruleset
* sizeof(crush_rule
*));
39 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
40 crush_rule
*r
= crush
->rules
[i
];
43 if (newrules
[r
->mask
.ruleset
]) {
44 // collision, we can't do it.
48 newrules
[r
->mask
.ruleset
] = r
;
53 crush
->rules
= newrules
;
54 crush
->max_rules
= max_ruleset
;
58 bool CrushWrapper::has_multirule_rulesets() const
60 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
61 crush_rule
*r
= crush
->rules
[i
];
64 for (unsigned j
=i
+1; j
<crush
->max_rules
; j
++) {
65 crush_rule
*s
= crush
->rules
[j
];
68 if (r
->mask
.ruleset
== s
->mask
.ruleset
)
75 bool CrushWrapper::has_non_straw2_buckets() const
77 for (int i
=0; i
<crush
->max_buckets
; ++i
) {
78 crush_bucket
*b
= crush
->buckets
[i
];
81 if (b
->alg
!= CRUSH_BUCKET_STRAW2
)
87 bool CrushWrapper::has_v2_rules() const
89 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
97 bool CrushWrapper::is_v2_rule(unsigned ruleid
) const
99 // check rule for use of indep or new SET_* rule steps
100 if (ruleid
>= crush
->max_rules
)
102 crush_rule
*r
= crush
->rules
[ruleid
];
105 for (unsigned j
=0; j
<r
->len
; j
++) {
106 if (r
->steps
[j
].op
== CRUSH_RULE_CHOOSE_INDEP
||
107 r
->steps
[j
].op
== CRUSH_RULE_CHOOSELEAF_INDEP
||
108 r
->steps
[j
].op
== CRUSH_RULE_SET_CHOOSE_TRIES
||
109 r
->steps
[j
].op
== CRUSH_RULE_SET_CHOOSELEAF_TRIES
) {
116 bool CrushWrapper::has_v3_rules() const
118 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
126 bool CrushWrapper::is_v3_rule(unsigned ruleid
) const
128 // check rule for use of SET_CHOOSELEAF_VARY_R step
129 if (ruleid
>= crush
->max_rules
)
131 crush_rule
*r
= crush
->rules
[ruleid
];
134 for (unsigned j
=0; j
<r
->len
; j
++) {
135 if (r
->steps
[j
].op
== CRUSH_RULE_SET_CHOOSELEAF_VARY_R
) {
142 bool CrushWrapper::has_v4_buckets() const
144 for (int i
=0; i
<crush
->max_buckets
; ++i
) {
145 crush_bucket
*b
= crush
->buckets
[i
];
148 if (b
->alg
== CRUSH_BUCKET_STRAW2
)
154 bool CrushWrapper::has_v5_rules() const
156 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
164 bool CrushWrapper::is_v5_rule(unsigned ruleid
) const
166 // check rule for use of SET_CHOOSELEAF_STABLE step
167 if (ruleid
>= crush
->max_rules
)
169 crush_rule
*r
= crush
->rules
[ruleid
];
172 for (unsigned j
=0; j
<r
->len
; j
++) {
173 if (r
->steps
[j
].op
== CRUSH_RULE_SET_CHOOSELEAF_STABLE
) {
180 bool CrushWrapper::has_choose_args() const
182 return !choose_args
.empty();
185 bool CrushWrapper::has_incompat_choose_args() const
187 if (choose_args
.empty())
189 if (choose_args
.size() > 1)
191 if (choose_args
.begin()->first
!= DEFAULT_CHOOSE_ARGS
)
193 crush_choose_arg_map arg_map
= choose_args
.begin()->second
;
194 for (__u32 i
= 0; i
< arg_map
.size
; i
++) {
195 crush_choose_arg
*arg
= &arg_map
.args
[i
];
196 if (arg
->weight_set_size
== 0 &&
199 if (arg
->weight_set_size
!= 1)
201 if (arg
->ids_size
!= 0)
207 int CrushWrapper::split_id_class(int i
, int *idout
, int *classout
) const
211 string name
= get_item_name(i
);
212 size_t pos
= name
.find("~");
213 if (pos
== string::npos
) {
218 string name_no_class
= name
.substr(0, pos
);
219 if (!name_exists(name_no_class
))
221 string class_name
= name
.substr(pos
+ 1);
222 if (!class_exists(class_name
))
224 *idout
= get_item_id(name_no_class
);
225 *classout
= get_class_id(class_name
);
229 int CrushWrapper::can_rename_item(const string
& srcname
,
230 const string
& dstname
,
233 if (name_exists(srcname
)) {
234 if (name_exists(dstname
)) {
235 *ss
<< "dstname = '" << dstname
<< "' already exists";
238 if (is_valid_crush_name(dstname
)) {
241 *ss
<< "dstname = '" << dstname
<< "' does not match [-_.0-9a-zA-Z]+";
245 if (name_exists(dstname
)) {
246 *ss
<< "srcname = '" << srcname
<< "' does not exist "
247 << "and dstname = '" << dstname
<< "' already exists";
250 *ss
<< "srcname = '" << srcname
<< "' does not exist";
256 int CrushWrapper::rename_item(const string
& srcname
,
257 const string
& dstname
,
260 int ret
= can_rename_item(srcname
, dstname
, ss
);
263 int oldid
= get_item_id(srcname
);
264 return set_item_name(oldid
, dstname
);
267 int CrushWrapper::can_rename_bucket(const string
& srcname
,
268 const string
& dstname
,
271 int ret
= can_rename_item(srcname
, dstname
, ss
);
274 int srcid
= get_item_id(srcname
);
276 *ss
<< "srcname = '" << srcname
<< "' is not a bucket "
277 << "because its id = " << srcid
<< " is >= 0";
283 int CrushWrapper::rename_bucket(const string
& srcname
,
284 const string
& dstname
,
287 int ret
= can_rename_bucket(srcname
, dstname
, ss
);
290 int oldid
= get_item_id(srcname
);
291 return set_item_name(oldid
, dstname
);
294 void CrushWrapper::find_takes(set
<int>& roots
) const
296 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
297 crush_rule
*r
= crush
->rules
[i
];
300 for (unsigned j
=0; j
<r
->len
; j
++) {
301 if (r
->steps
[j
].op
== CRUSH_RULE_TAKE
)
302 roots
.insert(r
->steps
[j
].arg1
);
307 void CrushWrapper::find_roots(set
<int>& roots
) const
309 for (int i
= 0; i
< crush
->max_buckets
; i
++) {
310 if (!crush
->buckets
[i
])
312 crush_bucket
*b
= crush
->buckets
[i
];
313 if (!_search_item_exists(b
->id
))
318 bool CrushWrapper::subtree_contains(int root
, int item
) const
324 return false; // root is a leaf
326 const crush_bucket
*b
= get_bucket(root
);
330 for (unsigned j
=0; j
<b
->size
; j
++) {
331 if (subtree_contains(b
->items
[j
], item
))
337 bool CrushWrapper::_maybe_remove_last_instance(CephContext
*cct
, int item
, bool unlink_only
)
340 if (_search_item_exists(item
)) {
343 if (item
< 0 && _bucket_is_in_use(item
)) {
347 if (item
< 0 && !unlink_only
) {
348 crush_bucket
*t
= get_bucket(item
);
349 ldout(cct
, 5) << "_maybe_remove_last_instance removing bucket " << item
<< dendl
;
350 crush_remove_bucket(crush
, t
);
351 if (class_bucket
.count(item
) != 0)
352 class_bucket
.erase(item
);
353 class_remove_item(item
);
355 if ((item
>= 0 || !unlink_only
) && name_map
.count(item
)) {
356 ldout(cct
, 5) << "_maybe_remove_last_instance removing name for item " << item
<< dendl
;
357 name_map
.erase(item
);
359 if (item
>= 0 && !unlink_only
) {
360 class_remove_item(item
);
363 rebuild_roots_with_classes();
367 int CrushWrapper::remove_root(int item
, bool unused
)
369 if (unused
&& _bucket_is_in_use(item
))
372 crush_bucket
*b
= get_bucket(item
);
374 // should be idempotent
375 // e.g.: we use 'crush link' to link same host into
376 // different roots, which as a result can cause different
377 // shadow trees reference same hosts too. This means
378 // we may need to destory the same buckets(hosts, racks, etc.)
379 // multiple times during rebuilding all shadow trees.
383 for (unsigned n
= 0; n
< b
->size
; n
++) {
384 if (b
->items
[n
] >= 0)
386 int r
= remove_root(b
->items
[n
], unused
);
391 crush_remove_bucket(crush
, b
);
392 if (name_map
.count(item
) != 0) {
393 name_map
.erase(item
);
396 if (class_bucket
.count(item
) != 0)
397 class_bucket
.erase(item
);
398 class_remove_item(item
);
402 int CrushWrapper::remove_item(CephContext
*cct
, int item
, bool unlink_only
)
404 ldout(cct
, 5) << "remove_item " << item
405 << (unlink_only
? " unlink_only":"") << dendl
;
409 if (item
< 0 && !unlink_only
) {
410 crush_bucket
*t
= get_bucket(item
);
412 ldout(cct
, 1) << "remove_item bucket " << item
<< " does not exist"
418 ldout(cct
, 1) << "remove_item bucket " << item
<< " has " << t
->size
419 << " items, not empty" << dendl
;
422 if (_bucket_is_in_use(item
)) {
427 for (int i
= 0; i
< crush
->max_buckets
; i
++) {
428 if (!crush
->buckets
[i
])
430 crush_bucket
*b
= crush
->buckets
[i
];
432 for (unsigned i
=0; i
<b
->size
; ++i
) {
433 int id
= b
->items
[i
];
435 ldout(cct
, 5) << "remove_item removing item " << item
436 << " from bucket " << b
->id
<< dendl
;
437 for (auto& p
: choose_args
) {
438 // weight down each weight-set to 0 before we remove the item
439 vector
<int> weightv(get_choose_args_positions(p
.second
), 0);
440 choose_args_adjust_item_weight(cct
, p
.second
, item
, weightv
, nullptr);
442 bucket_remove_item(b
, item
);
443 adjust_item_weight(cct
, b
->id
, b
->weight
);
449 if (_maybe_remove_last_instance(cct
, item
, unlink_only
))
455 bool CrushWrapper::_search_item_exists(int item
) const
457 for (int i
= 0; i
< crush
->max_buckets
; i
++) {
458 if (!crush
->buckets
[i
])
460 crush_bucket
*b
= crush
->buckets
[i
];
461 for (unsigned j
=0; j
<b
->size
; ++j
) {
462 if (b
->items
[j
] == item
)
469 bool CrushWrapper::_bucket_is_in_use(int item
)
471 for (auto &i
: class_bucket
)
472 for (auto &j
: i
.second
)
473 if (j
.second
== item
)
475 for (unsigned i
= 0; i
< crush
->max_rules
; ++i
) {
476 crush_rule
*r
= crush
->rules
[i
];
479 for (unsigned j
= 0; j
< r
->len
; ++j
) {
480 if (r
->steps
[j
].op
== CRUSH_RULE_TAKE
) {
481 int step_item
= r
->steps
[j
].arg1
;
484 int res
= split_id_class(step_item
, &original_item
, &c
);
487 if (step_item
== item
|| original_item
== item
)
495 int CrushWrapper::_remove_item_under(
496 CephContext
*cct
, int item
, int ancestor
, bool unlink_only
)
498 ldout(cct
, 5) << "_remove_item_under " << item
<< " under " << ancestor
499 << (unlink_only
? " unlink_only":"") << dendl
;
505 if (!bucket_exists(ancestor
))
510 crush_bucket
*b
= get_bucket(ancestor
);
511 for (unsigned i
=0; i
<b
->size
; ++i
) {
512 int id
= b
->items
[i
];
514 ldout(cct
, 5) << "_remove_item_under removing item " << item
515 << " from bucket " << b
->id
<< dendl
;
516 bucket_remove_item(b
, item
);
517 for (auto& p
: choose_args
) {
518 // weight down each weight-set to 0 before we remove the item
519 vector
<int> weightv(get_choose_args_positions(p
.second
), 0);
520 _choose_args_adjust_item_weight_in_bucket(
521 cct
, p
.second
, b
->id
, item
, weightv
, nullptr);
523 adjust_item_weight(cct
, b
->id
, b
->weight
);
526 int r
= remove_item_under(cct
, item
, id
, unlink_only
);
534 int CrushWrapper::remove_item_under(
535 CephContext
*cct
, int item
, int ancestor
, bool unlink_only
)
537 ldout(cct
, 5) << "remove_item_under " << item
<< " under " << ancestor
538 << (unlink_only
? " unlink_only":"") << dendl
;
540 if (!unlink_only
&& _bucket_is_in_use(item
)) {
544 int ret
= _remove_item_under(cct
, item
, ancestor
, unlink_only
);
548 if (item
< 0 && !unlink_only
) {
549 crush_bucket
*t
= get_bucket(item
);
551 ldout(cct
, 1) << "remove_item_under bucket " << item
552 << " does not exist" << dendl
;
557 ldout(cct
, 1) << "remove_item_under bucket " << item
<< " has " << t
->size
558 << " items, not empty" << dendl
;
563 if (_maybe_remove_last_instance(cct
, item
, unlink_only
))
569 int CrushWrapper::get_common_ancestor_distance(CephContext
*cct
, int id
,
570 const std::multimap
<string
,string
>& loc
)
572 ldout(cct
, 5) << __func__
<< " " << id
<< " " << loc
<< dendl
;
573 if (!item_exists(id
))
575 map
<string
,string
> id_loc
= get_full_location(id
);
576 ldout(cct
, 20) << " id is at " << id_loc
<< dendl
;
578 for (map
<int,string
>::const_iterator p
= type_map
.begin();
581 map
<string
,string
>::iterator ip
= id_loc
.find(p
->second
);
582 if (ip
== id_loc
.end())
584 for (std::multimap
<string
,string
>::const_iterator q
= loc
.find(p
->second
);
587 if (q
->first
!= p
->second
)
589 if (q
->second
== ip
->second
)
596 int CrushWrapper::parse_loc_map(const std::vector
<string
>& args
,
597 std::map
<string
,string
> *ploc
)
600 for (unsigned i
= 0; i
< args
.size(); ++i
) {
601 const char *s
= args
[i
].c_str();
602 const char *pos
= strchr(s
, '=');
605 string
key(s
, 0, pos
-s
);
608 (*ploc
)[key
] = value
;
615 int CrushWrapper::parse_loc_multimap(const std::vector
<string
>& args
,
616 std::multimap
<string
,string
> *ploc
)
619 for (unsigned i
= 0; i
< args
.size(); ++i
) {
620 const char *s
= args
[i
].c_str();
621 const char *pos
= strchr(s
, '=');
624 string
key(s
, 0, pos
-s
);
627 ploc
->insert(make_pair(key
, value
));
634 bool CrushWrapper::check_item_loc(CephContext
*cct
, int item
, const map
<string
,string
>& loc
,
637 ldout(cct
, 5) << "check_item_loc item " << item
<< " loc " << loc
<< dendl
;
639 for (map
<int,string
>::const_iterator p
= type_map
.begin(); p
!= type_map
.end(); ++p
) {
644 // ignore types that aren't specified in loc
645 map
<string
,string
>::const_iterator q
= loc
.find(p
->second
);
646 if (q
== loc
.end()) {
647 ldout(cct
, 2) << "warning: did not specify location for '" << p
->second
<< "' level (levels are "
648 << type_map
<< ")" << dendl
;
652 if (!name_exists(q
->second
)) {
653 ldout(cct
, 5) << "check_item_loc bucket " << q
->second
<< " dne" << dendl
;
657 int id
= get_item_id(q
->second
);
659 ldout(cct
, 5) << "check_item_loc requested " << q
->second
<< " for type " << p
->second
660 << " is a device, not bucket" << dendl
;
664 assert(bucket_exists(id
));
665 crush_bucket
*b
= get_bucket(id
);
667 // see if item exists in this bucket
668 for (unsigned j
=0; j
<b
->size
; j
++) {
669 if (b
->items
[j
] == item
) {
670 ldout(cct
, 2) << "check_item_loc " << item
<< " exists in bucket " << b
->id
<< dendl
;
672 *weight
= crush_get_bucket_item_weight(b
, j
);
679 ldout(cct
, 1) << "check_item_loc item " << item
<< " loc " << loc
<< dendl
;
683 map
<string
, string
> CrushWrapper::get_full_location(int id
)
685 vector
<pair
<string
, string
> > full_location_ordered
;
686 map
<string
,string
> full_location
;
688 get_full_location_ordered(id
, full_location_ordered
);
690 std::copy(full_location_ordered
.begin(),
691 full_location_ordered
.end(),
692 std::inserter(full_location
, full_location
.begin()));
694 return full_location
;
697 int CrushWrapper::get_full_location_ordered(int id
, vector
<pair
<string
, string
> >& path
)
699 if (!item_exists(id
))
704 pair
<string
, string
> parent_coord
= get_immediate_parent(cur
, &ret
);
707 path
.push_back(parent_coord
);
708 cur
= get_item_id(parent_coord
.second
);
713 string
CrushWrapper::get_full_location_ordered_string(int id
)
715 vector
<pair
<string
, string
> > full_location_ordered
;
716 string full_location
;
717 get_full_location_ordered(id
, full_location_ordered
);
718 reverse(begin(full_location_ordered
), end(full_location_ordered
));
719 for(auto i
= full_location_ordered
.begin(); i
!= full_location_ordered
.end(); i
++) {
720 full_location
= full_location
+ i
->first
+ "=" + i
->second
;
721 if (i
!= full_location_ordered
.end() - 1) {
722 full_location
= full_location
+ ",";
725 return full_location
;
728 map
<int, string
> CrushWrapper::get_parent_hierarchy(int id
)
730 map
<int,string
> parent_hierarchy
;
731 pair
<string
, string
> parent_coord
= get_immediate_parent(id
);
734 // get the integer type for id and create a counter from there
735 int type_counter
= get_bucket_type(id
);
737 // if we get a negative type then we can assume that we have an OSD
738 // change behavior in get_item_type FIXME
739 if (type_counter
< 0)
742 // read the type map and get the name of the type with the largest ID
744 for (map
<int, string
>::iterator it
= type_map
.begin(); it
!= type_map
.end(); ++it
){
745 if ( (*it
).first
> high_type
)
746 high_type
= (*it
).first
;
749 parent_id
= get_item_id(parent_coord
.second
);
751 while (type_counter
< high_type
) {
753 parent_hierarchy
[ type_counter
] = parent_coord
.first
;
755 if (type_counter
< high_type
){
756 // get the coordinate information for the next parent
757 parent_coord
= get_immediate_parent(parent_id
);
758 parent_id
= get_item_id(parent_coord
.second
);
762 return parent_hierarchy
;
765 int CrushWrapper::get_children(int id
, list
<int> *children
)
772 crush_bucket
*b
= get_bucket(id
);
777 for (unsigned n
=0; n
<b
->size
; n
++) {
778 children
->push_back(b
->items
[n
]);
783 int CrushWrapper::_get_leaves(int id
, list
<int> *leaves
)
789 leaves
->push_back(id
);
793 crush_bucket
*b
= get_bucket(id
);
798 for (unsigned n
= 0; n
< b
->size
; n
++) {
799 if (b
->items
[n
] >= 0) {
800 leaves
->push_back(b
->items
[n
]);
802 // is a bucket, do recursive call
803 int r
= _get_leaves(b
->items
[n
], leaves
);
810 return 0; // all is well
813 int CrushWrapper::get_leaves(const string
&name
, set
<int> *leaves
)
818 if (!name_exists(name
)) {
822 int id
= get_item_id(name
);
830 int r
= _get_leaves(id
, &unordered
);
835 for (auto &p
: unordered
) {
842 int CrushWrapper::insert_item(
843 CephContext
*cct
, int item
, float weight
, string name
,
844 const map
<string
,string
>& loc
) // typename -> bucketname
846 ldout(cct
, 5) << "insert_item item " << item
<< " weight " << weight
847 << " name " << name
<< " loc " << loc
<< dendl
;
849 if (!is_valid_crush_name(name
))
852 if (!is_valid_crush_loc(cct
, loc
))
855 int r
= validate_weightf(weight
);
860 if (name_exists(name
)) {
861 if (get_item_id(name
) != item
) {
862 ldout(cct
, 10) << "device name '" << name
<< "' already exists as id "
863 << get_item_id(name
) << dendl
;
867 set_item_name(item
, name
);
872 // create locations if locations don't exist and add child in
873 // location with 0 weight the more detail in the insert_item method
874 // declaration in CrushWrapper.h
875 for (auto p
= type_map
.begin(); p
!= type_map
.end(); ++p
) {
876 // ignore device type
880 // skip types that are unspecified
881 map
<string
,string
>::const_iterator q
= loc
.find(p
->second
);
882 if (q
== loc
.end()) {
883 ldout(cct
, 2) << "warning: did not specify location for '"
884 << p
->second
<< "' level (levels are "
885 << type_map
<< ")" << dendl
;
889 if (!name_exists(q
->second
)) {
890 ldout(cct
, 5) << "insert_item creating bucket " << q
->second
<< dendl
;
891 int empty
= 0, newid
;
892 int r
= add_bucket(0, 0,
893 CRUSH_HASH_DEFAULT
, p
->first
, 1, &cur
, &empty
, &newid
);
895 ldout(cct
, 1) << "add_bucket failure error: " << cpp_strerror(r
)
899 set_item_name(newid
, q
->second
);
905 // add to an existing bucket
906 int id
= get_item_id(q
->second
);
907 if (!bucket_exists(id
)) {
908 ldout(cct
, 1) << "insert_item doesn't have bucket " << id
<< dendl
;
912 // check that we aren't creating a cycle.
913 if (subtree_contains(id
, cur
)) {
914 ldout(cct
, 1) << "insert_item item " << cur
<< " already exists beneath "
919 // we have done sanity check above
920 crush_bucket
*b
= get_bucket(id
);
922 if (p
->first
!= b
->type
) {
923 ldout(cct
, 1) << "insert_item existing bucket has type "
924 << "'" << type_map
[b
->type
] << "' != "
925 << "'" << type_map
[p
->first
] << "'" << dendl
;
929 // are we forming a loop?
930 if (subtree_contains(cur
, b
->id
)) {
931 ldout(cct
, 1) << "insert_item " << cur
<< " already contains " << b
->id
932 << "; cannot form loop" << dendl
;
936 ldout(cct
, 5) << "insert_item adding " << cur
<< " weight " << weight
937 << " to bucket " << id
<< dendl
;
938 int r
= bucket_add_item(b
, cur
, 0);
943 // adjust the item's weight in location
944 if (adjust_item_weightf_in_loc(cct
, item
, weight
, loc
) > 0) {
945 if (item
>= crush
->max_devices
) {
946 crush
->max_devices
= item
+ 1;
947 ldout(cct
, 5) << "insert_item max_devices now " << crush
->max_devices
950 r
= rebuild_roots_with_classes();
952 ldout(cct
, 0) << __func__
<< " unable to rebuild roots with classes: "
953 << cpp_strerror(r
) << dendl
;
959 ldout(cct
, 1) << "error: didn't find anywhere to add item " << item
960 << " in " << loc
<< dendl
;
965 int CrushWrapper::move_bucket(
966 CephContext
*cct
, int id
, const map
<string
,string
>& loc
)
968 // sorry this only works for buckets
972 if (!item_exists(id
))
975 // get the name of the bucket we are trying to move for later
976 string id_name
= get_item_name(id
);
979 int bucket_weight
= detach_bucket(cct
, id
);
981 // insert the bucket back into the hierarchy
982 return insert_item(cct
, id
, bucket_weight
/ (float)0x10000, id_name
, loc
);
985 int CrushWrapper::detach_bucket(CephContext
*cct
, int item
)
993 // check that the bucket that we want to detach exists
994 assert(bucket_exists(item
));
996 // get the bucket's weight
997 crush_bucket
*b
= get_bucket(item
);
998 unsigned bucket_weight
= b
->weight
;
1000 // get where the bucket is located
1001 pair
<string
, string
> bucket_location
= get_immediate_parent(item
);
1003 // get the id of the parent bucket
1004 int parent_id
= get_item_id(bucket_location
.second
);
1006 // get the parent bucket
1007 crush_bucket
*parent_bucket
= get_bucket(parent_id
);
1009 if (!IS_ERR(parent_bucket
)) {
1010 // zero out the bucket weight
1011 bucket_adjust_item_weight(cct
, parent_bucket
, item
, 0);
1012 adjust_item_weight(cct
, parent_bucket
->id
, parent_bucket
->weight
);
1013 for (auto& p
: choose_args
) {
1014 // weight down each weight-set to 0 before we remove the item
1015 vector
<int> weightv(get_choose_args_positions(p
.second
), 0);
1016 choose_args_adjust_item_weight(cct
, p
.second
, item
, weightv
, nullptr);
1019 // remove the bucket from the parent
1020 bucket_remove_item(parent_bucket
, item
);
1021 } else if (PTR_ERR(parent_bucket
) != -ENOENT
) {
1022 return PTR_ERR(parent_bucket
);
1025 // check that we're happy
1026 int test_weight
= 0;
1027 map
<string
,string
> test_location
;
1028 test_location
[ bucket_location
.first
] = (bucket_location
.second
);
1030 bool successful_detach
= !(check_item_loc(cct
, item
, test_location
,
1032 assert(successful_detach
);
1033 assert(test_weight
== 0);
1035 return bucket_weight
;
1038 int CrushWrapper::swap_bucket(CephContext
*cct
, int src
, int dst
)
1040 if (src
>= 0 || dst
>= 0)
1042 if (!item_exists(src
) || !item_exists(dst
))
1044 crush_bucket
*a
= get_bucket(src
);
1045 crush_bucket
*b
= get_bucket(dst
);
1046 unsigned aw
= a
->weight
;
1047 unsigned bw
= b
->weight
;
1050 adjust_item_weight(cct
, a
->id
, bw
);
1051 adjust_item_weight(cct
, b
->id
, aw
);
1054 map
<int,unsigned> tmp
;
1055 unsigned as
= a
->size
;
1056 unsigned bs
= b
->size
;
1057 for (unsigned i
= 0; i
< as
; ++i
) {
1058 int item
= a
->items
[0];
1059 int itemw
= crush_get_bucket_item_weight(a
, 0);
1061 bucket_remove_item(a
, item
);
1063 assert(a
->size
== 0);
1064 assert(b
->size
== bs
);
1065 for (unsigned i
= 0; i
< bs
; ++i
) {
1066 int item
= b
->items
[0];
1067 int itemw
= crush_get_bucket_item_weight(b
, 0);
1068 bucket_remove_item(b
, item
);
1069 bucket_add_item(a
, item
, itemw
);
1071 assert(a
->size
== bs
);
1072 assert(b
->size
== 0);
1073 for (auto t
: tmp
) {
1074 bucket_add_item(b
, t
.first
, t
.second
);
1076 assert(a
->size
== bs
);
1077 assert(b
->size
== as
);
1080 swap_names(src
, dst
);
1084 int CrushWrapper::link_bucket(
1085 CephContext
*cct
, int id
, const map
<string
,string
>& loc
)
1087 // sorry this only works for buckets
1091 if (!item_exists(id
))
1094 // get the name of the bucket we are trying to move for later
1095 string id_name
= get_item_name(id
);
1097 crush_bucket
*b
= get_bucket(id
);
1098 unsigned bucket_weight
= b
->weight
;
1100 return insert_item(cct
, id
, bucket_weight
/ (float)0x10000, id_name
, loc
);
1103 int CrushWrapper::create_or_move_item(
1104 CephContext
*cct
, int item
, float weight
, string name
,
1105 const map
<string
,string
>& loc
) // typename -> bucketname
1110 if (!is_valid_crush_name(name
))
1113 if (check_item_loc(cct
, item
, loc
, &old_iweight
)) {
1114 ldout(cct
, 5) << "create_or_move_item " << item
<< " already at " << loc
1117 if (_search_item_exists(item
)) {
1118 weight
= get_item_weightf(item
);
1119 ldout(cct
, 10) << "create_or_move_item " << item
1120 << " exists with weight " << weight
<< dendl
;
1121 remove_item(cct
, item
, true);
1123 ldout(cct
, 5) << "create_or_move_item adding " << item
1124 << " weight " << weight
1125 << " at " << loc
<< dendl
;
1126 ret
= insert_item(cct
, item
, weight
, name
, loc
);
1133 int CrushWrapper::update_item(
1134 CephContext
*cct
, int item
, float weight
, string name
,
1135 const map
<string
,string
>& loc
) // typename -> bucketname
1137 ldout(cct
, 5) << "update_item item " << item
<< " weight " << weight
1138 << " name " << name
<< " loc " << loc
<< dendl
;
1141 if (!is_valid_crush_name(name
))
1144 if (!is_valid_crush_loc(cct
, loc
))
1147 ret
= validate_weightf(weight
);
1152 // compare quantized (fixed-point integer) weights!
1153 int iweight
= (int)(weight
* (float)0x10000);
1155 if (check_item_loc(cct
, item
, loc
, &old_iweight
)) {
1156 ldout(cct
, 5) << "update_item " << item
<< " already at " << loc
<< dendl
;
1157 if (old_iweight
!= iweight
) {
1158 ldout(cct
, 5) << "update_item " << item
<< " adjusting weight "
1159 << ((float)old_iweight
/(float)0x10000) << " -> " << weight
1161 adjust_item_weight_in_loc(cct
, item
, iweight
, loc
);
1164 if (get_item_name(item
) != name
) {
1165 ldout(cct
, 5) << "update_item setting " << item
<< " name to " << name
1167 set_item_name(item
, name
);
1171 if (item_exists(item
)) {
1172 remove_item(cct
, item
, true);
1174 ldout(cct
, 5) << "update_item adding " << item
<< " weight " << weight
1175 << " at " << loc
<< dendl
;
1176 ret
= insert_item(cct
, item
, weight
, name
, loc
);
1183 int CrushWrapper::get_item_weight(int id
) const
1185 for (int bidx
= 0; bidx
< crush
->max_buckets
; bidx
++) {
1186 crush_bucket
*b
= crush
->buckets
[bidx
];
1191 for (unsigned i
= 0; i
< b
->size
; i
++)
1192 if (b
->items
[i
] == id
)
1193 return crush_get_bucket_item_weight(b
, i
);
1198 int CrushWrapper::get_item_weight_in_loc(int id
, const map
<string
,string
> &loc
)
1200 for (map
<string
,string
>::const_iterator l
= loc
.begin(); l
!= loc
.end(); ++l
) {
1202 int bid
= get_item_id(l
->second
);
1203 if (!bucket_exists(bid
))
1205 crush_bucket
*b
= get_bucket(bid
);
1206 for (unsigned int i
= 0; i
< b
->size
; i
++) {
1207 if (b
->items
[i
] == id
) {
1208 return crush_get_bucket_item_weight(b
, i
);
1215 int CrushWrapper::adjust_item_weight(CephContext
*cct
, int id
, int weight
)
1217 ldout(cct
, 5) << "adjust_item_weight " << id
<< " weight " << weight
<< dendl
;
1219 for (int bidx
= 0; bidx
< crush
->max_buckets
; bidx
++) {
1220 crush_bucket
*b
= crush
->buckets
[bidx
];
1223 for (unsigned i
= 0; i
< b
->size
; i
++) {
1224 if (b
->items
[i
] == id
) {
1225 int diff
= bucket_adjust_item_weight(cct
, b
, id
, weight
);
1226 ldout(cct
, 5) << "adjust_item_weight " << id
<< " diff " << diff
1227 << " in bucket " << bidx
<< dendl
;
1228 adjust_item_weight(cct
, -1 - bidx
, b
->weight
);
1238 int CrushWrapper::adjust_item_weight_in_loc(CephContext
*cct
, int id
, int weight
, const map
<string
,string
>& loc
)
1240 ldout(cct
, 5) << "adjust_item_weight_in_loc " << id
<< " weight " << weight
1241 << " in " << loc
<< dendl
;
1244 for (auto l
= loc
.begin(); l
!= loc
.end(); ++l
) {
1245 int bid
= get_item_id(l
->second
);
1246 if (!bucket_exists(bid
))
1248 crush_bucket
*b
= get_bucket(bid
);
1249 for (unsigned int i
= 0; i
< b
->size
; i
++) {
1250 if (b
->items
[i
] == id
) {
1251 int diff
= bucket_adjust_item_weight(cct
, b
, id
, weight
);
1252 ldout(cct
, 5) << "adjust_item_weight_in_loc " << id
<< " diff " << diff
1253 << " in bucket " << bid
<< dendl
;
1254 adjust_item_weight(cct
, bid
, b
->weight
);
1264 int CrushWrapper::adjust_subtree_weight(CephContext
*cct
, int id
, int weight
)
1266 ldout(cct
, 5) << __func__
<< " " << id
<< " weight " << weight
<< dendl
;
1267 crush_bucket
*b
= get_bucket(id
);
1271 list
<crush_bucket
*> q
;
1273 while (!q
.empty()) {
1276 int local_changed
= 0;
1277 for (unsigned i
=0; i
<b
->size
; ++i
) {
1278 int n
= b
->items
[i
];
1280 bucket_adjust_item_weight(cct
, b
, n
, weight
);
1284 crush_bucket
*sub
= get_bucket(n
);
1290 if (local_changed
) {
1291 adjust_item_weight(cct
, b
->id
, b
->weight
);
1297 bool CrushWrapper::check_item_present(int id
) const
1301 for (int bidx
= 0; bidx
< crush
->max_buckets
; bidx
++) {
1302 crush_bucket
*b
= crush
->buckets
[bidx
];
1305 for (unsigned i
= 0; i
< b
->size
; i
++)
1306 if (b
->items
[i
] == id
)
1313 pair
<string
,string
> CrushWrapper::get_immediate_parent(int id
, int *_ret
)
1316 for (int bidx
= 0; bidx
< crush
->max_buckets
; bidx
++) {
1317 crush_bucket
*b
= crush
->buckets
[bidx
];
1320 if (is_shadow_item(b
->id
))
1322 for (unsigned i
= 0; i
< b
->size
; i
++)
1323 if (b
->items
[i
] == id
) {
1324 string parent_id
= name_map
[b
->id
];
1325 string parent_bucket_type
= type_map
[b
->type
];
1328 return make_pair(parent_bucket_type
, parent_id
);
1335 return pair
<string
, string
>();
1338 int CrushWrapper::get_immediate_parent_id(int id
, int *parent
) const
1340 for (int bidx
= 0; bidx
< crush
->max_buckets
; bidx
++) {
1341 crush_bucket
*b
= crush
->buckets
[bidx
];
1344 if (is_shadow_item(b
->id
))
1346 for (unsigned i
= 0; i
< b
->size
; i
++) {
1347 if (b
->items
[i
] == id
) {
1356 int CrushWrapper::get_parent_of_type(int item
, int type
) const
1359 int r
= get_immediate_parent_id(item
, &item
);
1363 } while (get_bucket_type(item
) != type
);
1367 int CrushWrapper::populate_classes(
1368 const std::map
<int32_t, map
<int32_t, int32_t>>& old_class_bucket
)
1370 // build set of previous used shadow ids
1371 set
<int32_t> used_ids
;
1372 for (auto& p
: old_class_bucket
) {
1373 for (auto& q
: p
.second
) {
1374 used_ids
.insert(q
.second
);
1378 find_nonshadow_roots(roots
);
1379 for (auto &r
: roots
) {
1382 for (auto &c
: class_name
) {
1384 int res
= device_class_clone(r
, c
.first
, old_class_bucket
, used_ids
,
1393 int CrushWrapper::trim_roots_with_class(bool unused
)
1396 find_shadow_roots(roots
);
1397 for (auto &r
: roots
) {
1400 int res
= remove_root(r
, unused
);
1404 // there is no need to reweight because we only remove from the
1409 int32_t CrushWrapper::_alloc_class_id() const {
1410 if (class_name
.empty()) {
1413 int32_t class_id
= class_name
.rbegin()->first
+ 1;
1414 if (class_id
>= 0) {
1417 // wrapped, pick a random start and do exhaustive search
1418 uint32_t upperlimit
= numeric_limits
<int32_t>::max();
1420 class_id
= rand() % upperlimit
;
1421 const auto start
= class_id
;
1423 if (!class_name
.count(class_id
)) {
1431 } while (class_id
!= start
);
1432 assert(0 == "no available class id");
1435 void CrushWrapper::reweight(CephContext
*cct
)
1439 for (set
<int>::iterator p
= roots
.begin(); p
!= roots
.end(); ++p
) {
1442 crush_bucket
*b
= get_bucket(*p
);
1443 ldout(cct
, 5) << "reweight bucket " << *p
<< dendl
;
1444 int r
= crush_reweight_bucket(crush
, b
);
1449 int CrushWrapper::add_simple_rule_at(
1450 string name
, string root_name
,
1451 string failure_domain_name
,
1452 string device_class
,
1453 string mode
, int rule_type
,
1457 if (rule_exists(name
)) {
1459 *err
<< "rule " << name
<< " exists";
1463 if (rule_exists(rno
)) {
1465 *err
<< "rule with ruleno " << rno
<< " exists";
1468 if (ruleset_exists(rno
)) {
1470 *err
<< "ruleset " << rno
<< " exists";
1474 for (rno
= 0; rno
< get_max_rules(); rno
++) {
1475 if (!rule_exists(rno
) && !ruleset_exists(rno
))
1479 if (!name_exists(root_name
)) {
1481 *err
<< "root item " << root_name
<< " does not exist";
1484 int root
= get_item_id(root_name
);
1486 if (failure_domain_name
.length()) {
1487 type
= get_type_id(failure_domain_name
);
1490 *err
<< "unknown type " << failure_domain_name
;
1494 if (device_class
.size()) {
1495 if (!class_exists(device_class
)) {
1497 *err
<< "device class " << device_class
<< " does not exist";
1500 int c
= get_class_id(device_class
);
1501 if (class_bucket
.count(root
) == 0 ||
1502 class_bucket
[root
].count(c
) == 0) {
1504 *err
<< "root " << root_name
<< " has no devices with class "
1508 root
= class_bucket
[root
][c
];
1510 if (mode
!= "firstn" && mode
!= "indep") {
1512 *err
<< "unknown mode " << mode
;
1517 if (mode
== "indep")
1519 int min_rep
= mode
== "firstn" ? 1 : 3;
1520 int max_rep
= mode
== "firstn" ? 10 : 20;
1521 //set the ruleset the same as rule_id(rno)
1522 crush_rule
*rule
= crush_make_rule(steps
, rno
, rule_type
, min_rep
, max_rep
);
1525 if (mode
== "indep") {
1526 crush_rule_set_step(rule
, step
++, CRUSH_RULE_SET_CHOOSELEAF_TRIES
, 5, 0);
1527 crush_rule_set_step(rule
, step
++, CRUSH_RULE_SET_CHOOSE_TRIES
, 100, 0);
1529 crush_rule_set_step(rule
, step
++, CRUSH_RULE_TAKE
, root
, 0);
1531 crush_rule_set_step(rule
, step
++,
1532 mode
== "firstn" ? CRUSH_RULE_CHOOSELEAF_FIRSTN
:
1533 CRUSH_RULE_CHOOSELEAF_INDEP
,
1537 crush_rule_set_step(rule
, step
++,
1538 mode
== "firstn" ? CRUSH_RULE_CHOOSE_FIRSTN
:
1539 CRUSH_RULE_CHOOSE_INDEP
,
1542 crush_rule_set_step(rule
, step
++, CRUSH_RULE_EMIT
, 0, 0);
1544 int ret
= crush_add_rule(crush
, rule
, rno
);
1546 *err
<< "failed to add rule " << rno
<< " because " << cpp_strerror(ret
);
1549 set_rule_name(rno
, name
);
1554 int CrushWrapper::add_simple_rule(
1555 string name
, string root_name
,
1556 string failure_domain_name
,
1557 string device_class
,
1558 string mode
, int rule_type
,
1561 return add_simple_rule_at(name
, root_name
, failure_domain_name
, device_class
,
1563 rule_type
, -1, err
);
1566 int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno
, map
<int,float> *pmap
)
1568 if (ruleno
>= crush
->max_rules
)
1570 if (crush
->rules
[ruleno
] == NULL
)
1572 crush_rule
*rule
= crush
->rules
[ruleno
];
1574 // build a weight map for each TAKE in the rule, and then merge them
1576 // FIXME: if there are multiple takes that place a different number of
1577 // objects we do not take that into account. (Also, note that doing this
1578 // right is also a function of the pool, since the crush rule
1579 // might choose 2 + choose 2 but pool size may only be 3.)
1580 for (unsigned i
=0; i
<rule
->len
; ++i
) {
1583 if (rule
->steps
[i
].op
== CRUSH_RULE_TAKE
) {
1584 int n
= rule
->steps
[i
].arg1
;
1591 //breadth first iterate the OSD tree
1592 while (!q
.empty()) {
1593 int bno
= q
.front();
1595 crush_bucket
*b
= crush
->buckets
[-1-bno
];
1597 for (unsigned j
=0; j
<b
->size
; ++j
) {
1598 int item_id
= b
->items
[j
];
1599 if (item_id
>= 0) { //it's an OSD
1600 float w
= crush_get_bucket_item_weight(b
, j
);
1603 } else { //not an OSD, expand the child later
1604 q
.push_back(item_id
);
1610 for (map
<int,float>::iterator p
= m
.begin(); p
!= m
.end(); ++p
) {
1611 map
<int,float>::iterator q
= pmap
->find(p
->first
);
1612 if (q
== pmap
->end()) {
1613 (*pmap
)[p
->first
] = p
->second
/ sum
;
1615 q
->second
+= p
->second
/ sum
;
1623 int CrushWrapper::remove_rule(int ruleno
)
1625 if (ruleno
>= (int)crush
->max_rules
)
1627 if (crush
->rules
[ruleno
] == NULL
)
1629 crush_destroy_rule(crush
->rules
[ruleno
]);
1630 crush
->rules
[ruleno
] = NULL
;
1631 rule_name_map
.erase(ruleno
);
1636 int CrushWrapper::bucket_adjust_item_weight(CephContext
*cct
, crush_bucket
*bucket
, int item
, int weight
)
1638 if (cct
->_conf
->osd_crush_update_weight_set
) {
1640 for (position
= 0; position
< bucket
->size
; position
++)
1641 if (bucket
->items
[position
] == item
)
1643 assert(position
!= bucket
->size
);
1644 for (auto w
: choose_args
) {
1645 crush_choose_arg_map arg_map
= w
.second
;
1646 crush_choose_arg
*arg
= &arg_map
.args
[-1-bucket
->id
];
1647 for (__u32 j
= 0; j
< arg
->weight_set_size
; j
++) {
1648 crush_weight_set
*weight_set
= &arg
->weight_set
[j
];
1649 weight_set
->weights
[position
] = weight
;
1653 return crush_bucket_adjust_item_weight(crush
, bucket
, item
, weight
);
1656 int CrushWrapper::add_bucket(
1657 int bucketno
, int alg
, int hash
, int type
, int size
,
1658 int *items
, int *weights
, int *idout
)
1661 alg
= get_default_bucket_alg();
1665 crush_bucket
*b
= crush_make_bucket(crush
, alg
, hash
, type
, size
, items
,
1668 int r
= crush_add_bucket(crush
, bucketno
, b
, idout
);
1669 for (auto& p
: choose_args
) {
1670 crush_choose_arg_map
& cmap
= p
.second
;
1672 if ((int)cmap
.size
<= *idout
) {
1673 cmap
.args
= (crush_choose_arg
*)realloc(
1675 sizeof(crush_choose_arg
) * (*idout
+ 1));
1676 memset(&cmap
.args
[cmap
.size
], 0,
1677 sizeof(crush_choose_arg
) * (*idout
+ 1 - cmap
.size
));
1678 cmap
.size
= *idout
+ 1;
1681 cmap
.args
= (crush_choose_arg
*)calloc(sizeof(crush_choose_arg
),
1683 cmap
.size
= *idout
+ 1;
1686 int positions
= get_choose_args_positions(cmap
);
1687 crush_choose_arg
& carg
= cmap
.args
[*idout
];
1688 carg
.weight_set
= (crush_weight_set
*)calloc(sizeof(crush_weight_set
),
1690 carg
.weight_set_size
= positions
;
1691 for (int ppos
= 0; ppos
< positions
; ++ppos
) {
1692 carg
.weight_set
[ppos
].weights
= (__u32
*)calloc(sizeof(__u32
), size
);
1693 carg
.weight_set
[ppos
].size
= size
;
1694 for (int bpos
= 0; bpos
< size
; ++bpos
) {
1695 carg
.weight_set
[ppos
].weights
[bpos
] = weights
[bpos
];
1703 int CrushWrapper::bucket_add_item(crush_bucket
*bucket
, int item
, int weight
)
1705 __u32 new_size
= bucket
->size
+ 1;
1706 for (auto w
: choose_args
) {
1707 crush_choose_arg_map arg_map
= w
.second
;
1708 crush_choose_arg
*arg
= &arg_map
.args
[-1-bucket
->id
];
1709 for (__u32 j
= 0; j
< arg
->weight_set_size
; j
++) {
1710 crush_weight_set
*weight_set
= &arg
->weight_set
[j
];
1711 weight_set
->weights
= (__u32
*)realloc(weight_set
->weights
,
1712 new_size
* sizeof(__u32
));
1713 assert(weight_set
->size
+ 1 == new_size
);
1714 weight_set
->weights
[weight_set
->size
] = weight
;
1715 weight_set
->size
= new_size
;
1717 if (arg
->ids_size
) {
1718 arg
->ids
= (__s32
*)realloc(arg
->ids
, new_size
* sizeof(__s32
));
1719 assert(arg
->ids_size
+ 1 == new_size
);
1720 arg
->ids
[arg
->ids_size
] = item
;
1721 arg
->ids_size
= new_size
;
1724 return crush_bucket_add_item(crush
, bucket
, item
, weight
);
1727 int CrushWrapper::bucket_remove_item(crush_bucket
*bucket
, int item
)
1729 __u32 new_size
= bucket
->size
- 1;
1731 for (position
= 0; position
< bucket
->size
; position
++)
1732 if (bucket
->items
[position
] == item
)
1734 assert(position
!= bucket
->size
);
1735 for (auto w
: choose_args
) {
1736 crush_choose_arg_map arg_map
= w
.second
;
1737 crush_choose_arg
*arg
= &arg_map
.args
[-1-bucket
->id
];
1738 for (__u32 j
= 0; j
< arg
->weight_set_size
; j
++) {
1739 crush_weight_set
*weight_set
= &arg
->weight_set
[j
];
1740 assert(weight_set
->size
- 1 == new_size
);
1741 for (__u32 k
= position
; k
< new_size
; k
++)
1742 weight_set
->weights
[k
] = weight_set
->weights
[k
+1];
1743 weight_set
->weights
= (__u32
*)realloc(weight_set
->weights
,
1744 new_size
* sizeof(__u32
));
1745 weight_set
->size
= new_size
;
1747 if (arg
->ids_size
) {
1748 assert(arg
->ids_size
- 1 == new_size
);
1749 for (__u32 k
= position
; k
< new_size
; k
++)
1750 arg
->ids
[k
] = arg
->ids
[k
+1];
1751 arg
->ids
= (__s32
*)realloc(arg
->ids
, new_size
* sizeof(__s32
));
1752 arg
->ids_size
= new_size
;
1755 return crush_bucket_remove_item(crush
, bucket
, item
);
1758 int CrushWrapper::update_device_class(int id
,
1759 const string
& class_name
,
1763 assert(item_exists(id
));
1764 auto old_class_name
= get_item_class(id
);
1765 if (old_class_name
&& old_class_name
!= class_name
) {
1766 *ss
<< "osd." << id
<< " has already bound to class '" << old_class_name
1767 << "', can not reset class to '" << class_name
<< "'; "
1768 << "use 'ceph osd crush rm-device-class <osd>' to "
1769 << "remove old class first";
1773 int class_id
= get_or_create_class_id(class_name
);
1775 *ss
<< name
<< " id " << id
<< " is negative";
1779 if (class_map
.count(id
) != 0 && class_map
[id
] == class_id
) {
1780 *ss
<< name
<< " already set to class " << class_name
;
1784 set_item_class(id
, class_id
);
1786 int r
= rebuild_roots_with_classes();
1792 int CrushWrapper::remove_device_class(CephContext
*cct
, int id
, ostream
*ss
)
1795 const char *name
= get_item_name(id
);
1797 *ss
<< "osd." << id
<< " does not have a name";
1801 const char *class_name
= get_item_class(id
);
1803 *ss
<< "osd." << id
<< " has not been bound to a specific class yet";
1806 class_remove_item(id
);
1808 int r
= rebuild_roots_with_classes();
1810 *ss
<< "unable to rebuild roots with class '" << class_name
<< "' "
1811 << "of osd." << id
<< ": " << cpp_strerror(r
);
1817 int CrushWrapper::device_class_clone(
1818 int original_id
, int device_class
,
1819 const std::map
<int32_t, map
<int32_t, int32_t>>& old_class_bucket
,
1820 const std::set
<int32_t>& used_ids
,
1823 const char *item_name
= get_item_name(original_id
);
1824 if (item_name
== NULL
)
1826 const char *class_name
= get_class_name(device_class
);
1827 if (class_name
== NULL
)
1829 string copy_name
= item_name
+ string("~") + class_name
;
1830 if (name_exists(copy_name
)) {
1831 *clone
= get_item_id(copy_name
);
1834 crush_bucket
*original
= get_bucket(original_id
);
1835 assert(!IS_ERR(original
));
1836 crush_bucket
*copy
= crush_make_bucket(crush
,
1842 for (unsigned i
= 0; i
< original
->size
; i
++) {
1843 int item
= original
->items
[i
];
1844 int weight
= crush_get_bucket_item_weight(original
, i
);
1846 if (class_map
.count(item
) != 0 && class_map
[item
] == device_class
) {
1847 int res
= bucket_add_item(copy
, item
, weight
);
1853 int res
= device_class_clone(item
, device_class
, old_class_bucket
,
1854 used_ids
, &child_copy_id
);
1857 crush_bucket
*child_copy
= get_bucket(child_copy_id
);
1858 assert(!IS_ERR(child_copy
));
1859 res
= bucket_add_item(copy
, child_copy_id
, child_copy
->weight
);
1865 if (old_class_bucket
.count(original_id
) &&
1866 old_class_bucket
.at(original_id
).count(device_class
)) {
1867 bno
= old_class_bucket
.at(original_id
).at(device_class
);
1869 // pick a new shadow bucket id that is not used by the current map
1870 // *or* any previous shadow buckets.
1872 while (((-1-bno
) < crush
->max_buckets
&& crush
->buckets
[-1-bno
]) ||
1873 used_ids
.count(bno
)) {
1877 int res
= crush_add_bucket(crush
, bno
, copy
, clone
);
1880 assert(!bno
|| bno
== *clone
);
1881 res
= set_item_class(*clone
, device_class
);
1884 // we do not use set_item_name because the name is intentionally invalid
1885 name_map
[*clone
] = copy_name
;
1887 name_rmap
[copy_name
] = *clone
;
1888 class_bucket
[original_id
][device_class
] = *clone
;
1892 bool CrushWrapper::_class_is_dead(int class_id
)
1894 for (auto &p
: class_map
) {
1895 if (p
.first
>= 0 && p
.second
== class_id
) {
1899 for (unsigned i
= 0; i
< crush
->max_rules
; ++i
) {
1900 crush_rule
*r
= crush
->rules
[i
];
1903 for (unsigned j
= 0; j
< r
->len
; ++j
) {
1904 if (r
->steps
[j
].op
== CRUSH_RULE_TAKE
) {
1905 int root
= r
->steps
[j
].arg1
;
1906 for (auto &p
: class_bucket
) {
1908 if (q
.count(class_id
) && q
[class_id
] == root
) {
1915 // no more referenced by any devices or crush rules
1919 void CrushWrapper::cleanup_dead_classes()
1921 for (auto &c
: class_name
) {
1922 if (_class_is_dead(c
.first
))
1923 remove_class_name(c
.second
);
1927 int CrushWrapper::rebuild_roots_with_classes()
1929 std::map
<int32_t, map
<int32_t, int32_t> > old_class_bucket
= class_bucket
;
1930 cleanup_dead_classes();
1931 int r
= trim_roots_with_class(false);
1934 class_bucket
.clear();
1935 return populate_classes(old_class_bucket
);
1938 void CrushWrapper::encode(bufferlist
& bl
, uint64_t features
) const
1942 __u32 magic
= CRUSH_MAGIC
;
1943 ::encode(magic
, bl
);
1945 ::encode(crush
->max_buckets
, bl
);
1946 ::encode(crush
->max_rules
, bl
);
1947 ::encode(crush
->max_devices
, bl
);
1949 bool encode_compat_choose_args
= false;
1950 crush_choose_arg_map arg_map
;
1951 memset(&arg_map
, '\0', sizeof(arg_map
));
1952 if (has_choose_args() &&
1953 !HAVE_FEATURE(features
, CRUSH_CHOOSE_ARGS
)) {
1954 assert(!has_incompat_choose_args());
1955 encode_compat_choose_args
= true;
1956 arg_map
= choose_args
.begin()->second
;
1960 for (int i
=0; i
<crush
->max_buckets
; i
++) {
1962 if (crush
->buckets
[i
]) alg
= crush
->buckets
[i
]->alg
;
1967 ::encode(crush
->buckets
[i
]->id
, bl
);
1968 ::encode(crush
->buckets
[i
]->type
, bl
);
1969 ::encode(crush
->buckets
[i
]->alg
, bl
);
1970 ::encode(crush
->buckets
[i
]->hash
, bl
);
1971 ::encode(crush
->buckets
[i
]->weight
, bl
);
1972 ::encode(crush
->buckets
[i
]->size
, bl
);
1973 for (unsigned j
=0; j
<crush
->buckets
[i
]->size
; j
++)
1974 ::encode(crush
->buckets
[i
]->items
[j
], bl
);
1976 switch (crush
->buckets
[i
]->alg
) {
1977 case CRUSH_BUCKET_UNIFORM
:
1978 ::encode((reinterpret_cast<crush_bucket_uniform
*>(crush
->buckets
[i
]))->item_weight
, bl
);
1981 case CRUSH_BUCKET_LIST
:
1982 for (unsigned j
=0; j
<crush
->buckets
[i
]->size
; j
++) {
1983 ::encode((reinterpret_cast<crush_bucket_list
*>(crush
->buckets
[i
]))->item_weights
[j
], bl
);
1984 ::encode((reinterpret_cast<crush_bucket_list
*>(crush
->buckets
[i
]))->sum_weights
[j
], bl
);
1988 case CRUSH_BUCKET_TREE
:
1989 ::encode((reinterpret_cast<crush_bucket_tree
*>(crush
->buckets
[i
]))->num_nodes
, bl
);
1990 for (unsigned j
=0; j
<(reinterpret_cast<crush_bucket_tree
*>(crush
->buckets
[i
]))->num_nodes
; j
++)
1991 ::encode((reinterpret_cast<crush_bucket_tree
*>(crush
->buckets
[i
]))->node_weights
[j
], bl
);
1994 case CRUSH_BUCKET_STRAW
:
1995 for (unsigned j
=0; j
<crush
->buckets
[i
]->size
; j
++) {
1996 ::encode((reinterpret_cast<crush_bucket_straw
*>(crush
->buckets
[i
]))->item_weights
[j
], bl
);
1997 ::encode((reinterpret_cast<crush_bucket_straw
*>(crush
->buckets
[i
]))->straws
[j
], bl
);
2001 case CRUSH_BUCKET_STRAW2
:
2004 if (encode_compat_choose_args
&&
2005 arg_map
.args
[i
].weight_set_size
> 0) {
2006 weights
= arg_map
.args
[i
].weight_set
[0].weights
;
2008 weights
= (reinterpret_cast<crush_bucket_straw2
*>(crush
->buckets
[i
]))->item_weights
;
2010 for (unsigned j
=0; j
<crush
->buckets
[i
]->size
; j
++) {
2011 ::encode(weights
[j
], bl
);
2023 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
2024 __u32 yes
= crush
->rules
[i
] ? 1:0;
2029 ::encode(crush
->rules
[i
]->len
, bl
);
2030 ::encode(crush
->rules
[i
]->mask
, bl
);
2031 for (unsigned j
=0; j
<crush
->rules
[i
]->len
; j
++)
2032 ::encode(crush
->rules
[i
]->steps
[j
], bl
);
2036 ::encode(type_map
, bl
);
2037 ::encode(name_map
, bl
);
2038 ::encode(rule_name_map
, bl
);
2041 ::encode(crush
->choose_local_tries
, bl
);
2042 ::encode(crush
->choose_local_fallback_tries
, bl
);
2043 ::encode(crush
->choose_total_tries
, bl
);
2044 ::encode(crush
->chooseleaf_descend_once
, bl
);
2045 ::encode(crush
->chooseleaf_vary_r
, bl
);
2046 ::encode(crush
->straw_calc_version
, bl
);
2047 ::encode(crush
->allowed_bucket_algs
, bl
);
2048 if (features
& CEPH_FEATURE_CRUSH_TUNABLES5
) {
2049 ::encode(crush
->chooseleaf_stable
, bl
);
2052 if (HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
2054 ::encode(class_map
, bl
);
2055 ::encode(class_name
, bl
);
2056 ::encode(class_bucket
, bl
);
2059 __u32 size
= (__u32
)choose_args
.size();
2061 for (auto c
: choose_args
) {
2062 ::encode(c
.first
, bl
);
2063 crush_choose_arg_map arg_map
= c
.second
;
2065 for (__u32 i
= 0; i
< arg_map
.size
; i
++) {
2066 crush_choose_arg
*arg
= &arg_map
.args
[i
];
2067 if (arg
->weight_set_size
== 0 &&
2073 for (__u32 i
= 0; i
< arg_map
.size
; i
++) {
2074 crush_choose_arg
*arg
= &arg_map
.args
[i
];
2075 if (arg
->weight_set_size
== 0 &&
2079 ::encode(arg
->weight_set_size
, bl
);
2080 for (__u32 j
= 0; j
< arg
->weight_set_size
; j
++) {
2081 crush_weight_set
*weight_set
= &arg
->weight_set
[j
];
2082 ::encode(weight_set
->size
, bl
);
2083 for (__u32 k
= 0; k
< weight_set
->size
; k
++)
2084 ::encode(weight_set
->weights
[k
], bl
);
2086 ::encode(arg
->ids_size
, bl
);
2087 for (__u32 j
= 0; j
< arg
->ids_size
; j
++)
2088 ::encode(arg
->ids
[j
], bl
);
2094 static void decode_32_or_64_string_map(map
<int32_t,string
>& m
, bufferlist::iterator
& blp
)
2104 ::decode(strlen
, blp
);
2106 // der, key was actually 64-bits!
2107 ::decode(strlen
, blp
);
2109 ::decode_nohead(strlen
, m
[key
], blp
);
2113 void CrushWrapper::decode(bufferlist::iterator
& blp
)
2118 ::decode(magic
, blp
);
2119 if (magic
!= CRUSH_MAGIC
)
2120 throw buffer::malformed_input("bad magic number");
2122 ::decode(crush
->max_buckets
, blp
);
2123 ::decode(crush
->max_rules
, blp
);
2124 ::decode(crush
->max_devices
, blp
);
2126 // legacy tunables, unless we decode something newer
2127 set_tunables_legacy();
2131 crush
->buckets
= (crush_bucket
**)calloc(1, crush
->max_buckets
* sizeof(crush_bucket
*));
2132 for (int i
=0; i
<crush
->max_buckets
; i
++) {
2133 decode_crush_bucket(&crush
->buckets
[i
], blp
);
2137 crush
->rules
= (crush_rule
**)calloc(1, crush
->max_rules
* sizeof(crush_rule
*));
2138 for (unsigned i
= 0; i
< crush
->max_rules
; ++i
) {
2142 crush
->rules
[i
] = NULL
;
2148 crush
->rules
[i
] = reinterpret_cast<crush_rule
*>(calloc(1, crush_rule_size(len
)));
2149 crush
->rules
[i
]->len
= len
;
2150 ::decode(crush
->rules
[i
]->mask
, blp
);
2151 for (unsigned j
=0; j
<crush
->rules
[i
]->len
; j
++)
2152 ::decode(crush
->rules
[i
]->steps
[j
], blp
);
2156 // NOTE: we had a bug where we were incoding int instead of int32, which means the
2157 // 'key' field for these maps may be either 32 or 64 bits, depending. tolerate
2158 // both by assuming the string is always non-empty.
2159 decode_32_or_64_string_map(type_map
, blp
);
2160 decode_32_or_64_string_map(name_map
, blp
);
2161 decode_32_or_64_string_map(rule_name_map
, blp
);
2165 ::decode(crush
->choose_local_tries
, blp
);
2166 ::decode(crush
->choose_local_fallback_tries
, blp
);
2167 ::decode(crush
->choose_total_tries
, blp
);
2170 ::decode(crush
->chooseleaf_descend_once
, blp
);
2173 ::decode(crush
->chooseleaf_vary_r
, blp
);
2176 ::decode(crush
->straw_calc_version
, blp
);
2179 ::decode(crush
->allowed_bucket_algs
, blp
);
2182 ::decode(crush
->chooseleaf_stable
, blp
);
2185 ::decode(class_map
, blp
);
2186 ::decode(class_name
, blp
);
2187 for (auto &c
: class_name
)
2188 class_rname
[c
.second
] = c
.first
;
2189 ::decode(class_bucket
, blp
);
2192 __u32 choose_args_size
;
2193 ::decode(choose_args_size
, blp
);
2194 for (__u32 i
= 0; i
< choose_args_size
; i
++) {
2195 uint64_t choose_args_index
;
2196 ::decode(choose_args_index
, blp
);
2197 crush_choose_arg_map arg_map
;
2198 arg_map
.size
= crush
->max_buckets
;
2199 arg_map
.args
= (crush_choose_arg
*)calloc(
2200 arg_map
.size
, sizeof(crush_choose_arg
));
2202 ::decode(size
, blp
);
2203 for (__u32 j
= 0; j
< size
; j
++) {
2205 ::decode(bucket_index
, blp
);
2206 assert(bucket_index
< arg_map
.size
);
2207 crush_choose_arg
*arg
= &arg_map
.args
[bucket_index
];
2208 ::decode(arg
->weight_set_size
, blp
);
2209 if (arg
->weight_set_size
) {
2210 arg
->weight_set
= (crush_weight_set
*)calloc(
2211 arg
->weight_set_size
, sizeof(crush_weight_set
));
2212 for (__u32 k
= 0; k
< arg
->weight_set_size
; k
++) {
2213 crush_weight_set
*weight_set
= &arg
->weight_set
[k
];
2214 ::decode(weight_set
->size
, blp
);
2215 weight_set
->weights
= (__u32
*)calloc(
2216 weight_set
->size
, sizeof(__u32
));
2217 for (__u32 l
= 0; l
< weight_set
->size
; l
++)
2218 ::decode(weight_set
->weights
[l
], blp
);
2221 ::decode(arg
->ids_size
, blp
);
2222 if (arg
->ids_size
) {
2223 assert(arg
->ids_size
== crush
->buckets
[bucket_index
]->size
);
2224 arg
->ids
= (__s32
*)calloc(arg
->ids_size
, sizeof(__s32
));
2225 for (__u32 k
= 0; k
< arg
->ids_size
; k
++)
2226 ::decode(arg
->ids
[k
], blp
);
2229 choose_args
[choose_args_index
] = arg_map
;
2235 crush_destroy(crush
);
2240 void CrushWrapper::decode_crush_bucket(crush_bucket
** bptr
, bufferlist::iterator
&blp
)
2251 case CRUSH_BUCKET_UNIFORM
:
2252 size
= sizeof(crush_bucket_uniform
);
2254 case CRUSH_BUCKET_LIST
:
2255 size
= sizeof(crush_bucket_list
);
2257 case CRUSH_BUCKET_TREE
:
2258 size
= sizeof(crush_bucket_tree
);
2260 case CRUSH_BUCKET_STRAW
:
2261 size
= sizeof(crush_bucket_straw
);
2263 case CRUSH_BUCKET_STRAW2
:
2264 size
= sizeof(crush_bucket_straw2
);
2269 snprintf(str
, sizeof(str
), "unsupported bucket algorithm: %d", alg
);
2270 throw buffer::malformed_input(str
);
2273 crush_bucket
*bucket
= reinterpret_cast<crush_bucket
*>(calloc(1, size
));
2276 ::decode(bucket
->id
, blp
);
2277 ::decode(bucket
->type
, blp
);
2278 ::decode(bucket
->alg
, blp
);
2279 ::decode(bucket
->hash
, blp
);
2280 ::decode(bucket
->weight
, blp
);
2281 ::decode(bucket
->size
, blp
);
2283 bucket
->items
= (__s32
*)calloc(1, bucket
->size
* sizeof(__s32
));
2284 for (unsigned j
= 0; j
< bucket
->size
; ++j
) {
2285 ::decode(bucket
->items
[j
], blp
);
2288 switch (bucket
->alg
) {
2289 case CRUSH_BUCKET_UNIFORM
:
2290 ::decode((reinterpret_cast<crush_bucket_uniform
*>(bucket
))->item_weight
, blp
);
2293 case CRUSH_BUCKET_LIST
: {
2294 crush_bucket_list
* cbl
= reinterpret_cast<crush_bucket_list
*>(bucket
);
2295 cbl
->item_weights
= (__u32
*)calloc(1, bucket
->size
* sizeof(__u32
));
2296 cbl
->sum_weights
= (__u32
*)calloc(1, bucket
->size
* sizeof(__u32
));
2298 for (unsigned j
= 0; j
< bucket
->size
; ++j
) {
2299 ::decode(cbl
->item_weights
[j
], blp
);
2300 ::decode(cbl
->sum_weights
[j
], blp
);
2305 case CRUSH_BUCKET_TREE
: {
2306 crush_bucket_tree
* cbt
= reinterpret_cast<crush_bucket_tree
*>(bucket
);
2307 ::decode(cbt
->num_nodes
, blp
);
2308 cbt
->node_weights
= (__u32
*)calloc(1, cbt
->num_nodes
* sizeof(__u32
));
2309 for (unsigned j
=0; j
<cbt
->num_nodes
; j
++) {
2310 ::decode(cbt
->node_weights
[j
], blp
);
2315 case CRUSH_BUCKET_STRAW
: {
2316 crush_bucket_straw
* cbs
= reinterpret_cast<crush_bucket_straw
*>(bucket
);
2317 cbs
->straws
= (__u32
*)calloc(1, bucket
->size
* sizeof(__u32
));
2318 cbs
->item_weights
= (__u32
*)calloc(1, bucket
->size
* sizeof(__u32
));
2319 for (unsigned j
= 0; j
< bucket
->size
; ++j
) {
2320 ::decode(cbs
->item_weights
[j
], blp
);
2321 ::decode(cbs
->straws
[j
], blp
);
2326 case CRUSH_BUCKET_STRAW2
: {
2327 crush_bucket_straw2
* cbs
= reinterpret_cast<crush_bucket_straw2
*>(bucket
);
2328 cbs
->item_weights
= (__u32
*)calloc(1, bucket
->size
* sizeof(__u32
));
2329 for (unsigned j
= 0; j
< bucket
->size
; ++j
) {
2330 ::decode(cbs
->item_weights
[j
], blp
);
2336 // We should have handled this case in the first switch statement
2343 void CrushWrapper::dump(Formatter
*f
) const
2345 f
->open_array_section("devices");
2346 for (int i
=0; i
<get_max_devices(); i
++) {
2347 f
->open_object_section("device");
2348 f
->dump_int("id", i
);
2349 const char *n
= get_item_name(i
);
2351 f
->dump_string("name", n
);
2354 sprintf(name
, "device%d", i
);
2355 f
->dump_string("name", name
);
2357 const char *device_class
= get_item_class(i
);
2358 if (device_class
!= NULL
)
2359 f
->dump_string("class", device_class
);
2364 f
->open_array_section("types");
2365 int n
= get_num_type_names();
2366 for (int i
=0; n
; i
++) {
2367 const char *name
= get_type_name(i
);
2370 f
->open_object_section("type");
2371 f
->dump_int("type_id", 0);
2372 f
->dump_string("name", "device");
2378 f
->open_object_section("type");
2379 f
->dump_int("type_id", i
);
2380 f
->dump_string("name", name
);
2385 f
->open_array_section("buckets");
2386 for (int bucket
= -1; bucket
> -1-get_max_buckets(); --bucket
) {
2387 if (!bucket_exists(bucket
))
2389 f
->open_object_section("bucket");
2390 f
->dump_int("id", bucket
);
2391 if (get_item_name(bucket
))
2392 f
->dump_string("name", get_item_name(bucket
));
2393 f
->dump_int("type_id", get_bucket_type(bucket
));
2394 if (get_type_name(get_bucket_type(bucket
)))
2395 f
->dump_string("type_name", get_type_name(get_bucket_type(bucket
)));
2396 f
->dump_int("weight", get_bucket_weight(bucket
));
2397 f
->dump_string("alg", crush_bucket_alg_name(get_bucket_alg(bucket
)));
2398 f
->dump_string("hash", crush_hash_name(get_bucket_hash(bucket
)));
2399 f
->open_array_section("items");
2400 for (int j
=0; j
<get_bucket_size(bucket
); j
++) {
2401 f
->open_object_section("item");
2402 f
->dump_int("id", get_bucket_item(bucket
, j
));
2403 f
->dump_int("weight", get_bucket_item_weight(bucket
, j
));
2404 f
->dump_int("pos", j
);
2412 f
->open_array_section("rules");
2416 f
->open_object_section("tunables");
2420 dump_choose_args(f
);
2424 // depth first walker
2426 typedef CrushTreeDumper::Item Item
;
2427 const CrushWrapper
*crush
;
2428 const CrushTreeDumper::name_map_t
& weight_set_names
;
2430 explicit TreeDumper(const CrushWrapper
*crush
,
2431 const CrushTreeDumper::name_map_t
& wsnames
)
2432 : crush(crush
), weight_set_names(wsnames
) {}
2434 void dump(Formatter
*f
) {
2436 crush
->find_roots(roots
);
2437 for (set
<int>::iterator root
= roots
.begin(); root
!= roots
.end(); ++root
) {
2438 dump_item(Item(*root
, 0, 0, crush
->get_bucket_weightf(*root
)), f
);
2443 void dump_item(const Item
& qi
, Formatter
* f
) {
2444 if (qi
.is_bucket()) {
2445 f
->open_object_section("bucket");
2446 CrushTreeDumper::dump_item_fields(crush
, weight_set_names
, qi
, f
);
2447 dump_bucket_children(qi
, f
);
2450 f
->open_object_section("device");
2451 CrushTreeDumper::dump_item_fields(crush
, weight_set_names
, qi
, f
);
2456 void dump_bucket_children(const Item
& parent
, Formatter
* f
) {
2457 f
->open_array_section("items");
2458 const int max_pos
= crush
->get_bucket_size(parent
.id
);
2459 for (int pos
= 0; pos
< max_pos
; pos
++) {
2460 int id
= crush
->get_bucket_item(parent
.id
, pos
);
2461 float weight
= crush
->get_bucket_item_weightf(parent
.id
, pos
);
2462 dump_item(Item(id
, parent
.id
, parent
.depth
+ 1, weight
), f
);
2469 void CrushWrapper::dump_tree(
2471 const CrushTreeDumper::name_map_t
& weight_set_names
) const
2474 TreeDumper(this, weight_set_names
).dump(f
);
2477 void CrushWrapper::dump_tunables(Formatter
*f
) const
2479 f
->dump_int("choose_local_tries", get_choose_local_tries());
2480 f
->dump_int("choose_local_fallback_tries", get_choose_local_fallback_tries());
2481 f
->dump_int("choose_total_tries", get_choose_total_tries());
2482 f
->dump_int("chooseleaf_descend_once", get_chooseleaf_descend_once());
2483 f
->dump_int("chooseleaf_vary_r", get_chooseleaf_vary_r());
2484 f
->dump_int("chooseleaf_stable", get_chooseleaf_stable());
2485 f
->dump_int("straw_calc_version", get_straw_calc_version());
2486 f
->dump_int("allowed_bucket_algs", get_allowed_bucket_algs());
2488 // be helpful about it
2489 if (has_jewel_tunables())
2490 f
->dump_string("profile", "jewel");
2491 else if (has_hammer_tunables())
2492 f
->dump_string("profile", "hammer");
2493 else if (has_firefly_tunables())
2494 f
->dump_string("profile", "firefly");
2495 else if (has_bobtail_tunables())
2496 f
->dump_string("profile", "bobtail");
2497 else if (has_argonaut_tunables())
2498 f
->dump_string("profile", "argonaut");
2500 f
->dump_string("profile", "unknown");
2501 f
->dump_int("optimal_tunables", (int)has_optimal_tunables());
2502 f
->dump_int("legacy_tunables", (int)has_legacy_tunables());
2504 // be helpful about minimum version required
2505 f
->dump_string("minimum_required_version", get_min_required_version());
2507 f
->dump_int("require_feature_tunables", (int)has_nondefault_tunables());
2508 f
->dump_int("require_feature_tunables2", (int)has_nondefault_tunables2());
2509 f
->dump_int("has_v2_rules", (int)has_v2_rules());
2510 f
->dump_int("require_feature_tunables3", (int)has_nondefault_tunables3());
2511 f
->dump_int("has_v3_rules", (int)has_v3_rules());
2512 f
->dump_int("has_v4_buckets", (int)has_v4_buckets());
2513 f
->dump_int("require_feature_tunables5", (int)has_nondefault_tunables5());
2514 f
->dump_int("has_v5_rules", (int)has_v5_rules());
2517 void CrushWrapper::dump_choose_args(Formatter
*f
) const
2519 f
->open_object_section("choose_args");
2520 for (auto c
: choose_args
) {
2521 crush_choose_arg_map arg_map
= c
.second
;
2522 f
->open_array_section(stringify(c
.first
).c_str());
2523 for (__u32 i
= 0; i
< arg_map
.size
; i
++) {
2524 crush_choose_arg
*arg
= &arg_map
.args
[i
];
2525 if (arg
->weight_set_size
== 0 &&
2528 f
->open_object_section("choose_args");
2529 int bucket_index
= i
;
2530 f
->dump_int("bucket_id", -1-bucket_index
);
2531 if (arg
->weight_set_size
> 0) {
2532 f
->open_array_section("weight_set");
2533 for (__u32 j
= 0; j
< arg
->weight_set_size
; j
++) {
2534 f
->open_array_section("weights");
2535 __u32
*weights
= arg
->weight_set
[j
].weights
;
2536 __u32 size
= arg
->weight_set
[j
].size
;
2537 for (__u32 k
= 0; k
< size
; k
++) {
2538 f
->dump_float("weight", (float)weights
[k
]/(float)0x10000);
2544 if (arg
->ids_size
> 0) {
2545 f
->open_array_section("ids");
2546 for (__u32 j
= 0; j
< arg
->ids_size
; j
++)
2547 f
->dump_int("id", arg
->ids
[j
]);
2557 void CrushWrapper::dump_rules(Formatter
*f
) const
2559 for (int i
=0; i
<get_max_rules(); i
++) {
2560 if (!rule_exists(i
))
2566 void CrushWrapper::dump_rule(int ruleset
, Formatter
*f
) const
2568 f
->open_object_section("rule");
2569 f
->dump_int("rule_id", ruleset
);
2570 if (get_rule_name(ruleset
))
2571 f
->dump_string("rule_name", get_rule_name(ruleset
));
2572 f
->dump_int("ruleset", get_rule_mask_ruleset(ruleset
));
2573 f
->dump_int("type", get_rule_mask_type(ruleset
));
2574 f
->dump_int("min_size", get_rule_mask_min_size(ruleset
));
2575 f
->dump_int("max_size", get_rule_mask_max_size(ruleset
));
2576 f
->open_array_section("steps");
2577 for (int j
=0; j
<get_rule_len(ruleset
); j
++) {
2578 f
->open_object_section("step");
2579 switch (get_rule_op(ruleset
, j
)) {
2580 case CRUSH_RULE_NOOP
:
2581 f
->dump_string("op", "noop");
2583 case CRUSH_RULE_TAKE
:
2584 f
->dump_string("op", "take");
2586 int item
= get_rule_arg1(ruleset
, j
);
2587 f
->dump_int("item", item
);
2589 const char *name
= get_item_name(item
);
2590 f
->dump_string("item_name", name
? name
: "");
2593 case CRUSH_RULE_EMIT
:
2594 f
->dump_string("op", "emit");
2596 case CRUSH_RULE_CHOOSE_FIRSTN
:
2597 f
->dump_string("op", "choose_firstn");
2598 f
->dump_int("num", get_rule_arg1(ruleset
, j
));
2599 f
->dump_string("type", get_type_name(get_rule_arg2(ruleset
, j
)));
2601 case CRUSH_RULE_CHOOSE_INDEP
:
2602 f
->dump_string("op", "choose_indep");
2603 f
->dump_int("num", get_rule_arg1(ruleset
, j
));
2604 f
->dump_string("type", get_type_name(get_rule_arg2(ruleset
, j
)));
2606 case CRUSH_RULE_CHOOSELEAF_FIRSTN
:
2607 f
->dump_string("op", "chooseleaf_firstn");
2608 f
->dump_int("num", get_rule_arg1(ruleset
, j
));
2609 f
->dump_string("type", get_type_name(get_rule_arg2(ruleset
, j
)));
2611 case CRUSH_RULE_CHOOSELEAF_INDEP
:
2612 f
->dump_string("op", "chooseleaf_indep");
2613 f
->dump_int("num", get_rule_arg1(ruleset
, j
));
2614 f
->dump_string("type", get_type_name(get_rule_arg2(ruleset
, j
)));
2616 case CRUSH_RULE_SET_CHOOSE_TRIES
:
2617 f
->dump_string("op", "set_choose_tries");
2618 f
->dump_int("num", get_rule_arg1(ruleset
, j
));
2620 case CRUSH_RULE_SET_CHOOSELEAF_TRIES
:
2621 f
->dump_string("op", "set_chooseleaf_tries");
2622 f
->dump_int("num", get_rule_arg1(ruleset
, j
));
2625 f
->dump_int("opcode", get_rule_op(ruleset
, j
));
2626 f
->dump_int("arg1", get_rule_arg1(ruleset
, j
));
2627 f
->dump_int("arg2", get_rule_arg2(ruleset
, j
));
2635 void CrushWrapper::list_rules(Formatter
*f
) const
2637 for (int rule
= 0; rule
< get_max_rules(); rule
++) {
2638 if (!rule_exists(rule
))
2640 f
->dump_string("name", get_rule_name(rule
));
2644 void CrushWrapper::list_rules(ostream
*ss
) const
2646 for (int rule
= 0; rule
< get_max_rules(); rule
++) {
2647 if (!rule_exists(rule
))
2649 *ss
<< get_rule_name(rule
) << "\n";
2653 class CrushTreePlainDumper
: public CrushTreeDumper::Dumper
<TextTable
> {
2655 typedef CrushTreeDumper::Dumper
<TextTable
> Parent
;
2657 explicit CrushTreePlainDumper(const CrushWrapper
*crush
,
2658 const CrushTreeDumper::name_map_t
& wsnames
)
2659 : Parent(crush
, wsnames
) {}
2660 explicit CrushTreePlainDumper(const CrushWrapper
*crush
,
2661 const CrushTreeDumper::name_map_t
& wsnames
,
2663 : Parent(crush
, wsnames
, show_shadow
) {}
2666 void dump(TextTable
*tbl
) {
2667 tbl
->define_column("ID", TextTable::LEFT
, TextTable::RIGHT
);
2668 tbl
->define_column("CLASS", TextTable::LEFT
, TextTable::RIGHT
);
2669 tbl
->define_column("WEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
2670 for (auto& p
: crush
->choose_args
) {
2671 if (p
.first
== CrushWrapper::DEFAULT_CHOOSE_ARGS
) {
2672 tbl
->define_column("(compat)", TextTable::LEFT
, TextTable::RIGHT
);
2675 auto q
= weight_set_names
.find(p
.first
);
2676 name
= q
!= weight_set_names
.end() ? q
->second
:
2678 tbl
->define_column(name
.c_str(), TextTable::LEFT
, TextTable::RIGHT
);
2681 tbl
->define_column("TYPE NAME", TextTable::LEFT
, TextTable::LEFT
);
2686 void dump_item(const CrushTreeDumper::Item
&qi
, TextTable
*tbl
) override
{
2687 const char *c
= crush
->get_item_class(qi
.id
);
2692 << weightf_t(qi
.weight
);
2693 for (auto& p
: crush
->choose_args
) {
2694 if (qi
.parent
< 0) {
2695 const crush_choose_arg_map cmap
= crush
->choose_args_get(p
.first
);
2696 int bidx
= -1 - qi
.parent
;
2697 const crush_bucket
*b
= crush
->get_bucket(qi
.parent
);
2699 bidx
< (int)cmap
.size
&&
2700 cmap
.args
[bidx
].weight_set
&&
2701 cmap
.args
[bidx
].weight_set_size
>= 1) {
2704 pos
< (int)cmap
.args
[bidx
].weight_set
[0].size
&&
2705 b
->items
[pos
] != qi
.id
;
2707 *tbl
<< weightf_t((float)cmap
.args
[bidx
].weight_set
[0].weights
[pos
] /
2715 for (int k
=0; k
< qi
.depth
; k
++) {
2718 if (qi
.is_bucket()) {
2719 ss
<< crush
->get_type_name(crush
->get_bucket_type(qi
.id
)) << " "
2720 << crush
->get_item_name(qi
.id
);
2722 ss
<< "osd." << qi
.id
;
2725 *tbl
<< TextTable::endrow
;
2730 class CrushTreeFormattingDumper
: public CrushTreeDumper::FormattingDumper
{
2732 typedef CrushTreeDumper::FormattingDumper Parent
;
2734 explicit CrushTreeFormattingDumper(
2735 const CrushWrapper
*crush
,
2736 const CrushTreeDumper::name_map_t
& wsnames
)
2737 : Parent(crush
, wsnames
) {}
2739 explicit CrushTreeFormattingDumper(
2740 const CrushWrapper
*crush
,
2741 const CrushTreeDumper::name_map_t
& wsnames
,
2743 : Parent(crush
, wsnames
, show_shadow
) {}
2745 void dump(Formatter
*f
) {
2746 f
->open_array_section("nodes");
2749 f
->open_array_section("stray");
2755 void CrushWrapper::dump_tree(
2758 const CrushTreeDumper::name_map_t
& weight_set_names
,
2759 bool show_shadow
) const
2763 CrushTreePlainDumper(this, weight_set_names
, show_shadow
).dump(&tbl
);
2767 CrushTreeFormattingDumper(this, weight_set_names
, show_shadow
).dump(f
);
2771 void CrushWrapper::generate_test_instances(list
<CrushWrapper
*>& o
)
2773 o
.push_back(new CrushWrapper
);
2778 * Determine the default CRUSH ruleset ID to be used with
2779 * newly created replicated pools.
2781 * @returns a ruleset ID (>=0) or -1 if no suitable ruleset found
2783 int CrushWrapper::get_osd_pool_default_crush_replicated_ruleset(CephContext
*cct
)
2785 int crush_ruleset
= cct
->_conf
->osd_pool_default_crush_rule
;
2786 if (crush_ruleset
< 0) {
2787 crush_ruleset
= find_first_ruleset(pg_pool_t::TYPE_REPLICATED
);
2788 } else if (!ruleset_exists(crush_ruleset
)) {
2789 crush_ruleset
= -1; // match find_first_ruleset() retval
2791 return crush_ruleset
;
2794 bool CrushWrapper::is_valid_crush_name(const string
& s
)
2798 for (string::const_iterator p
= s
.begin(); p
!= s
.end(); ++p
) {
2802 !(*p
>= '0' && *p
<= '9') &&
2803 !(*p
>= 'A' && *p
<= 'Z') &&
2804 !(*p
>= 'a' && *p
<= 'z'))
2810 bool CrushWrapper::is_valid_crush_loc(CephContext
*cct
,
2811 const map
<string
,string
>& loc
)
2813 for (map
<string
,string
>::const_iterator l
= loc
.begin(); l
!= loc
.end(); ++l
) {
2814 if (!is_valid_crush_name(l
->first
) ||
2815 !is_valid_crush_name(l
->second
)) {
2816 ldout(cct
, 1) << "loc["
2817 << l
->first
<< "] = '"
2818 << l
->second
<< "' not a valid crush name ([A-Za-z0-9_-.]+)"
2826 int CrushWrapper::_choose_type_stack(
2828 const vector
<pair
<int,int>>& stack
,
2829 const set
<int>& overfull
,
2830 const vector
<int>& underfull
,
2831 const vector
<int>& orig
,
2832 vector
<int>::const_iterator
& i
,
2834 vector
<int> *pw
) const
2836 vector
<int> w
= *pw
;
2839 ldout(cct
, 10) << __func__
<< " stack " << stack
2845 vector
<int> cumulative_fanout(stack
.size());
2847 for (int j
= (int)stack
.size() - 1; j
>= 0; --j
) {
2848 cumulative_fanout
[j
] = f
;
2849 f
*= stack
[j
].second
;
2851 ldout(cct
, 10) << __func__
<< " cumulative_fanout " << cumulative_fanout
2854 // identify underful targets for each intermediate level.
2855 // this serves two purposes:
2856 // 1. we can tell when we are selecting a bucket that does not have any underfull
2857 // devices beneath it. that means that if the current input includes an overfull
2858 // device, we won't be able to find an underfull device with this parent to
2860 // 2. when we decide we should reject a bucket due to the above, this list gives us
2861 // a list of peers to consider that *do* have underfull devices available.. (we
2862 // are careful to pick one that has the same parent.)
2863 vector
<set
<int>> underfull_buckets
; // level -> set of buckets with >0 underfull item(s)
2864 underfull_buckets
.resize(stack
.size() - 1);
2865 for (auto osd
: underfull
) {
2867 for (int j
= (int)stack
.size() - 2; j
>= 0; --j
) {
2868 int type
= stack
[j
].first
;
2869 item
= get_parent_of_type(item
, type
);
2870 ldout(cct
, 10) << __func__
<< " underfull " << osd
<< " type " << type
2871 << " is " << item
<< dendl
;
2872 underfull_buckets
[j
].insert(item
);
2875 ldout(cct
, 20) << __func__
<< " underfull_buckets " << underfull_buckets
<< dendl
;
2877 for (unsigned j
= 0; j
< stack
.size(); ++j
) {
2878 int type
= stack
[j
].first
;
2879 int fanout
= stack
[j
].second
;
2880 int cum_fanout
= cumulative_fanout
[j
];
2881 ldout(cct
, 10) << " level " << j
<< ": type " << type
<< " fanout " << fanout
2882 << " cumulative " << cum_fanout
2883 << " w " << w
<< dendl
;
2886 for (auto from
: w
) {
2887 ldout(cct
, 10) << " from " << from
<< dendl
;
2888 // identify leaves under each choice. we use this to check whether any of these
2889 // leaves are overfull. (if so, we need to make sure there are underfull candidates
2890 // to swap for them.)
2891 vector
<set
<int>> leaves
;
2892 leaves
.resize(fanout
);
2893 for (int pos
= 0; pos
< fanout
; ++pos
) {
2896 int item
= get_parent_of_type(*tmpi
, type
);
2899 while (n
-- && tmpi
!= orig
.end()) {
2900 leaves
[pos
].insert(*tmpi
++);
2902 ldout(cct
, 10) << __func__
<< " from " << *tmpi
<< " got " << item
2903 << " of type " << type
<< " over leaves " << leaves
[pos
] << dendl
;
2906 bool replaced
= false;
2907 if (overfull
.count(*i
)) {
2908 for (auto item
: underfull
) {
2909 ldout(cct
, 10) << __func__
<< " pos " << pos
2910 << " was " << *i
<< " considering " << item
2912 if (used
.count(item
)) {
2913 ldout(cct
, 20) << __func__
<< " in used " << used
<< dendl
;
2916 if (!subtree_contains(from
, item
)) {
2917 ldout(cct
, 20) << __func__
<< " not in subtree " << from
<< dendl
;
2920 if (std::find(orig
.begin(), orig
.end(), item
) != orig
.end()) {
2921 ldout(cct
, 20) << __func__
<< " in orig " << orig
<< dendl
;
2926 ldout(cct
, 10) << __func__
<< " pos " << pos
<< " replace "
2927 << *i
<< " -> " << item
<< dendl
;
2934 ldout(cct
, 10) << __func__
<< " pos " << pos
<< " keep " << *i
2939 if (i
== orig
.end()) {
2940 ldout(cct
, 10) << __func__
<< " end of orig, break 1" << dendl
;
2945 if (j
+ 1 < stack
.size()) {
2946 // check if any buckets have overfull leaves but no underfull candidates
2947 for (int pos
= 0; pos
< fanout
; ++pos
) {
2948 if (underfull_buckets
[j
].count(o
[pos
]) == 0) {
2949 // are any leaves overfull?
2950 bool any_overfull
= false;
2951 for (auto osd
: leaves
[pos
]) {
2952 if (overfull
.count(osd
)) {
2953 any_overfull
= true;
2957 ldout(cct
, 10) << " bucket " << o
[pos
] << " has no underfull targets and "
2958 << ">0 leaves " << leaves
[pos
] << " is overfull; alts "
2959 << underfull_buckets
[j
]
2961 for (auto alt
: underfull_buckets
[j
]) {
2962 if (std::find(o
.begin(), o
.end(), alt
) == o
.end()) {
2963 // see if alt has the same parent
2965 get_parent_of_type(o
[pos
], stack
[j
-1].first
) ==
2966 get_parent_of_type(alt
, stack
[j
-1].first
)) {
2968 ldout(cct
, 10) << " replacing " << o
[pos
]
2969 << " (which has no underfull leaves) with " << alt
2971 << get_parent_of_type(alt
, stack
[j
-1].first
) << " type "
2972 << type
<< ")" << dendl
;
2974 ldout(cct
, 10) << " replacing " << o
[pos
]
2975 << " (which has no underfull leaves) with " << alt
2976 << " (first level)" << dendl
;
2980 ldout(cct
, 30) << " alt " << alt
<< " for " << o
[pos
]
2981 << " has different parent, skipping" << dendl
;
2989 if (i
== orig
.end()) {
2990 ldout(cct
, 10) << __func__
<< " end of orig, break 2" << dendl
;
2994 ldout(cct
, 10) << __func__
<< " w <- " << o
<< " was " << w
<< dendl
;
3001 int CrushWrapper::try_remap_rule(
3005 const set
<int>& overfull
,
3006 const vector
<int>& underfull
,
3007 const vector
<int>& orig
,
3008 vector
<int> *out
) const
3010 const crush_map
*map
= crush
;
3011 const crush_rule
*rule
= get_rule(ruleno
);
3014 ldout(cct
, 10) << __func__
<< " ruleno " << ruleno
3015 << " numrep " << maxout
<< " overfull " << overfull
3016 << " underfull " << underfull
<< " orig " << orig
3018 vector
<int> w
; // working set
3021 auto i
= orig
.begin();
3024 vector
<pair
<int,int>> type_stack
; // (type, fan-out)
3026 for (unsigned step
= 0; step
< rule
->len
; ++step
) {
3027 const crush_rule_step
*curstep
= &rule
->steps
[step
];
3028 ldout(cct
, 10) << __func__
<< " step " << step
<< " w " << w
<< dendl
;
3029 switch (curstep
->op
) {
3030 case CRUSH_RULE_TAKE
:
3031 if ((curstep
->arg1
>= 0 && curstep
->arg1
< map
->max_devices
) ||
3032 (-1-curstep
->arg1
>= 0 && -1-curstep
->arg1
< map
->max_buckets
&&
3033 map
->buckets
[-1-curstep
->arg1
])) {
3035 w
.push_back(curstep
->arg1
);
3036 ldout(cct
, 10) << __func__
<< " take " << w
<< dendl
;
3038 ldout(cct
, 1) << " bad take value " << curstep
->arg1
<< dendl
;
3042 case CRUSH_RULE_CHOOSELEAF_FIRSTN
:
3043 case CRUSH_RULE_CHOOSELEAF_INDEP
:
3045 int numrep
= curstep
->arg1
;
3046 int type
= curstep
->arg2
;
3049 type_stack
.push_back(make_pair(type
, numrep
));
3050 type_stack
.push_back(make_pair(0, 1));
3051 int r
= _choose_type_stack(cct
, type_stack
, overfull
, underfull
, orig
,
3059 case CRUSH_RULE_CHOOSE_FIRSTN
:
3060 case CRUSH_RULE_CHOOSE_INDEP
:
3062 int numrep
= curstep
->arg1
;
3063 int type
= curstep
->arg2
;
3066 type_stack
.push_back(make_pair(type
, numrep
));
3070 case CRUSH_RULE_EMIT
:
3071 ldout(cct
, 10) << " emit " << w
<< dendl
;
3072 if (!type_stack
.empty()) {
3073 int r
= _choose_type_stack(cct
, type_stack
, overfull
, underfull
, orig
,
3079 for (auto item
: w
) {
3080 out
->push_back(item
);
3095 int CrushWrapper::_choose_args_adjust_item_weight_in_bucket(
3097 crush_choose_arg_map cmap
,
3100 const vector
<int>& weight
,
3104 int bidx
= -1 - bucketid
;
3105 crush_bucket
*b
= crush
->buckets
[bidx
];
3106 if (bidx
>= (int)cmap
.size
) {
3108 *ss
<< "no weight-set for bucket " << b
->id
;
3109 ldout(cct
, 10) << __func__
<< " no crush_choose_arg for bucket " << b
->id
3113 crush_choose_arg
*carg
= &cmap
.args
[bidx
];
3114 if (carg
->weight_set
== NULL
) {
3116 *ss
<< "no weight-set for bucket " << b
->id
;
3117 ldout(cct
, 10) << __func__
<< " no weight_set for bucket " << b
->id
3121 if (carg
->weight_set_size
!= weight
.size()) {
3123 *ss
<< "weight_set_size != " << weight
.size() << " for bucket " << b
->id
;
3124 ldout(cct
, 10) << __func__
<< " weight_set_size != " << weight
.size()
3125 << " for bucket " << b
->id
<< dendl
;
3128 for (unsigned i
= 0; i
< b
->size
; i
++) {
3129 if (b
->items
[i
] == id
) {
3130 for (unsigned j
= 0; j
< weight
.size(); ++j
) {
3131 carg
->weight_set
[j
].weights
[i
] = weight
[j
];
3133 ldout(cct
, 5) << __func__
<< " set " << id
<< " to " << weight
3134 << " in bucket " << b
->id
<< dendl
;
3139 vector
<int> bucket_weight(weight
.size(), 0);
3140 for (unsigned i
= 0; i
< b
->size
; i
++) {
3141 for (unsigned j
= 0; j
< weight
.size(); ++j
) {
3142 bucket_weight
[j
] += carg
->weight_set
[j
].weights
[i
];
3145 choose_args_adjust_item_weight(cct
, cmap
, b
->id
, bucket_weight
, nullptr);
3150 int CrushWrapper::choose_args_adjust_item_weight(
3152 crush_choose_arg_map cmap
,
3154 const vector
<int>& weight
,
3157 ldout(cct
, 5) << __func__
<< " " << id
<< " weight " << weight
<< dendl
;
3159 for (int bidx
= 0; bidx
< crush
->max_buckets
; bidx
++) {
3160 crush_bucket
*b
= crush
->buckets
[bidx
];
3164 changed
+= _choose_args_adjust_item_weight_in_bucket(
3165 cct
, cmap
, b
->id
, id
, weight
, ss
);
3169 *ss
<< "item " << id
<< " not found in crush map";