1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "osd/osd_types.h"
5 #include "common/debug.h"
6 #include "common/Formatter.h"
7 #include "common/errno.h"
8 #include "common/TextTable.h"
9 #include "include/stringify.h"
11 #include "CrushWrapper.h"
12 #include "CrushTreeDumper.h"
14 #define dout_subsys ceph_subsys_crush
16 bool CrushWrapper::has_legacy_rule_ids() const
18 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
19 crush_rule
*r
= crush
->rules
[i
];
21 r
->mask
.ruleset
!= i
) {
28 std::map
<int, int> CrushWrapper::renumber_rules()
30 std::map
<int, int> result
;
31 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
32 crush_rule
*r
= crush
->rules
[i
];
33 if (r
&& r
->mask
.ruleset
!= i
) {
34 result
[r
->mask
.ruleset
] = i
;
41 bool CrushWrapper::has_non_straw2_buckets() const
43 for (int i
=0; i
<crush
->max_buckets
; ++i
) {
44 crush_bucket
*b
= crush
->buckets
[i
];
47 if (b
->alg
!= CRUSH_BUCKET_STRAW2
)
53 bool CrushWrapper::has_v2_rules() const
55 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
63 bool CrushWrapper::is_v2_rule(unsigned ruleid
) const
65 // check rule for use of indep or new SET_* rule steps
66 if (ruleid
>= crush
->max_rules
)
68 crush_rule
*r
= crush
->rules
[ruleid
];
71 for (unsigned j
=0; j
<r
->len
; j
++) {
72 if (r
->steps
[j
].op
== CRUSH_RULE_CHOOSE_INDEP
||
73 r
->steps
[j
].op
== CRUSH_RULE_CHOOSELEAF_INDEP
||
74 r
->steps
[j
].op
== CRUSH_RULE_SET_CHOOSE_TRIES
||
75 r
->steps
[j
].op
== CRUSH_RULE_SET_CHOOSELEAF_TRIES
) {
82 bool CrushWrapper::has_v3_rules() const
84 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
92 bool CrushWrapper::is_v3_rule(unsigned ruleid
) const
94 // check rule for use of SET_CHOOSELEAF_VARY_R step
95 if (ruleid
>= crush
->max_rules
)
97 crush_rule
*r
= crush
->rules
[ruleid
];
100 for (unsigned j
=0; j
<r
->len
; j
++) {
101 if (r
->steps
[j
].op
== CRUSH_RULE_SET_CHOOSELEAF_VARY_R
) {
108 bool CrushWrapper::has_v4_buckets() const
110 for (int i
=0; i
<crush
->max_buckets
; ++i
) {
111 crush_bucket
*b
= crush
->buckets
[i
];
114 if (b
->alg
== CRUSH_BUCKET_STRAW2
)
120 bool CrushWrapper::has_v5_rules() const
122 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
130 bool CrushWrapper::is_v5_rule(unsigned ruleid
) const
132 // check rule for use of SET_CHOOSELEAF_STABLE step
133 if (ruleid
>= crush
->max_rules
)
135 crush_rule
*r
= crush
->rules
[ruleid
];
138 for (unsigned j
=0; j
<r
->len
; j
++) {
139 if (r
->steps
[j
].op
== CRUSH_RULE_SET_CHOOSELEAF_STABLE
) {
146 bool CrushWrapper::has_choose_args() const
148 return !choose_args
.empty();
151 bool CrushWrapper::has_incompat_choose_args() const
153 if (choose_args
.empty())
155 if (choose_args
.size() > 1)
157 if (choose_args
.begin()->first
!= DEFAULT_CHOOSE_ARGS
)
159 crush_choose_arg_map arg_map
= choose_args
.begin()->second
;
160 for (__u32 i
= 0; i
< arg_map
.size
; i
++) {
161 crush_choose_arg
*arg
= &arg_map
.args
[i
];
162 if (arg
->weight_set_positions
== 0 &&
165 if (arg
->weight_set_positions
!= 1)
167 if (arg
->ids_size
!= 0)
173 int CrushWrapper::split_id_class(int i
, int *idout
, int *classout
) const
177 string name
= get_item_name(i
);
178 size_t pos
= name
.find("~");
179 if (pos
== string::npos
) {
184 string name_no_class
= name
.substr(0, pos
);
185 if (!name_exists(name_no_class
))
187 string class_name
= name
.substr(pos
+ 1);
188 if (!class_exists(class_name
))
190 *idout
= get_item_id(name_no_class
);
191 *classout
= get_class_id(class_name
);
195 int CrushWrapper::can_rename_item(const string
& srcname
,
196 const string
& dstname
,
199 if (name_exists(srcname
)) {
200 if (name_exists(dstname
)) {
201 *ss
<< "dstname = '" << dstname
<< "' already exists";
204 if (is_valid_crush_name(dstname
)) {
207 *ss
<< "dstname = '" << dstname
<< "' does not match [-_.0-9a-zA-Z]+";
211 if (name_exists(dstname
)) {
212 *ss
<< "srcname = '" << srcname
<< "' does not exist "
213 << "and dstname = '" << dstname
<< "' already exists";
216 *ss
<< "srcname = '" << srcname
<< "' does not exist";
222 int CrushWrapper::rename_item(const string
& srcname
,
223 const string
& dstname
,
226 int ret
= can_rename_item(srcname
, dstname
, ss
);
229 int oldid
= get_item_id(srcname
);
230 return set_item_name(oldid
, dstname
);
233 int CrushWrapper::can_rename_bucket(const string
& srcname
,
234 const string
& dstname
,
237 int ret
= can_rename_item(srcname
, dstname
, ss
);
240 int srcid
= get_item_id(srcname
);
242 *ss
<< "srcname = '" << srcname
<< "' is not a bucket "
243 << "because its id = " << srcid
<< " is >= 0";
249 int CrushWrapper::rename_bucket(const string
& srcname
,
250 const string
& dstname
,
253 int ret
= can_rename_bucket(srcname
, dstname
, ss
);
256 int oldid
= get_item_id(srcname
);
257 return set_item_name(oldid
, dstname
);
260 int CrushWrapper::rename_rule(const string
& srcname
,
261 const string
& dstname
,
264 if (!rule_exists(srcname
)) {
266 *ss
<< "source rule name '" << srcname
<< "' does not exist";
270 if (rule_exists(dstname
)) {
272 *ss
<< "destination rule name '" << dstname
<< "' already exists";
276 int rule_id
= get_rule_id(srcname
);
277 auto it
= rule_name_map
.find(rule_id
);
278 assert(it
!= rule_name_map
.end());
279 it
->second
= dstname
;
281 rule_name_rmap
.erase(srcname
);
282 rule_name_rmap
[dstname
] = rule_id
;
287 void CrushWrapper::find_takes(set
<int> *roots
) const
289 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
290 crush_rule
*r
= crush
->rules
[i
];
293 for (unsigned j
=0; j
<r
->len
; j
++) {
294 if (r
->steps
[j
].op
== CRUSH_RULE_TAKE
)
295 roots
->insert(r
->steps
[j
].arg1
);
300 void CrushWrapper::find_takes_by_rule(int rule
, set
<int> *roots
) const
302 if (rule
< 0 || rule
>= (int)crush
->max_rules
)
304 crush_rule
*r
= crush
->rules
[rule
];
307 for (unsigned i
= 0; i
< r
->len
; i
++) {
308 if (r
->steps
[i
].op
== CRUSH_RULE_TAKE
)
309 roots
->insert(r
->steps
[i
].arg1
);
313 void CrushWrapper::find_roots(set
<int> *roots
) const
315 for (int i
= 0; i
< crush
->max_buckets
; i
++) {
316 if (!crush
->buckets
[i
])
318 crush_bucket
*b
= crush
->buckets
[i
];
319 if (!_search_item_exists(b
->id
))
320 roots
->insert(b
->id
);
324 bool CrushWrapper::subtree_contains(int root
, int item
) const
330 return false; // root is a leaf
332 const crush_bucket
*b
= get_bucket(root
);
336 for (unsigned j
=0; j
<b
->size
; j
++) {
337 if (subtree_contains(b
->items
[j
], item
))
343 bool CrushWrapper::_maybe_remove_last_instance(CephContext
*cct
, int item
, bool unlink_only
)
346 if (_search_item_exists(item
)) {
349 if (item
< 0 && _bucket_is_in_use(item
)) {
353 if (item
< 0 && !unlink_only
) {
354 crush_bucket
*t
= get_bucket(item
);
355 ldout(cct
, 5) << "_maybe_remove_last_instance removing bucket " << item
<< dendl
;
356 crush_remove_bucket(crush
, t
);
357 if (class_bucket
.count(item
) != 0)
358 class_bucket
.erase(item
);
359 class_remove_item(item
);
360 update_choose_args(cct
);
362 if ((item
>= 0 || !unlink_only
) && name_map
.count(item
)) {
363 ldout(cct
, 5) << "_maybe_remove_last_instance removing name for item " << item
<< dendl
;
364 name_map
.erase(item
);
366 if (item
>= 0 && !unlink_only
) {
367 class_remove_item(item
);
370 rebuild_roots_with_classes();
374 int CrushWrapper::remove_root(int item
)
376 crush_bucket
*b
= get_bucket(item
);
378 // should be idempotent
379 // e.g.: we use 'crush link' to link same host into
380 // different roots, which as a result can cause different
381 // shadow trees reference same hosts too. This means
382 // we may need to destory the same buckets(hosts, racks, etc.)
383 // multiple times during rebuilding all shadow trees.
387 for (unsigned n
= 0; n
< b
->size
; n
++) {
388 if (b
->items
[n
] >= 0)
390 int r
= remove_root(b
->items
[n
]);
395 crush_remove_bucket(crush
, b
);
396 if (name_map
.count(item
) != 0) {
397 name_map
.erase(item
);
400 if (class_bucket
.count(item
) != 0)
401 class_bucket
.erase(item
);
402 class_remove_item(item
);
403 update_choose_args(nullptr);
407 void CrushWrapper::update_choose_args(CephContext
*cct
)
409 for (auto& i
: choose_args
) {
410 crush_choose_arg_map
&arg_map
= i
.second
;
411 unsigned positions
= get_choose_args_positions(arg_map
);
412 for (int j
= 0; j
< crush
->max_buckets
; ++j
) {
413 crush_bucket
*b
= crush
->buckets
[j
];
414 auto& carg
= arg_map
.args
[j
];
415 // strip out choose_args for any buckets that no longer exist
416 if (!b
|| b
->alg
!= CRUSH_BUCKET_STRAW2
) {
419 ldout(cct
,0) << __func__
<< " removing " << i
.first
<< " bucket "
420 << (-1-j
) << " ids" << dendl
;
425 if (carg
.weight_set
) {
427 ldout(cct
,0) << __func__
<< " removing " << i
.first
<< " bucket "
428 << (-1-j
) << " weight_sets" << dendl
;
429 for (unsigned p
= 0; p
< carg
.weight_set_positions
; ++p
) {
430 free(carg
.weight_set
[p
].weights
);
432 free(carg
.weight_set
);
434 carg
.weight_set_positions
= 0;
438 if (carg
.weight_set_positions
== 0) {
441 if (carg
.weight_set_positions
!= positions
) {
443 lderr(cct
) << __func__
<< " " << i
.first
<< " bucket "
444 << (-1-j
) << " positions " << carg
.weight_set_positions
445 << " -> " << positions
<< dendl
;
446 continue; // wth... skip!
448 // mis-sized weight_sets? this shouldn't ever happen.
449 for (unsigned p
= 0; p
< positions
; ++p
) {
450 if (carg
.weight_set
[p
].size
!= b
->size
) {
452 lderr(cct
) << __func__
<< " fixing " << i
.first
<< " bucket "
453 << (-1-j
) << " position " << p
454 << " size " << carg
.weight_set
[p
].size
<< " -> "
456 auto old_ws
= carg
.weight_set
[p
];
457 carg
.weight_set
[p
].size
= b
->size
;
458 carg
.weight_set
[p
].weights
= (__u32
*)calloc(b
->size
, sizeof(__u32
));
459 auto max
= std::min
<unsigned>(old_ws
.size
, b
->size
);
460 for (unsigned k
= 0; k
< max
; ++k
) {
461 carg
.weight_set
[p
].weights
[k
] = old_ws
.weights
[k
];
463 free(old_ws
.weights
);
470 int CrushWrapper::remove_item(CephContext
*cct
, int item
, bool unlink_only
)
472 ldout(cct
, 5) << "remove_item " << item
473 << (unlink_only
? " unlink_only":"") << dendl
;
477 if (item
< 0 && !unlink_only
) {
478 crush_bucket
*t
= get_bucket(item
);
480 ldout(cct
, 1) << "remove_item bucket " << item
<< " does not exist"
486 ldout(cct
, 1) << "remove_item bucket " << item
<< " has " << t
->size
487 << " items, not empty" << dendl
;
490 if (_bucket_is_in_use(item
)) {
495 for (int i
= 0; i
< crush
->max_buckets
; i
++) {
496 if (!crush
->buckets
[i
])
498 crush_bucket
*b
= crush
->buckets
[i
];
500 for (unsigned i
=0; i
<b
->size
; ++i
) {
501 int id
= b
->items
[i
];
503 ldout(cct
, 5) << "remove_item removing item " << item
504 << " from bucket " << b
->id
<< dendl
;
505 for (auto& p
: choose_args
) {
506 // weight down each weight-set to 0 before we remove the item
507 vector
<int> weightv(get_choose_args_positions(p
.second
), 0);
508 choose_args_adjust_item_weight(cct
, p
.second
, item
, weightv
, nullptr);
510 bucket_remove_item(b
, item
);
511 adjust_item_weight(cct
, b
->id
, b
->weight
);
517 if (_maybe_remove_last_instance(cct
, item
, unlink_only
))
523 bool CrushWrapper::_search_item_exists(int item
) const
525 for (int i
= 0; i
< crush
->max_buckets
; i
++) {
526 if (!crush
->buckets
[i
])
528 crush_bucket
*b
= crush
->buckets
[i
];
529 for (unsigned j
=0; j
<b
->size
; ++j
) {
530 if (b
->items
[j
] == item
)
537 bool CrushWrapper::_bucket_is_in_use(int item
)
539 for (auto &i
: class_bucket
)
540 for (auto &j
: i
.second
)
541 if (j
.second
== item
)
543 for (unsigned i
= 0; i
< crush
->max_rules
; ++i
) {
544 crush_rule
*r
= crush
->rules
[i
];
547 for (unsigned j
= 0; j
< r
->len
; ++j
) {
548 if (r
->steps
[j
].op
== CRUSH_RULE_TAKE
) {
549 int step_item
= r
->steps
[j
].arg1
;
552 int res
= split_id_class(step_item
, &original_item
, &c
);
555 if (step_item
== item
|| original_item
== item
)
563 int CrushWrapper::_remove_item_under(
564 CephContext
*cct
, int item
, int ancestor
, bool unlink_only
)
566 ldout(cct
, 5) << "_remove_item_under " << item
<< " under " << ancestor
567 << (unlink_only
? " unlink_only":"") << dendl
;
573 if (!bucket_exists(ancestor
))
578 crush_bucket
*b
= get_bucket(ancestor
);
579 for (unsigned i
=0; i
<b
->size
; ++i
) {
580 int id
= b
->items
[i
];
582 ldout(cct
, 5) << "_remove_item_under removing item " << item
583 << " from bucket " << b
->id
<< dendl
;
584 for (auto& p
: choose_args
) {
585 // weight down each weight-set to 0 before we remove the item
586 vector
<int> weightv(get_choose_args_positions(p
.second
), 0);
587 _choose_args_adjust_item_weight_in_bucket(
588 cct
, p
.second
, b
->id
, item
, weightv
, nullptr);
590 bucket_remove_item(b
, item
);
591 adjust_item_weight(cct
, b
->id
, b
->weight
);
594 int r
= remove_item_under(cct
, item
, id
, unlink_only
);
602 int CrushWrapper::remove_item_under(
603 CephContext
*cct
, int item
, int ancestor
, bool unlink_only
)
605 ldout(cct
, 5) << "remove_item_under " << item
<< " under " << ancestor
606 << (unlink_only
? " unlink_only":"") << dendl
;
608 if (!unlink_only
&& _bucket_is_in_use(item
)) {
612 int ret
= _remove_item_under(cct
, item
, ancestor
, unlink_only
);
616 if (item
< 0 && !unlink_only
) {
617 crush_bucket
*t
= get_bucket(item
);
619 ldout(cct
, 1) << "remove_item_under bucket " << item
620 << " does not exist" << dendl
;
625 ldout(cct
, 1) << "remove_item_under bucket " << item
<< " has " << t
->size
626 << " items, not empty" << dendl
;
631 if (_maybe_remove_last_instance(cct
, item
, unlink_only
))
637 int CrushWrapper::get_common_ancestor_distance(CephContext
*cct
, int id
,
638 const std::multimap
<string
,string
>& loc
)
640 ldout(cct
, 5) << __func__
<< " " << id
<< " " << loc
<< dendl
;
641 if (!item_exists(id
))
643 map
<string
,string
> id_loc
= get_full_location(id
);
644 ldout(cct
, 20) << " id is at " << id_loc
<< dendl
;
646 for (map
<int,string
>::const_iterator p
= type_map
.begin();
649 map
<string
,string
>::iterator ip
= id_loc
.find(p
->second
);
650 if (ip
== id_loc
.end())
652 for (std::multimap
<string
,string
>::const_iterator q
= loc
.find(p
->second
);
655 if (q
->first
!= p
->second
)
657 if (q
->second
== ip
->second
)
664 int CrushWrapper::parse_loc_map(const std::vector
<string
>& args
,
665 std::map
<string
,string
> *ploc
)
668 for (unsigned i
= 0; i
< args
.size(); ++i
) {
669 const char *s
= args
[i
].c_str();
670 const char *pos
= strchr(s
, '=');
673 string
key(s
, 0, pos
-s
);
676 (*ploc
)[key
] = value
;
683 int CrushWrapper::parse_loc_multimap(const std::vector
<string
>& args
,
684 std::multimap
<string
,string
> *ploc
)
687 for (unsigned i
= 0; i
< args
.size(); ++i
) {
688 const char *s
= args
[i
].c_str();
689 const char *pos
= strchr(s
, '=');
692 string
key(s
, 0, pos
-s
);
695 ploc
->insert(make_pair(key
, value
));
702 bool CrushWrapper::check_item_loc(CephContext
*cct
, int item
, const map
<string
,string
>& loc
,
705 ldout(cct
, 5) << "check_item_loc item " << item
<< " loc " << loc
<< dendl
;
707 for (map
<int,string
>::const_iterator p
= type_map
.begin(); p
!= type_map
.end(); ++p
) {
712 // ignore types that aren't specified in loc
713 map
<string
,string
>::const_iterator q
= loc
.find(p
->second
);
714 if (q
== loc
.end()) {
715 ldout(cct
, 2) << "warning: did not specify location for '" << p
->second
<< "' level (levels are "
716 << type_map
<< ")" << dendl
;
720 if (!name_exists(q
->second
)) {
721 ldout(cct
, 5) << "check_item_loc bucket " << q
->second
<< " dne" << dendl
;
725 int id
= get_item_id(q
->second
);
727 ldout(cct
, 5) << "check_item_loc requested " << q
->second
<< " for type " << p
->second
728 << " is a device, not bucket" << dendl
;
732 assert(bucket_exists(id
));
733 crush_bucket
*b
= get_bucket(id
);
735 // see if item exists in this bucket
736 for (unsigned j
=0; j
<b
->size
; j
++) {
737 if (b
->items
[j
] == item
) {
738 ldout(cct
, 2) << "check_item_loc " << item
<< " exists in bucket " << b
->id
<< dendl
;
740 *weight
= crush_get_bucket_item_weight(b
, j
);
747 ldout(cct
, 2) << __func__
<< " item " << item
<< " loc " << loc
<< dendl
;
751 map
<string
, string
> CrushWrapper::get_full_location(int id
)
753 vector
<pair
<string
, string
> > full_location_ordered
;
754 map
<string
,string
> full_location
;
756 get_full_location_ordered(id
, full_location_ordered
);
758 std::copy(full_location_ordered
.begin(),
759 full_location_ordered
.end(),
760 std::inserter(full_location
, full_location
.begin()));
762 return full_location
;
765 int CrushWrapper::get_full_location_ordered(int id
, vector
<pair
<string
, string
> >& path
)
767 if (!item_exists(id
))
772 pair
<string
, string
> parent_coord
= get_immediate_parent(cur
, &ret
);
775 path
.push_back(parent_coord
);
776 cur
= get_item_id(parent_coord
.second
);
781 string
CrushWrapper::get_full_location_ordered_string(int id
)
783 vector
<pair
<string
, string
> > full_location_ordered
;
784 string full_location
;
785 get_full_location_ordered(id
, full_location_ordered
);
786 reverse(begin(full_location_ordered
), end(full_location_ordered
));
787 for(auto i
= full_location_ordered
.begin(); i
!= full_location_ordered
.end(); i
++) {
788 full_location
= full_location
+ i
->first
+ "=" + i
->second
;
789 if (i
!= full_location_ordered
.end() - 1) {
790 full_location
= full_location
+ ",";
793 return full_location
;
796 map
<int, string
> CrushWrapper::get_parent_hierarchy(int id
)
798 map
<int,string
> parent_hierarchy
;
799 pair
<string
, string
> parent_coord
= get_immediate_parent(id
);
802 // get the integer type for id and create a counter from there
803 int type_counter
= get_bucket_type(id
);
805 // if we get a negative type then we can assume that we have an OSD
806 // change behavior in get_item_type FIXME
807 if (type_counter
< 0)
810 // read the type map and get the name of the type with the largest ID
812 for (map
<int, string
>::iterator it
= type_map
.begin(); it
!= type_map
.end(); ++it
){
813 if ( (*it
).first
> high_type
)
814 high_type
= (*it
).first
;
817 parent_id
= get_item_id(parent_coord
.second
);
819 while (type_counter
< high_type
) {
821 parent_hierarchy
[ type_counter
] = parent_coord
.first
;
823 if (type_counter
< high_type
){
824 // get the coordinate information for the next parent
825 parent_coord
= get_immediate_parent(parent_id
);
826 parent_id
= get_item_id(parent_coord
.second
);
830 return parent_hierarchy
;
833 int CrushWrapper::get_children(int id
, list
<int> *children
)
840 crush_bucket
*b
= get_bucket(id
);
845 for (unsigned n
=0; n
<b
->size
; n
++) {
846 children
->push_back(b
->items
[n
]);
851 void CrushWrapper::get_children_of_type(int id
,
854 bool exclude_shadow
) const
859 children
->insert(id
);
863 auto b
= get_bucket(id
);
867 if (b
->type
< type
) {
870 } else if (b
->type
== type
) {
871 if (!is_shadow_item(b
->id
) || !exclude_shadow
) {
872 children
->insert(b
->id
);
876 for (unsigned n
= 0; n
< b
->size
; n
++) {
877 get_children_of_type(b
->items
[n
], type
, children
, exclude_shadow
);
881 int CrushWrapper::get_rule_failure_domain(int rule_id
)
883 crush_rule
*rule
= get_rule(rule_id
);
887 int type
= 0; // default to osd-level
888 for (unsigned s
= 0; s
< rule
->len
; ++s
) {
889 if ((rule
->steps
[s
].op
== CRUSH_RULE_CHOOSE_FIRSTN
||
890 rule
->steps
[s
].op
== CRUSH_RULE_CHOOSE_INDEP
||
891 rule
->steps
[s
].op
== CRUSH_RULE_CHOOSELEAF_FIRSTN
||
892 rule
->steps
[s
].op
== CRUSH_RULE_CHOOSELEAF_INDEP
) &&
893 rule
->steps
[s
].arg2
> type
) {
894 type
= rule
->steps
[s
].arg2
;
900 int CrushWrapper::_get_leaves(int id
, list
<int> *leaves
)
906 leaves
->push_back(id
);
910 crush_bucket
*b
= get_bucket(id
);
915 for (unsigned n
= 0; n
< b
->size
; n
++) {
916 if (b
->items
[n
] >= 0) {
917 leaves
->push_back(b
->items
[n
]);
919 // is a bucket, do recursive call
920 int r
= _get_leaves(b
->items
[n
], leaves
);
927 return 0; // all is well
930 int CrushWrapper::get_leaves(const string
&name
, set
<int> *leaves
)
935 if (!name_exists(name
)) {
939 int id
= get_item_id(name
);
947 int r
= _get_leaves(id
, &unordered
);
952 for (auto &p
: unordered
) {
959 int CrushWrapper::insert_item(
960 CephContext
*cct
, int item
, float weight
, string name
,
961 const map
<string
,string
>& loc
) // typename -> bucketname
963 ldout(cct
, 5) << "insert_item item " << item
<< " weight " << weight
964 << " name " << name
<< " loc " << loc
<< dendl
;
966 if (!is_valid_crush_name(name
))
969 if (!is_valid_crush_loc(cct
, loc
))
972 int r
= validate_weightf(weight
);
977 if (name_exists(name
)) {
978 if (get_item_id(name
) != item
) {
979 ldout(cct
, 10) << "device name '" << name
<< "' already exists as id "
980 << get_item_id(name
) << dendl
;
984 set_item_name(item
, name
);
989 // create locations if locations don't exist and add child in
990 // location with 0 weight the more detail in the insert_item method
991 // declaration in CrushWrapper.h
992 for (auto p
= type_map
.begin(); p
!= type_map
.end(); ++p
) {
993 // ignore device type
997 // skip types that are unspecified
998 map
<string
,string
>::const_iterator q
= loc
.find(p
->second
);
999 if (q
== loc
.end()) {
1000 ldout(cct
, 2) << "warning: did not specify location for '"
1001 << p
->second
<< "' level (levels are "
1002 << type_map
<< ")" << dendl
;
1006 if (!name_exists(q
->second
)) {
1007 ldout(cct
, 5) << "insert_item creating bucket " << q
->second
<< dendl
;
1008 int empty
= 0, newid
;
1009 int r
= add_bucket(0, 0,
1010 CRUSH_HASH_DEFAULT
, p
->first
, 1, &cur
, &empty
, &newid
);
1012 ldout(cct
, 1) << "add_bucket failure error: " << cpp_strerror(r
)
1016 set_item_name(newid
, q
->second
);
1022 // add to an existing bucket
1023 int id
= get_item_id(q
->second
);
1024 if (!bucket_exists(id
)) {
1025 ldout(cct
, 1) << "insert_item doesn't have bucket " << id
<< dendl
;
1029 // check that we aren't creating a cycle.
1030 if (subtree_contains(id
, cur
)) {
1031 ldout(cct
, 1) << "insert_item item " << cur
<< " already exists beneath "
1036 // we have done sanity check above
1037 crush_bucket
*b
= get_bucket(id
);
1039 if (p
->first
!= b
->type
) {
1040 ldout(cct
, 1) << "insert_item existing bucket has type "
1041 << "'" << type_map
[b
->type
] << "' != "
1042 << "'" << type_map
[p
->first
] << "'" << dendl
;
1046 // are we forming a loop?
1047 if (subtree_contains(cur
, b
->id
)) {
1048 ldout(cct
, 1) << "insert_item " << cur
<< " already contains " << b
->id
1049 << "; cannot form loop" << dendl
;
1053 ldout(cct
, 5) << "insert_item adding " << cur
<< " weight " << weight
1054 << " to bucket " << id
<< dendl
;
1055 int r
= bucket_add_item(b
, cur
, 0);
1060 // adjust the item's weight in location
1061 if (adjust_item_weightf_in_loc(cct
, item
, weight
, loc
) > 0) {
1062 if (item
>= crush
->max_devices
) {
1063 crush
->max_devices
= item
+ 1;
1064 ldout(cct
, 5) << "insert_item max_devices now " << crush
->max_devices
1067 r
= rebuild_roots_with_classes();
1069 ldout(cct
, 0) << __func__
<< " unable to rebuild roots with classes: "
1070 << cpp_strerror(r
) << dendl
;
1076 ldout(cct
, 1) << "error: didn't find anywhere to add item " << item
1077 << " in " << loc
<< dendl
;
1082 int CrushWrapper::move_bucket(
1083 CephContext
*cct
, int id
, const map
<string
,string
>& loc
)
1085 // sorry this only works for buckets
1089 if (!item_exists(id
))
1092 // get the name of the bucket we are trying to move for later
1093 string id_name
= get_item_name(id
);
1095 // detach the bucket
1096 int bucket_weight
= detach_bucket(cct
, id
);
1098 // insert the bucket back into the hierarchy
1099 return insert_item(cct
, id
, bucket_weight
/ (float)0x10000, id_name
, loc
);
1102 int CrushWrapper::detach_bucket(CephContext
*cct
, int item
)
1110 // check that the bucket that we want to detach exists
1111 assert(bucket_exists(item
));
1113 // get the bucket's weight
1114 crush_bucket
*b
= get_bucket(item
);
1115 unsigned bucket_weight
= b
->weight
;
1117 // get where the bucket is located
1118 pair
<string
, string
> bucket_location
= get_immediate_parent(item
);
1120 // get the id of the parent bucket
1121 int parent_id
= get_item_id(bucket_location
.second
);
1123 // get the parent bucket
1124 crush_bucket
*parent_bucket
= get_bucket(parent_id
);
1126 if (!IS_ERR(parent_bucket
)) {
1127 // zero out the bucket weight
1128 bucket_adjust_item_weight(cct
, parent_bucket
, item
, 0);
1129 adjust_item_weight(cct
, parent_bucket
->id
, parent_bucket
->weight
);
1130 for (auto& p
: choose_args
) {
1131 // weight down each weight-set to 0 before we remove the item
1132 vector
<int> weightv(get_choose_args_positions(p
.second
), 0);
1133 choose_args_adjust_item_weight(cct
, p
.second
, item
, weightv
, nullptr);
1136 // remove the bucket from the parent
1137 bucket_remove_item(parent_bucket
, item
);
1138 } else if (PTR_ERR(parent_bucket
) != -ENOENT
) {
1139 return PTR_ERR(parent_bucket
);
1142 // check that we're happy
1143 int test_weight
= 0;
1144 map
<string
,string
> test_location
;
1145 test_location
[ bucket_location
.first
] = (bucket_location
.second
);
1147 bool successful_detach
= !(check_item_loc(cct
, item
, test_location
,
1149 assert(successful_detach
);
1150 assert(test_weight
== 0);
1152 return bucket_weight
;
1155 int CrushWrapper::swap_bucket(CephContext
*cct
, int src
, int dst
)
1157 if (src
>= 0 || dst
>= 0)
1159 if (!item_exists(src
) || !item_exists(dst
))
1161 crush_bucket
*a
= get_bucket(src
);
1162 crush_bucket
*b
= get_bucket(dst
);
1163 unsigned aw
= a
->weight
;
1164 unsigned bw
= b
->weight
;
1167 adjust_item_weight(cct
, a
->id
, bw
);
1168 adjust_item_weight(cct
, b
->id
, aw
);
1171 map
<int,unsigned> tmp
;
1172 unsigned as
= a
->size
;
1173 unsigned bs
= b
->size
;
1174 for (unsigned i
= 0; i
< as
; ++i
) {
1175 int item
= a
->items
[0];
1176 int itemw
= crush_get_bucket_item_weight(a
, 0);
1178 bucket_remove_item(a
, item
);
1180 assert(a
->size
== 0);
1181 assert(b
->size
== bs
);
1182 for (unsigned i
= 0; i
< bs
; ++i
) {
1183 int item
= b
->items
[0];
1184 int itemw
= crush_get_bucket_item_weight(b
, 0);
1185 bucket_remove_item(b
, item
);
1186 bucket_add_item(a
, item
, itemw
);
1188 assert(a
->size
== bs
);
1189 assert(b
->size
== 0);
1190 for (auto t
: tmp
) {
1191 bucket_add_item(b
, t
.first
, t
.second
);
1193 assert(a
->size
== bs
);
1194 assert(b
->size
== as
);
1197 swap_names(src
, dst
);
1198 return rebuild_roots_with_classes();
1201 int CrushWrapper::link_bucket(
1202 CephContext
*cct
, int id
, const map
<string
,string
>& loc
)
1204 // sorry this only works for buckets
1208 if (!item_exists(id
))
1211 // get the name of the bucket we are trying to move for later
1212 string id_name
= get_item_name(id
);
1214 crush_bucket
*b
= get_bucket(id
);
1215 unsigned bucket_weight
= b
->weight
;
1217 return insert_item(cct
, id
, bucket_weight
/ (float)0x10000, id_name
, loc
);
1220 int CrushWrapper::create_or_move_item(
1221 CephContext
*cct
, int item
, float weight
, string name
,
1222 const map
<string
,string
>& loc
) // typename -> bucketname
1227 if (!is_valid_crush_name(name
))
1230 if (check_item_loc(cct
, item
, loc
, &old_iweight
)) {
1231 ldout(cct
, 5) << "create_or_move_item " << item
<< " already at " << loc
1234 if (_search_item_exists(item
)) {
1235 weight
= get_item_weightf(item
);
1236 ldout(cct
, 10) << "create_or_move_item " << item
1237 << " exists with weight " << weight
<< dendl
;
1238 remove_item(cct
, item
, true);
1240 ldout(cct
, 5) << "create_or_move_item adding " << item
1241 << " weight " << weight
1242 << " at " << loc
<< dendl
;
1243 ret
= insert_item(cct
, item
, weight
, name
, loc
);
1250 int CrushWrapper::update_item(
1251 CephContext
*cct
, int item
, float weight
, string name
,
1252 const map
<string
,string
>& loc
) // typename -> bucketname
1254 ldout(cct
, 5) << "update_item item " << item
<< " weight " << weight
1255 << " name " << name
<< " loc " << loc
<< dendl
;
1258 if (!is_valid_crush_name(name
))
1261 if (!is_valid_crush_loc(cct
, loc
))
1264 ret
= validate_weightf(weight
);
1269 // compare quantized (fixed-point integer) weights!
1270 int iweight
= (int)(weight
* (float)0x10000);
1272 if (check_item_loc(cct
, item
, loc
, &old_iweight
)) {
1273 ldout(cct
, 5) << "update_item " << item
<< " already at " << loc
<< dendl
;
1274 if (old_iweight
!= iweight
) {
1275 ldout(cct
, 5) << "update_item " << item
<< " adjusting weight "
1276 << ((float)old_iweight
/(float)0x10000) << " -> " << weight
1278 adjust_item_weight_in_loc(cct
, item
, iweight
, loc
);
1281 if (get_item_name(item
) != name
) {
1282 ldout(cct
, 5) << "update_item setting " << item
<< " name to " << name
1284 set_item_name(item
, name
);
1288 if (item_exists(item
)) {
1289 remove_item(cct
, item
, true);
1291 ldout(cct
, 5) << "update_item adding " << item
<< " weight " << weight
1292 << " at " << loc
<< dendl
;
1293 ret
= insert_item(cct
, item
, weight
, name
, loc
);
1300 int CrushWrapper::get_item_weight(int id
) const
1302 for (int bidx
= 0; bidx
< crush
->max_buckets
; bidx
++) {
1303 crush_bucket
*b
= crush
->buckets
[bidx
];
1308 for (unsigned i
= 0; i
< b
->size
; i
++)
1309 if (b
->items
[i
] == id
)
1310 return crush_get_bucket_item_weight(b
, i
);
1315 int CrushWrapper::get_item_weight_in_loc(int id
, const map
<string
,string
> &loc
)
1317 for (map
<string
,string
>::const_iterator l
= loc
.begin(); l
!= loc
.end(); ++l
) {
1319 int bid
= get_item_id(l
->second
);
1320 if (!bucket_exists(bid
))
1322 crush_bucket
*b
= get_bucket(bid
);
1323 for (unsigned int i
= 0; i
< b
->size
; i
++) {
1324 if (b
->items
[i
] == id
) {
1325 return crush_get_bucket_item_weight(b
, i
);
1332 int CrushWrapper::adjust_item_weight(CephContext
*cct
, int id
, int weight
)
1334 ldout(cct
, 5) << "adjust_item_weight " << id
<< " weight " << weight
<< dendl
;
1336 for (int bidx
= 0; bidx
< crush
->max_buckets
; bidx
++) {
1337 crush_bucket
*b
= crush
->buckets
[bidx
];
1340 for (unsigned i
= 0; i
< b
->size
; i
++) {
1341 if (b
->items
[i
] == id
) {
1342 int diff
= bucket_adjust_item_weight(cct
, b
, id
, weight
);
1343 ldout(cct
, 5) << "adjust_item_weight " << id
<< " diff " << diff
1344 << " in bucket " << bidx
<< dendl
;
1345 adjust_item_weight(cct
, -1 - bidx
, b
->weight
);
1355 int CrushWrapper::adjust_item_weight_in_loc(CephContext
*cct
, int id
, int weight
, const map
<string
,string
>& loc
)
1357 ldout(cct
, 5) << "adjust_item_weight_in_loc " << id
<< " weight " << weight
1358 << " in " << loc
<< dendl
;
1361 for (auto l
= loc
.begin(); l
!= loc
.end(); ++l
) {
1362 int bid
= get_item_id(l
->second
);
1363 if (!bucket_exists(bid
))
1365 crush_bucket
*b
= get_bucket(bid
);
1366 for (unsigned int i
= 0; i
< b
->size
; i
++) {
1367 if (b
->items
[i
] == id
) {
1368 int diff
= bucket_adjust_item_weight(cct
, b
, id
, weight
);
1369 ldout(cct
, 5) << "adjust_item_weight_in_loc " << id
<< " diff " << diff
1370 << " in bucket " << bid
<< dendl
;
1371 adjust_item_weight(cct
, bid
, b
->weight
);
1381 int CrushWrapper::adjust_subtree_weight(CephContext
*cct
, int id
, int weight
)
1383 ldout(cct
, 5) << __func__
<< " " << id
<< " weight " << weight
<< dendl
;
1384 crush_bucket
*b
= get_bucket(id
);
1388 list
<crush_bucket
*> q
;
1390 while (!q
.empty()) {
1393 int local_changed
= 0;
1394 for (unsigned i
=0; i
<b
->size
; ++i
) {
1395 int n
= b
->items
[i
];
1397 bucket_adjust_item_weight(cct
, b
, n
, weight
);
1401 crush_bucket
*sub
= get_bucket(n
);
1407 if (local_changed
) {
1408 adjust_item_weight(cct
, b
->id
, b
->weight
);
1414 bool CrushWrapper::check_item_present(int id
) const
1418 for (int bidx
= 0; bidx
< crush
->max_buckets
; bidx
++) {
1419 crush_bucket
*b
= crush
->buckets
[bidx
];
1422 for (unsigned i
= 0; i
< b
->size
; i
++)
1423 if (b
->items
[i
] == id
)
1430 pair
<string
,string
> CrushWrapper::get_immediate_parent(int id
, int *_ret
)
1433 for (int bidx
= 0; bidx
< crush
->max_buckets
; bidx
++) {
1434 crush_bucket
*b
= crush
->buckets
[bidx
];
1437 if (is_shadow_item(b
->id
))
1439 for (unsigned i
= 0; i
< b
->size
; i
++)
1440 if (b
->items
[i
] == id
) {
1441 string parent_id
= name_map
[b
->id
];
1442 string parent_bucket_type
= type_map
[b
->type
];
1445 return make_pair(parent_bucket_type
, parent_id
);
1452 return pair
<string
, string
>();
1455 int CrushWrapper::get_immediate_parent_id(int id
, int *parent
) const
1457 for (int bidx
= 0; bidx
< crush
->max_buckets
; bidx
++) {
1458 crush_bucket
*b
= crush
->buckets
[bidx
];
1461 if (is_shadow_item(b
->id
))
1463 for (unsigned i
= 0; i
< b
->size
; i
++) {
1464 if (b
->items
[i
] == id
) {
1473 int CrushWrapper::get_parent_of_type(int item
, int type
, int rule
) const
1476 // no rule specified
1478 int r
= get_immediate_parent_id(item
, &item
);
1482 } while (get_bucket_type(item
) != type
);
1486 find_takes_by_rule(rule
, &roots
);
1487 for (auto root
: roots
) {
1488 set
<int> candidates
;
1489 get_children_of_type(root
, type
, &candidates
, false);
1490 for (auto candidate
: candidates
) {
1491 if (subtree_contains(candidate
, item
)) {
1492 // note that here we assure that no two different buckets
1493 // from a single crush rule will share a same device,
1494 // which should generally be true.
1499 return 0; // not found
1502 int CrushWrapper::rename_class(const string
& srcname
, const string
& dstname
)
1504 auto i
= class_rname
.find(srcname
);
1505 if (i
== class_rname
.end())
1507 auto j
= class_rname
.find(dstname
);
1508 if (j
!= class_rname
.end())
1511 int class_id
= i
->second
;
1512 assert(class_name
.count(class_id
));
1513 // rename any shadow buckets of old class name
1514 for (auto &it
: class_map
) {
1515 if (it
.first
< 0 && it
.second
== class_id
) {
1516 string old_name
= get_item_name(it
.first
);
1517 size_t pos
= old_name
.find("~");
1518 assert(pos
!= string::npos
);
1519 string name_no_class
= old_name
.substr(0, pos
);
1520 string old_class_name
= old_name
.substr(pos
+ 1);
1521 assert(old_class_name
== srcname
);
1522 string new_name
= name_no_class
+ "~" + dstname
;
1523 // we do not use set_item_name
1524 // because the name is intentionally invalid
1525 name_map
[it
.first
] = new_name
;
1531 class_rname
.erase(srcname
);
1532 class_name
.erase(class_id
);
1533 class_rname
[dstname
] = class_id
;
1534 class_name
[class_id
] = dstname
;
1538 int CrushWrapper::populate_classes(
1539 const std::map
<int32_t, map
<int32_t, int32_t>>& old_class_bucket
)
1541 // build set of previous used shadow ids
1542 set
<int32_t> used_ids
;
1543 for (auto& p
: old_class_bucket
) {
1544 for (auto& q
: p
.second
) {
1545 used_ids
.insert(q
.second
);
1548 // accumulate weight values for each carg and bucket as we go. because it is
1549 // depth first, we will have the nested bucket weights we need when we
1550 // finish constructing the containing buckets.
1551 map
<int,map
<int,vector
<int>>> cmap_item_weight
; // cargs -> bno -> [bucket weight for each position]
1553 find_nonshadow_roots(&roots
);
1554 for (auto &r
: roots
) {
1557 for (auto &c
: class_name
) {
1559 int res
= device_class_clone(r
, c
.first
, old_class_bucket
, used_ids
,
1560 &clone
, &cmap_item_weight
);
1568 int CrushWrapper::trim_roots_with_class()
1571 find_shadow_roots(&roots
);
1572 for (auto &r
: roots
) {
1575 int res
= remove_root(r
);
1579 // there is no need to reweight because we only remove from the
1584 int32_t CrushWrapper::_alloc_class_id() const {
1585 if (class_name
.empty()) {
1588 int32_t class_id
= class_name
.rbegin()->first
+ 1;
1589 if (class_id
>= 0) {
1592 // wrapped, pick a random start and do exhaustive search
1593 uint32_t upperlimit
= numeric_limits
<int32_t>::max();
1595 class_id
= rand() % upperlimit
;
1596 const auto start
= class_id
;
1598 if (!class_name
.count(class_id
)) {
1606 } while (class_id
!= start
);
1607 assert(0 == "no available class id");
1610 int CrushWrapper::set_subtree_class(
1611 const string
& subtree
,
1612 const string
& new_class
)
1614 if (!name_exists(subtree
)) {
1618 int new_class_id
= -1;
1619 if (class_exists(new_class
)) {
1620 new_class_id
= get_class_id(new_class
);
1622 for (new_class_id
= 1; class_name
.count(new_class_id
); ++new_class_id
) ;
1623 class_name
[new_class_id
] = new_class
;
1624 class_rname
[new_class
] = new_class_id
;
1627 int id
= get_item_id(subtree
);
1628 list
<int> q
= { id
};
1629 while (!q
.empty()) {
1632 crush_bucket
*b
= get_bucket(id
);
1636 for (unsigned i
= 0; i
< b
->size
; ++i
) {
1637 int item
= b
->items
[i
];
1639 class_map
[item
] = new_class_id
;
1648 int CrushWrapper::reclassify(
1651 const map
<string
,string
>& classify_root
,
1652 const map
<string
,pair
<string
,string
>>& classify_bucket
1655 map
<int,string
> reclassified_bucket
; // orig_id -> class
1658 for (auto& i
: classify_root
) {
1659 string root
= i
.first
;
1660 if (!name_exists(root
)) {
1661 out
<< "root " << root
<< " does not exist" << std::endl
;
1664 int root_id
= get_item_id(root
);
1665 string new_class
= i
.second
;
1666 int new_class_id
= -1;
1667 out
<< "classify_root " << root
<< " (" << root_id
1668 << ") as " << new_class
<< std::endl
;
1669 if (class_exists(new_class
)) {
1670 new_class_id
= get_class_id(new_class
);
1671 out
<< " new class " << new_class
<< " exists as " << new_class_id
1674 for (new_class_id
= 1; class_name
.count(new_class_id
); ++new_class_id
) ;
1675 class_name
[new_class_id
] = new_class
;
1676 class_rname
[new_class
] = new_class_id
;
1677 out
<< " created new class " << new_class
<< " as " << new_class_id
1682 for (unsigned j
= 0; j
< crush
->max_rules
; j
++) {
1683 if (crush
->rules
[j
]) {
1684 auto rule
= crush
->rules
[j
];
1685 for (unsigned k
= 0; k
< rule
->len
; ++k
) {
1686 if (rule
->steps
[k
].op
== CRUSH_RULE_TAKE
) {
1687 int step_item
= get_rule_arg1(j
, k
);
1690 int res
= split_id_class(step_item
, &original_item
, &c
);
1694 if (original_item
== root_id
) {
1695 out
<< " rule " << j
<< " includes take on root "
1696 << root
<< " class " << c
<< std::endl
;
1705 // rebuild new buckets for root
1706 //cout << "before class_bucket: " << class_bucket << std::endl;
1707 map
<int,int> renumber
;
1709 q
.push_back(root_id
);
1710 while (!q
.empty()) {
1713 crush_bucket
*bucket
= get_bucket(id
);
1714 if (IS_ERR(bucket
)) {
1715 out
<< "cannot find bucket " << id
1716 << ": " << cpp_strerror(PTR_ERR(bucket
)) << std::endl
;
1717 return PTR_ERR(bucket
);
1721 int new_id
= get_new_bucket_id();
1722 out
<< " renumbering bucket " << id
<< " -> " << new_id
<< std::endl
;
1723 renumber
[id
] = new_id
;
1724 crush
->buckets
[-1-new_id
] = bucket
;
1725 bucket
->id
= new_id
;
1726 crush
->buckets
[-1-id
] = crush_make_bucket(crush
,
1731 crush
->buckets
[-1-id
]->id
= id
;
1732 for (auto& i
: choose_args
) {
1733 i
.second
.args
[-1-new_id
] = i
.second
.args
[-1-id
];
1734 memset(&i
.second
.args
[-1-id
], 0, sizeof(i
.second
.args
[0]));
1736 class_bucket
.erase(id
);
1737 class_bucket
[new_id
][new_class_id
] = id
;
1738 name_map
[new_id
] = string(get_item_name(id
));
1739 name_map
[id
] = string(get_item_name(id
)) + "~" + new_class
;
1741 for (unsigned j
= 0; j
< bucket
->size
; ++j
) {
1742 if (bucket
->items
[j
] < 0) {
1743 q
.push_front(bucket
->items
[j
]);
1745 // we don't reclassify the device here; if the users wants that,
1746 // they can pass --set-subtree-class separately.
1750 //cout << "mid class_bucket: " << class_bucket << std::endl;
1752 for (int i
= 0; i
< crush
->max_buckets
; ++i
) {
1753 crush_bucket
*b
= crush
->buckets
[i
];
1757 for (unsigned j
= 0; j
< b
->size
; ++j
) {
1758 if (renumber
.count(b
->items
[j
])) {
1759 b
->items
[j
] = renumber
[b
->items
[j
]];
1764 int r
= rebuild_roots_with_classes();
1766 out
<< "failed to rebuild_roots_with_classes: " << cpp_strerror(r
)
1770 //cout << "final class_bucket: " << class_bucket << std::endl;
1774 map
<int,int> send_to
; // source bucket -> dest bucket
1775 map
<int,map
<int,int>> new_class_bucket
;
1776 map
<int,string
> new_bucket_names
;
1777 map
<int,map
<string
,string
>> new_buckets
;
1778 map
<string
,int> new_bucket_by_name
;
1779 for (auto& i
: classify_bucket
) {
1780 const string
& match
= i
.first
; // prefix% or %suffix
1781 const string
& new_class
= i
.second
.first
;
1782 const string
& default_parent
= i
.second
.second
;
1783 if (!name_exists(default_parent
)) {
1784 out
<< "default parent " << default_parent
<< " does not exist"
1788 int default_parent_id
= get_item_id(default_parent
);
1789 crush_bucket
*default_parent_bucket
= get_bucket(default_parent_id
);
1790 assert(default_parent_bucket
);
1791 string default_parent_type_name
= get_type_name(default_parent_bucket
->type
);
1793 out
<< "classify_bucket " << match
<< " as " << new_class
1794 << " default bucket " << default_parent
1795 << " (" << default_parent_type_name
<< ")" << std::endl
;
1797 int new_class_id
= -1;
1798 if (class_exists(new_class
)) {
1799 new_class_id
= get_class_id(new_class
);
1800 out
<< " new class " << new_class
<< " exists as " << new_class_id
1803 for (new_class_id
= 1; class_name
.count(new_class_id
); ++new_class_id
) {
1805 class_name
[new_class_id
] = new_class
;
1806 class_rname
[new_class
] = new_class_id
;
1807 out
<< " created new class " << new_class
<< " as " << new_class_id
1811 for (int j
= 0; j
< crush
->max_buckets
; ++j
) {
1812 crush_bucket
*b
= crush
->buckets
[j
];
1813 if (!b
|| is_shadow_item(b
->id
)) {
1816 string name
= get_item_name(b
->id
);
1817 if (name
.length() < match
.length()) {
1821 if (match
[0] == '%') {
1822 if (match
.substr(1) != name
.substr(name
.size() - match
.size() + 1)) {
1825 basename
= name
.substr(0, name
.size() - match
.size() + 1);
1826 } else if (match
[match
.size() - 1] == '%') {
1827 if (match
.substr(0, match
.size() - 1) !=
1828 name
.substr(0, match
.size() - 1)) {
1831 basename
= name
.substr(match
.size() - 1);
1832 } else if (match
== name
) {
1833 basename
= default_parent
;
1837 cout
<< "match " << match
<< " to " << name
<< " basename " << basename
1839 // look up or create basename bucket
1841 if (name_exists(basename
)) {
1842 base_id
= get_item_id(basename
);
1843 cout
<< " have base " << base_id
<< std::endl
;
1844 } else if (new_bucket_by_name
.count(basename
)) {
1845 base_id
= new_bucket_by_name
[basename
];
1846 cout
<< " already creating base " << base_id
<< std::endl
;
1848 base_id
= get_new_bucket_id();
1849 crush
->buckets
[-1-base_id
] = crush_make_bucket(crush
,
1854 crush
->buckets
[-1-base_id
]->id
= base_id
;
1855 name_map
[base_id
] = basename
;
1856 new_bucket_by_name
[basename
] = base_id
;
1857 cout
<< " created base " << base_id
<< std::endl
;
1859 new_buckets
[base_id
][default_parent_type_name
] = default_parent
;
1861 send_to
[b
->id
] = base_id
;
1862 new_class_bucket
[base_id
][new_class_id
] = b
->id
;
1863 new_bucket_names
[b
->id
] = basename
+ "~" + get_class_name(new_class_id
);
1865 // make sure devices are classified
1866 for (unsigned i
= 0; i
< b
->size
; ++i
) {
1867 int item
= b
->items
[i
];
1869 class_map
[item
] = new_class_id
;
1875 // no name_exists() works below,
1878 // copy items around
1879 //cout << "send_to " << send_to << std::endl;
1882 for (auto& i
: send_to
) {
1883 crush_bucket
*from
= get_bucket(i
.first
);
1884 crush_bucket
*to
= get_bucket(i
.second
);
1885 cout
<< "moving items from " << from
->id
<< " (" << get_item_name(from
->id
)
1886 << ") to " << to
->id
<< " (" << get_item_name(to
->id
) << ")"
1888 for (unsigned j
= 0; j
< from
->size
; ++j
) {
1889 int item
= from
->items
[j
];
1891 map
<string
,string
> to_loc
;
1892 to_loc
[get_type_name(to
->type
)] = get_item_name(to
->id
);
1894 if (subtree_contains(to
->id
, item
)) {
1897 map
<string
,string
> from_loc
;
1898 from_loc
[get_type_name(from
->type
)] = get_item_name(from
->id
);
1899 auto w
= get_item_weightf_in_loc(item
, from_loc
);
1900 r
= insert_item(cct
, item
,
1902 get_item_name(item
),
1905 if (!send_to
.count(item
)) {
1906 lderr(cct
) << "item " << item
<< " in bucket " << from
->id
1907 << " is not also a reclassified bucket" << dendl
;
1910 int newitem
= send_to
[item
];
1911 if (subtree_contains(to
->id
, newitem
)) {
1914 r
= link_bucket(cct
, newitem
, to_loc
);
1917 cout
<< __func__
<< " err from insert_item: " << cpp_strerror(r
)
1924 // make sure new buckets have parents
1925 for (auto& i
: new_buckets
) {
1927 if (get_immediate_parent_id(i
.first
, &parent
) < 0) {
1928 cout
<< "new bucket " << i
.first
<< " missing parent, adding at "
1929 << i
.second
<< std::endl
;
1930 int r
= link_bucket(cct
, i
.first
, i
.second
);
1932 cout
<< __func__
<< " err from insert_item: " << cpp_strerror(r
)
1939 // set class mappings
1940 //cout << "pre class_bucket: " << class_bucket << std::endl;
1941 for (auto& i
: new_class_bucket
) {
1942 for (auto& j
: i
.second
) {
1943 class_bucket
[i
.first
][j
.first
] = j
.second
;
1947 //cout << "post class_bucket: " << class_bucket << std::endl;
1948 for (auto& i
: new_bucket_names
) {
1949 name_map
[i
.first
] = i
.second
;
1952 int r
= rebuild_roots_with_classes();
1954 out
<< "failed to rebuild_roots_with_classes: " << cpp_strerror(r
)
1958 //cout << "final class_bucket: " << class_bucket << std::endl;
1963 int CrushWrapper::get_new_bucket_id()
1966 while (crush
->buckets
[-1-id
] &&
1967 -1-id
< crush
->max_buckets
) {
1970 if (-1-id
== crush
->max_buckets
) {
1971 ++crush
->max_buckets
;
1972 crush
->buckets
= (struct crush_bucket
**)realloc(
1974 sizeof(crush
->buckets
[0]) * crush
->max_buckets
);
1975 for (auto& i
: choose_args
) {
1976 assert(i
.second
.size
== crush
->max_buckets
- 1);
1978 i
.second
.args
= (struct crush_choose_arg
*)realloc(
1980 sizeof(i
.second
.args
[0]) * i
.second
.size
);
1986 void CrushWrapper::reweight(CephContext
*cct
)
1989 find_nonshadow_roots(&roots
);
1990 for (auto id
: roots
) {
1993 crush_bucket
*b
= get_bucket(id
);
1994 ldout(cct
, 5) << "reweight root bucket " << id
<< dendl
;
1995 int r
= crush_reweight_bucket(crush
, b
);
1998 for (auto& i
: choose_args
) {
1999 //cout << "carg " << i.first << std::endl;
2000 vector
<uint32_t> w
; // discard top-level weights
2001 reweight_bucket(b
, i
.second
, &w
);
2004 int r
= rebuild_roots_with_classes();
2005 ceph_assert(r
== 0);
2008 void CrushWrapper::reweight_bucket(
2010 crush_choose_arg_map
& arg_map
,
2011 vector
<uint32_t> *weightv
)
2013 int idx
= -1 - b
->id
;
2014 unsigned npos
= arg_map
.args
[idx
].weight_set_positions
;
2015 //cout << __func__ << " " << b->id << " npos " << npos << std::endl;
2016 weightv
->resize(npos
);
2017 for (unsigned i
= 0; i
< b
->size
; ++i
) {
2018 int item
= b
->items
[i
];
2020 for (unsigned pos
= 0; pos
< npos
; ++pos
) {
2021 (*weightv
)[pos
] += arg_map
.args
[idx
].weight_set
->weights
[i
];
2024 vector
<uint32_t> subw(npos
);
2025 crush_bucket
*sub
= get_bucket(item
);
2027 reweight_bucket(sub
, arg_map
, &subw
);
2028 for (unsigned pos
= 0; pos
< npos
; ++pos
) {
2029 (*weightv
)[pos
] += subw
[pos
];
2030 // strash the real bucket weight as the weights for this reference
2031 arg_map
.args
[idx
].weight_set
->weights
[i
] = subw
[pos
];
2035 //cout << __func__ << " finish " << b->id << " " << *weightv << std::endl;
2038 int CrushWrapper::add_simple_rule_at(
2039 string name
, string root_name
,
2040 string failure_domain_name
,
2041 string device_class
,
2042 string mode
, int rule_type
,
2046 if (rule_exists(name
)) {
2048 *err
<< "rule " << name
<< " exists";
2052 if (rule_exists(rno
)) {
2054 *err
<< "rule with ruleno " << rno
<< " exists";
2057 if (ruleset_exists(rno
)) {
2059 *err
<< "ruleset " << rno
<< " exists";
2063 for (rno
= 0; rno
< get_max_rules(); rno
++) {
2064 if (!rule_exists(rno
) && !ruleset_exists(rno
))
2068 if (!name_exists(root_name
)) {
2070 *err
<< "root item " << root_name
<< " does not exist";
2073 int root
= get_item_id(root_name
);
2075 if (failure_domain_name
.length()) {
2076 type
= get_type_id(failure_domain_name
);
2079 *err
<< "unknown type " << failure_domain_name
;
2083 if (device_class
.size()) {
2084 if (!class_exists(device_class
)) {
2086 *err
<< "device class " << device_class
<< " does not exist";
2089 int c
= get_class_id(device_class
);
2090 if (class_bucket
.count(root
) == 0 ||
2091 class_bucket
[root
].count(c
) == 0) {
2093 *err
<< "root " << root_name
<< " has no devices with class "
2097 root
= class_bucket
[root
][c
];
2099 if (mode
!= "firstn" && mode
!= "indep") {
2101 *err
<< "unknown mode " << mode
;
2106 if (mode
== "indep")
2108 int min_rep
= mode
== "firstn" ? 1 : 3;
2109 int max_rep
= mode
== "firstn" ? 10 : 20;
2110 //set the ruleset the same as rule_id(rno)
2111 crush_rule
*rule
= crush_make_rule(steps
, rno
, rule_type
, min_rep
, max_rep
);
2114 if (mode
== "indep") {
2115 crush_rule_set_step(rule
, step
++, CRUSH_RULE_SET_CHOOSELEAF_TRIES
, 5, 0);
2116 crush_rule_set_step(rule
, step
++, CRUSH_RULE_SET_CHOOSE_TRIES
, 100, 0);
2118 crush_rule_set_step(rule
, step
++, CRUSH_RULE_TAKE
, root
, 0);
2120 crush_rule_set_step(rule
, step
++,
2121 mode
== "firstn" ? CRUSH_RULE_CHOOSELEAF_FIRSTN
:
2122 CRUSH_RULE_CHOOSELEAF_INDEP
,
2126 crush_rule_set_step(rule
, step
++,
2127 mode
== "firstn" ? CRUSH_RULE_CHOOSE_FIRSTN
:
2128 CRUSH_RULE_CHOOSE_INDEP
,
2131 crush_rule_set_step(rule
, step
++, CRUSH_RULE_EMIT
, 0, 0);
2133 int ret
= crush_add_rule(crush
, rule
, rno
);
2135 *err
<< "failed to add rule " << rno
<< " because " << cpp_strerror(ret
);
2138 set_rule_name(rno
, name
);
2143 int CrushWrapper::add_simple_rule(
2144 string name
, string root_name
,
2145 string failure_domain_name
,
2146 string device_class
,
2147 string mode
, int rule_type
,
2150 return add_simple_rule_at(name
, root_name
, failure_domain_name
, device_class
,
2152 rule_type
, -1, err
);
2155 float CrushWrapper::_get_take_weight_osd_map(int root
,
2156 map
<int,float> *pmap
) const
2161 //breadth first iterate the OSD tree
2162 while (!q
.empty()) {
2163 int bno
= q
.front();
2165 crush_bucket
*b
= crush
->buckets
[-1-bno
];
2167 for (unsigned j
=0; j
<b
->size
; ++j
) {
2168 int item_id
= b
->items
[j
];
2169 if (item_id
>= 0) { //it's an OSD
2170 float w
= crush_get_bucket_item_weight(b
, j
);
2171 (*pmap
)[item_id
] = w
;
2173 } else { //not an OSD, expand the child later
2174 q
.push_back(item_id
);
2181 void CrushWrapper::_normalize_weight_map(float sum
,
2182 const map
<int,float>& m
,
2183 map
<int,float> *pmap
) const
2186 map
<int,float>::iterator q
= pmap
->find(p
.first
);
2187 if (q
== pmap
->end()) {
2188 (*pmap
)[p
.first
] = p
.second
/ sum
;
2190 q
->second
+= p
.second
/ sum
;
2195 int CrushWrapper::get_take_weight_osd_map(int root
, map
<int,float> *pmap
) const
2198 float sum
= _get_take_weight_osd_map(root
, &m
);
2199 _normalize_weight_map(sum
, m
, pmap
);
2203 int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno
,
2204 map
<int,float> *pmap
) const
2206 if (ruleno
>= crush
->max_rules
)
2208 if (crush
->rules
[ruleno
] == NULL
)
2210 crush_rule
*rule
= crush
->rules
[ruleno
];
2212 // build a weight map for each TAKE in the rule, and then merge them
2214 // FIXME: if there are multiple takes that place a different number of
2215 // objects we do not take that into account. (Also, note that doing this
2216 // right is also a function of the pool, since the crush rule
2217 // might choose 2 + choose 2 but pool size may only be 3.)
2218 for (unsigned i
=0; i
<rule
->len
; ++i
) {
2221 if (rule
->steps
[i
].op
== CRUSH_RULE_TAKE
) {
2222 int n
= rule
->steps
[i
].arg1
;
2227 sum
+= _get_take_weight_osd_map(n
, &m
);
2230 _normalize_weight_map(sum
, m
, pmap
);
2236 int CrushWrapper::remove_rule(int ruleno
)
2238 if (ruleno
>= (int)crush
->max_rules
)
2240 if (crush
->rules
[ruleno
] == NULL
)
2242 crush_destroy_rule(crush
->rules
[ruleno
]);
2243 crush
->rules
[ruleno
] = NULL
;
2244 rule_name_map
.erase(ruleno
);
2246 return rebuild_roots_with_classes();
2249 int CrushWrapper::bucket_adjust_item_weight(CephContext
*cct
, crush_bucket
*bucket
, int item
, int weight
)
2251 if (cct
->_conf
->osd_crush_update_weight_set
) {
2253 for (position
= 0; position
< bucket
->size
; position
++)
2254 if (bucket
->items
[position
] == item
)
2256 assert(position
!= bucket
->size
);
2257 for (auto &w
: choose_args
) {
2258 crush_choose_arg_map
&arg_map
= w
.second
;
2259 crush_choose_arg
*arg
= &arg_map
.args
[-1-bucket
->id
];
2260 for (__u32 j
= 0; j
< arg
->weight_set_positions
; j
++) {
2261 crush_weight_set
*weight_set
= &arg
->weight_set
[j
];
2262 weight_set
->weights
[position
] = weight
;
2266 return crush_bucket_adjust_item_weight(crush
, bucket
, item
, weight
);
2269 int CrushWrapper::add_bucket(
2270 int bucketno
, int alg
, int hash
, int type
, int size
,
2271 int *items
, int *weights
, int *idout
)
2274 alg
= get_default_bucket_alg();
2278 crush_bucket
*b
= crush_make_bucket(crush
, alg
, hash
, type
, size
, items
,
2282 int r
= crush_add_bucket(crush
, bucketno
, b
, idout
);
2283 int pos
= -1 - *idout
;
2284 for (auto& p
: choose_args
) {
2285 crush_choose_arg_map
& cmap
= p
.second
;
2287 if ((int)cmap
.size
<= pos
) {
2288 cmap
.args
= (crush_choose_arg
*)realloc(
2290 sizeof(crush_choose_arg
) * (pos
+ 1));
2292 memset(&cmap
.args
[cmap
.size
], 0,
2293 sizeof(crush_choose_arg
) * (pos
+ 1 - cmap
.size
));
2294 cmap
.size
= pos
+ 1;
2297 cmap
.args
= (crush_choose_arg
*)calloc(sizeof(crush_choose_arg
),
2300 cmap
.size
= pos
+ 1;
2303 int positions
= get_choose_args_positions(cmap
);
2304 crush_choose_arg
& carg
= cmap
.args
[pos
];
2305 carg
.weight_set
= (crush_weight_set
*)calloc(sizeof(crush_weight_set
),
2307 carg
.weight_set_positions
= positions
;
2308 for (int ppos
= 0; ppos
< positions
; ++ppos
) {
2309 carg
.weight_set
[ppos
].weights
= (__u32
*)calloc(sizeof(__u32
), size
);
2310 carg
.weight_set
[ppos
].size
= size
;
2311 for (int bpos
= 0; bpos
< size
; ++bpos
) {
2312 carg
.weight_set
[ppos
].weights
[bpos
] = weights
[bpos
];
2320 int CrushWrapper::bucket_add_item(crush_bucket
*bucket
, int item
, int weight
)
2322 __u32 new_size
= bucket
->size
+ 1;
2323 int r
= crush_bucket_add_item(crush
, bucket
, item
, weight
);
2327 for (auto &w
: choose_args
) {
2328 crush_choose_arg_map
&arg_map
= w
.second
;
2329 crush_choose_arg
*arg
= &arg_map
.args
[-1-bucket
->id
];
2330 for (__u32 j
= 0; j
< arg
->weight_set_positions
; j
++) {
2331 crush_weight_set
*weight_set
= &arg
->weight_set
[j
];
2332 weight_set
->weights
= (__u32
*)realloc(weight_set
->weights
,
2333 new_size
* sizeof(__u32
));
2334 assert(weight_set
->size
+ 1 == new_size
);
2335 weight_set
->weights
[weight_set
->size
] = weight
;
2336 weight_set
->size
= new_size
;
2338 if (arg
->ids_size
) {
2339 arg
->ids
= (__s32
*)realloc(arg
->ids
, new_size
* sizeof(__s32
));
2340 assert(arg
->ids_size
+ 1 == new_size
);
2341 arg
->ids
[arg
->ids_size
] = item
;
2342 arg
->ids_size
= new_size
;
2348 int CrushWrapper::bucket_remove_item(crush_bucket
*bucket
, int item
)
2350 __u32 new_size
= bucket
->size
- 1;
2352 for (position
= 0; position
< bucket
->size
; position
++)
2353 if (bucket
->items
[position
] == item
)
2355 assert(position
!= bucket
->size
);
2356 int r
= crush_bucket_remove_item(crush
, bucket
, item
);
2360 for (auto &w
: choose_args
) {
2361 crush_choose_arg_map
&arg_map
= w
.second
;
2362 crush_choose_arg
*arg
= &arg_map
.args
[-1-bucket
->id
];
2363 for (__u32 j
= 0; j
< arg
->weight_set_positions
; j
++) {
2364 crush_weight_set
*weight_set
= &arg
->weight_set
[j
];
2365 assert(weight_set
->size
- 1 == new_size
);
2366 for (__u32 k
= position
; k
< new_size
; k
++)
2367 weight_set
->weights
[k
] = weight_set
->weights
[k
+1];
2369 weight_set
->weights
= (__u32
*)realloc(weight_set
->weights
,
2370 new_size
* sizeof(__u32
));
2372 weight_set
->weights
= NULL
;
2374 weight_set
->size
= new_size
;
2376 if (arg
->ids_size
) {
2377 assert(arg
->ids_size
- 1 == new_size
);
2378 for (__u32 k
= position
; k
< new_size
; k
++)
2379 arg
->ids
[k
] = arg
->ids
[k
+1];
2381 arg
->ids
= (__s32
*)realloc(arg
->ids
, new_size
* sizeof(__s32
));
2385 arg
->ids_size
= new_size
;
2391 int CrushWrapper::bucket_set_alg(int bid
, int alg
)
2393 crush_bucket
*b
= get_bucket(bid
);
2401 int CrushWrapper::update_device_class(int id
,
2402 const string
& class_name
,
2406 assert(item_exists(id
));
2407 auto old_class_name
= get_item_class(id
);
2408 if (old_class_name
&& old_class_name
!= class_name
) {
2409 *ss
<< "osd." << id
<< " has already bound to class '" << old_class_name
2410 << "', can not reset class to '" << class_name
<< "'; "
2411 << "use 'ceph osd crush rm-device-class <osd>' to "
2412 << "remove old class first";
2416 int class_id
= get_or_create_class_id(class_name
);
2418 *ss
<< name
<< " id " << id
<< " is negative";
2422 if (class_map
.count(id
) != 0 && class_map
[id
] == class_id
) {
2423 *ss
<< name
<< " already set to class " << class_name
;
2427 set_item_class(id
, class_id
);
2429 int r
= rebuild_roots_with_classes();
2435 int CrushWrapper::remove_device_class(CephContext
*cct
, int id
, ostream
*ss
)
2438 const char *name
= get_item_name(id
);
2440 *ss
<< "osd." << id
<< " does not have a name";
2444 const char *class_name
= get_item_class(id
);
2446 *ss
<< "osd." << id
<< " has not been bound to a specific class yet";
2449 class_remove_item(id
);
2451 int r
= rebuild_roots_with_classes();
2453 *ss
<< "unable to rebuild roots with class '" << class_name
<< "' "
2454 << "of osd." << id
<< ": " << cpp_strerror(r
);
2460 int CrushWrapper::device_class_clone(
2461 int original_id
, int device_class
,
2462 const std::map
<int32_t, map
<int32_t, int32_t>>& old_class_bucket
,
2463 const std::set
<int32_t>& used_ids
,
2465 map
<int,map
<int,vector
<int>>> *cmap_item_weight
)
2467 const char *item_name
= get_item_name(original_id
);
2468 if (item_name
== NULL
)
2470 const char *class_name
= get_class_name(device_class
);
2471 if (class_name
== NULL
)
2473 string copy_name
= item_name
+ string("~") + class_name
;
2474 if (name_exists(copy_name
)) {
2475 *clone
= get_item_id(copy_name
);
2479 crush_bucket
*original
= get_bucket(original_id
);
2480 assert(!IS_ERR(original
));
2481 crush_bucket
*copy
= crush_make_bucket(crush
,
2488 vector
<unsigned> item_orig_pos
; // new item pos -> orig item pos
2489 for (unsigned i
= 0; i
< original
->size
; i
++) {
2490 int item
= original
->items
[i
];
2491 int weight
= crush_get_bucket_item_weight(original
, i
);
2493 if (class_map
.count(item
) != 0 && class_map
[item
] == device_class
) {
2494 int res
= crush_bucket_add_item(crush
, copy
, item
, weight
);
2502 int res
= device_class_clone(item
, device_class
, old_class_bucket
,
2503 used_ids
, &child_copy_id
,
2507 crush_bucket
*child_copy
= get_bucket(child_copy_id
);
2508 assert(!IS_ERR(child_copy
));
2509 res
= crush_bucket_add_item(crush
, copy
, child_copy_id
,
2510 child_copy
->weight
);
2514 item_orig_pos
.push_back(i
);
2516 assert(item_orig_pos
.size() == copy
->size
);
2519 if (old_class_bucket
.count(original_id
) &&
2520 old_class_bucket
.at(original_id
).count(device_class
)) {
2521 bno
= old_class_bucket
.at(original_id
).at(device_class
);
2523 // pick a new shadow bucket id that is not used by the current map
2524 // *or* any previous shadow buckets.
2526 while (((-1-bno
) < crush
->max_buckets
&& crush
->buckets
[-1-bno
]) ||
2527 used_ids
.count(bno
)) {
2531 int res
= crush_add_bucket(crush
, bno
, copy
, clone
);
2534 assert(!bno
|| bno
== *clone
);
2536 res
= set_item_class(*clone
, device_class
);
2540 // we do not use set_item_name because the name is intentionally invalid
2541 name_map
[*clone
] = copy_name
;
2543 name_rmap
[copy_name
] = *clone
;
2544 class_bucket
[original_id
][device_class
] = *clone
;
2546 // set up choose_args for the new bucket.
2547 for (auto& w
: choose_args
) {
2548 crush_choose_arg_map
& cmap
= w
.second
;
2549 if (-1-bno
>= (int)cmap
.size
) {
2550 unsigned new_size
= -1-bno
+ 1;
2551 cmap
.args
= (crush_choose_arg
*)realloc(cmap
.args
,
2552 new_size
* sizeof(cmap
.args
[0]));
2554 memset(cmap
.args
+ cmap
.size
, 0,
2555 (new_size
- cmap
.size
) * sizeof(cmap
.args
[0]));
2556 cmap
.size
= new_size
;
2558 auto& o
= cmap
.args
[-1-original_id
];
2559 auto& n
= cmap
.args
[-1-bno
];
2560 n
.ids_size
= 0; // FIXME: implement me someday
2561 n
.weight_set_positions
= o
.weight_set_positions
;
2562 n
.weight_set
= (crush_weight_set
*)calloc(
2563 n
.weight_set_positions
, sizeof(crush_weight_set
));
2564 for (size_t s
= 0; s
< n
.weight_set_positions
; ++s
) {
2565 n
.weight_set
[s
].size
= copy
->size
;
2566 n
.weight_set
[s
].weights
= (__u32
*)calloc(copy
->size
, sizeof(__u32
));
2568 for (size_t s
= 0; s
< n
.weight_set_positions
; ++s
) {
2569 vector
<int> bucket_weights(n
.weight_set_positions
);
2570 for (size_t i
= 0; i
< copy
->size
; ++i
) {
2571 int item
= copy
->items
[i
];
2573 n
.weight_set
[s
].weights
[i
] = o
.weight_set
[s
].weights
[item_orig_pos
[i
]];
2574 } else if ((*cmap_item_weight
)[w
.first
].count(item
)) {
2575 n
.weight_set
[s
].weights
[i
] = (*cmap_item_weight
)[w
.first
][item
][s
];
2577 n
.weight_set
[s
].weights
[i
] = 0;
2579 bucket_weights
[s
] += n
.weight_set
[s
].weights
[i
];
2581 (*cmap_item_weight
)[w
.first
][bno
] = bucket_weights
;
2587 int CrushWrapper::get_rules_by_class(const string
&class_name
, set
<int> *rules
)
2591 if (!class_exists(class_name
)) {
2594 int class_id
= get_class_id(class_name
);
2595 for (unsigned i
= 0; i
< crush
->max_rules
; ++i
) {
2596 crush_rule
*r
= crush
->rules
[i
];
2599 for (unsigned j
= 0; j
< r
->len
; ++j
) {
2600 if (r
->steps
[j
].op
== CRUSH_RULE_TAKE
) {
2601 int step_item
= r
->steps
[j
].arg1
;
2604 int res
= split_id_class(step_item
, &original_item
, &c
);
2608 if (c
!= -1 && c
== class_id
) {
2618 // return rules that might reference the given osd
2619 int CrushWrapper::get_rules_by_osd(int osd
, set
<int> *rules
)
2626 for (unsigned i
= 0; i
< crush
->max_rules
; ++i
) {
2627 crush_rule
*r
= crush
->rules
[i
];
2630 for (unsigned j
= 0; j
< r
->len
; ++j
) {
2631 if (r
->steps
[j
].op
== CRUSH_RULE_TAKE
) {
2632 int step_item
= r
->steps
[j
].arg1
;
2633 list
<int> unordered
;
2634 int rc
= _get_leaves(step_item
, &unordered
);
2636 return rc
; // propagate fatal errors!
2639 for (auto &o
: unordered
) {
2656 bool CrushWrapper::_class_is_dead(int class_id
)
2658 for (auto &p
: class_map
) {
2659 if (p
.first
>= 0 && p
.second
== class_id
) {
2663 for (unsigned i
= 0; i
< crush
->max_rules
; ++i
) {
2664 crush_rule
*r
= crush
->rules
[i
];
2667 for (unsigned j
= 0; j
< r
->len
; ++j
) {
2668 if (r
->steps
[j
].op
== CRUSH_RULE_TAKE
) {
2669 int root
= r
->steps
[j
].arg1
;
2670 for (auto &p
: class_bucket
) {
2672 if (q
.count(class_id
) && q
[class_id
] == root
) {
2679 // no more referenced by any devices or crush rules
2683 void CrushWrapper::cleanup_dead_classes()
2685 auto p
= class_name
.begin();
2686 while (p
!= class_name
.end()) {
2687 if (_class_is_dead(p
->first
)) {
2688 string n
= p
->second
;
2690 remove_class_name(n
);
2697 int CrushWrapper::rebuild_roots_with_classes()
2699 std::map
<int32_t, map
<int32_t, int32_t> > old_class_bucket
= class_bucket
;
2700 cleanup_dead_classes();
2701 int r
= trim_roots_with_class();
2704 class_bucket
.clear();
2705 return populate_classes(old_class_bucket
);
2708 void CrushWrapper::encode(bufferlist
& bl
, uint64_t features
) const
2712 __u32 magic
= CRUSH_MAGIC
;
2713 ::encode(magic
, bl
);
2715 ::encode(crush
->max_buckets
, bl
);
2716 ::encode(crush
->max_rules
, bl
);
2717 ::encode(crush
->max_devices
, bl
);
2719 bool encode_compat_choose_args
= false;
2720 crush_choose_arg_map arg_map
;
2721 memset(&arg_map
, '\0', sizeof(arg_map
));
2722 if (has_choose_args() &&
2723 !HAVE_FEATURE(features
, CRUSH_CHOOSE_ARGS
)) {
2724 assert(!has_incompat_choose_args());
2725 encode_compat_choose_args
= true;
2726 arg_map
= choose_args
.begin()->second
;
2730 for (int i
=0; i
<crush
->max_buckets
; i
++) {
2732 if (crush
->buckets
[i
]) alg
= crush
->buckets
[i
]->alg
;
2737 ::encode(crush
->buckets
[i
]->id
, bl
);
2738 ::encode(crush
->buckets
[i
]->type
, bl
);
2739 ::encode(crush
->buckets
[i
]->alg
, bl
);
2740 ::encode(crush
->buckets
[i
]->hash
, bl
);
2741 ::encode(crush
->buckets
[i
]->weight
, bl
);
2742 ::encode(crush
->buckets
[i
]->size
, bl
);
2743 for (unsigned j
=0; j
<crush
->buckets
[i
]->size
; j
++)
2744 ::encode(crush
->buckets
[i
]->items
[j
], bl
);
2746 switch (crush
->buckets
[i
]->alg
) {
2747 case CRUSH_BUCKET_UNIFORM
:
2748 ::encode((reinterpret_cast<crush_bucket_uniform
*>(crush
->buckets
[i
]))->item_weight
, bl
);
2751 case CRUSH_BUCKET_LIST
:
2752 for (unsigned j
=0; j
<crush
->buckets
[i
]->size
; j
++) {
2753 ::encode((reinterpret_cast<crush_bucket_list
*>(crush
->buckets
[i
]))->item_weights
[j
], bl
);
2754 ::encode((reinterpret_cast<crush_bucket_list
*>(crush
->buckets
[i
]))->sum_weights
[j
], bl
);
2758 case CRUSH_BUCKET_TREE
:
2759 ::encode((reinterpret_cast<crush_bucket_tree
*>(crush
->buckets
[i
]))->num_nodes
, bl
);
2760 for (unsigned j
=0; j
<(reinterpret_cast<crush_bucket_tree
*>(crush
->buckets
[i
]))->num_nodes
; j
++)
2761 ::encode((reinterpret_cast<crush_bucket_tree
*>(crush
->buckets
[i
]))->node_weights
[j
], bl
);
2764 case CRUSH_BUCKET_STRAW
:
2765 for (unsigned j
=0; j
<crush
->buckets
[i
]->size
; j
++) {
2766 ::encode((reinterpret_cast<crush_bucket_straw
*>(crush
->buckets
[i
]))->item_weights
[j
], bl
);
2767 ::encode((reinterpret_cast<crush_bucket_straw
*>(crush
->buckets
[i
]))->straws
[j
], bl
);
2771 case CRUSH_BUCKET_STRAW2
:
2774 if (encode_compat_choose_args
&&
2775 arg_map
.args
[i
].weight_set_positions
> 0) {
2776 weights
= arg_map
.args
[i
].weight_set
[0].weights
;
2778 weights
= (reinterpret_cast<crush_bucket_straw2
*>(crush
->buckets
[i
]))->item_weights
;
2780 for (unsigned j
=0; j
<crush
->buckets
[i
]->size
; j
++) {
2781 ::encode(weights
[j
], bl
);
2793 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
2794 __u32 yes
= crush
->rules
[i
] ? 1:0;
2799 ::encode(crush
->rules
[i
]->len
, bl
);
2800 ::encode(crush
->rules
[i
]->mask
, bl
);
2801 for (unsigned j
=0; j
<crush
->rules
[i
]->len
; j
++)
2802 ::encode(crush
->rules
[i
]->steps
[j
], bl
);
2806 ::encode(type_map
, bl
);
2807 ::encode(name_map
, bl
);
2808 ::encode(rule_name_map
, bl
);
2811 ::encode(crush
->choose_local_tries
, bl
);
2812 ::encode(crush
->choose_local_fallback_tries
, bl
);
2813 ::encode(crush
->choose_total_tries
, bl
);
2814 ::encode(crush
->chooseleaf_descend_once
, bl
);
2815 ::encode(crush
->chooseleaf_vary_r
, bl
);
2816 ::encode(crush
->straw_calc_version
, bl
);
2817 ::encode(crush
->allowed_bucket_algs
, bl
);
2818 if (features
& CEPH_FEATURE_CRUSH_TUNABLES5
) {
2819 ::encode(crush
->chooseleaf_stable
, bl
);
2822 if (HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
2824 ::encode(class_map
, bl
);
2825 ::encode(class_name
, bl
);
2826 ::encode(class_bucket
, bl
);
2829 __u32 size
= (__u32
)choose_args
.size();
2831 for (auto c
: choose_args
) {
2832 ::encode(c
.first
, bl
);
2833 crush_choose_arg_map arg_map
= c
.second
;
2835 for (__u32 i
= 0; i
< arg_map
.size
; i
++) {
2836 crush_choose_arg
*arg
= &arg_map
.args
[i
];
2837 if (arg
->weight_set_positions
== 0 &&
2843 for (__u32 i
= 0; i
< arg_map
.size
; i
++) {
2844 crush_choose_arg
*arg
= &arg_map
.args
[i
];
2845 if (arg
->weight_set_positions
== 0 &&
2849 ::encode(arg
->weight_set_positions
, bl
);
2850 for (__u32 j
= 0; j
< arg
->weight_set_positions
; j
++) {
2851 crush_weight_set
*weight_set
= &arg
->weight_set
[j
];
2852 ::encode(weight_set
->size
, bl
);
2853 for (__u32 k
= 0; k
< weight_set
->size
; k
++)
2854 ::encode(weight_set
->weights
[k
], bl
);
2856 ::encode(arg
->ids_size
, bl
);
2857 for (__u32 j
= 0; j
< arg
->ids_size
; j
++)
2858 ::encode(arg
->ids
[j
], bl
);
2864 static void decode_32_or_64_string_map(map
<int32_t,string
>& m
, bufferlist::iterator
& blp
)
2874 ::decode(strlen
, blp
);
2876 // der, key was actually 64-bits!
2877 ::decode(strlen
, blp
);
2879 ::decode_nohead(strlen
, m
[key
], blp
);
2883 void CrushWrapper::decode(bufferlist::iterator
& blp
)
2888 ::decode(magic
, blp
);
2889 if (magic
!= CRUSH_MAGIC
)
2890 throw buffer::malformed_input("bad magic number");
2892 ::decode(crush
->max_buckets
, blp
);
2893 ::decode(crush
->max_rules
, blp
);
2894 ::decode(crush
->max_devices
, blp
);
2896 // legacy tunables, unless we decode something newer
2897 set_tunables_legacy();
2901 crush
->buckets
= (crush_bucket
**)calloc(1, crush
->max_buckets
* sizeof(crush_bucket
*));
2902 for (int i
=0; i
<crush
->max_buckets
; i
++) {
2903 decode_crush_bucket(&crush
->buckets
[i
], blp
);
2907 crush
->rules
= (crush_rule
**)calloc(1, crush
->max_rules
* sizeof(crush_rule
*));
2908 for (unsigned i
= 0; i
< crush
->max_rules
; ++i
) {
2912 crush
->rules
[i
] = NULL
;
2918 crush
->rules
[i
] = reinterpret_cast<crush_rule
*>(calloc(1, crush_rule_size(len
)));
2919 crush
->rules
[i
]->len
= len
;
2920 ::decode(crush
->rules
[i
]->mask
, blp
);
2921 for (unsigned j
=0; j
<crush
->rules
[i
]->len
; j
++)
2922 ::decode(crush
->rules
[i
]->steps
[j
], blp
);
2926 // NOTE: we had a bug where we were incoding int instead of int32, which means the
2927 // 'key' field for these maps may be either 32 or 64 bits, depending. tolerate
2928 // both by assuming the string is always non-empty.
2929 decode_32_or_64_string_map(type_map
, blp
);
2930 decode_32_or_64_string_map(name_map
, blp
);
2931 decode_32_or_64_string_map(rule_name_map
, blp
);
2935 ::decode(crush
->choose_local_tries
, blp
);
2936 ::decode(crush
->choose_local_fallback_tries
, blp
);
2937 ::decode(crush
->choose_total_tries
, blp
);
2940 ::decode(crush
->chooseleaf_descend_once
, blp
);
2943 ::decode(crush
->chooseleaf_vary_r
, blp
);
2946 ::decode(crush
->straw_calc_version
, blp
);
2949 ::decode(crush
->allowed_bucket_algs
, blp
);
2952 ::decode(crush
->chooseleaf_stable
, blp
);
2955 ::decode(class_map
, blp
);
2956 ::decode(class_name
, blp
);
2957 for (auto &c
: class_name
)
2958 class_rname
[c
.second
] = c
.first
;
2959 ::decode(class_bucket
, blp
);
2962 __u32 choose_args_size
;
2963 ::decode(choose_args_size
, blp
);
2964 for (__u32 i
= 0; i
< choose_args_size
; i
++) {
2965 typename
decltype(choose_args
)::key_type choose_args_index
;
2966 ::decode(choose_args_index
, blp
);
2967 crush_choose_arg_map arg_map
;
2968 arg_map
.size
= crush
->max_buckets
;
2969 arg_map
.args
= (crush_choose_arg
*)calloc(
2970 arg_map
.size
, sizeof(crush_choose_arg
));
2972 ::decode(size
, blp
);
2973 for (__u32 j
= 0; j
< size
; j
++) {
2975 ::decode(bucket_index
, blp
);
2976 assert(bucket_index
< arg_map
.size
);
2977 crush_choose_arg
*arg
= &arg_map
.args
[bucket_index
];
2978 ::decode(arg
->weight_set_positions
, blp
);
2979 if (arg
->weight_set_positions
) {
2980 arg
->weight_set
= (crush_weight_set
*)calloc(
2981 arg
->weight_set_positions
, sizeof(crush_weight_set
));
2982 for (__u32 k
= 0; k
< arg
->weight_set_positions
; k
++) {
2983 crush_weight_set
*weight_set
= &arg
->weight_set
[k
];
2984 ::decode(weight_set
->size
, blp
);
2985 weight_set
->weights
= (__u32
*)calloc(
2986 weight_set
->size
, sizeof(__u32
));
2987 for (__u32 l
= 0; l
< weight_set
->size
; l
++)
2988 ::decode(weight_set
->weights
[l
], blp
);
2991 ::decode(arg
->ids_size
, blp
);
2992 if (arg
->ids_size
) {
2993 assert(arg
->ids_size
== crush
->buckets
[bucket_index
]->size
);
2994 arg
->ids
= (__s32
*)calloc(arg
->ids_size
, sizeof(__s32
));
2995 for (__u32 k
= 0; k
< arg
->ids_size
; k
++)
2996 ::decode(arg
->ids
[k
], blp
);
2999 choose_args
[choose_args_index
] = arg_map
;
3002 update_choose_args(nullptr); // in case we decode a legacy "corrupted" map
3006 crush_destroy(crush
);
3011 void CrushWrapper::decode_crush_bucket(crush_bucket
** bptr
, bufferlist::iterator
&blp
)
3022 case CRUSH_BUCKET_UNIFORM
:
3023 size
= sizeof(crush_bucket_uniform
);
3025 case CRUSH_BUCKET_LIST
:
3026 size
= sizeof(crush_bucket_list
);
3028 case CRUSH_BUCKET_TREE
:
3029 size
= sizeof(crush_bucket_tree
);
3031 case CRUSH_BUCKET_STRAW
:
3032 size
= sizeof(crush_bucket_straw
);
3034 case CRUSH_BUCKET_STRAW2
:
3035 size
= sizeof(crush_bucket_straw2
);
3040 snprintf(str
, sizeof(str
), "unsupported bucket algorithm: %d", alg
);
3041 throw buffer::malformed_input(str
);
3044 crush_bucket
*bucket
= reinterpret_cast<crush_bucket
*>(calloc(1, size
));
3047 ::decode(bucket
->id
, blp
);
3048 ::decode(bucket
->type
, blp
);
3049 ::decode(bucket
->alg
, blp
);
3050 ::decode(bucket
->hash
, blp
);
3051 ::decode(bucket
->weight
, blp
);
3052 ::decode(bucket
->size
, blp
);
3054 bucket
->items
= (__s32
*)calloc(1, bucket
->size
* sizeof(__s32
));
3055 for (unsigned j
= 0; j
< bucket
->size
; ++j
) {
3056 ::decode(bucket
->items
[j
], blp
);
3059 switch (bucket
->alg
) {
3060 case CRUSH_BUCKET_UNIFORM
:
3061 ::decode((reinterpret_cast<crush_bucket_uniform
*>(bucket
))->item_weight
, blp
);
3064 case CRUSH_BUCKET_LIST
: {
3065 crush_bucket_list
* cbl
= reinterpret_cast<crush_bucket_list
*>(bucket
);
3066 cbl
->item_weights
= (__u32
*)calloc(1, bucket
->size
* sizeof(__u32
));
3067 cbl
->sum_weights
= (__u32
*)calloc(1, bucket
->size
* sizeof(__u32
));
3069 for (unsigned j
= 0; j
< bucket
->size
; ++j
) {
3070 ::decode(cbl
->item_weights
[j
], blp
);
3071 ::decode(cbl
->sum_weights
[j
], blp
);
3076 case CRUSH_BUCKET_TREE
: {
3077 crush_bucket_tree
* cbt
= reinterpret_cast<crush_bucket_tree
*>(bucket
);
3078 ::decode(cbt
->num_nodes
, blp
);
3079 cbt
->node_weights
= (__u32
*)calloc(1, cbt
->num_nodes
* sizeof(__u32
));
3080 for (unsigned j
=0; j
<cbt
->num_nodes
; j
++) {
3081 ::decode(cbt
->node_weights
[j
], blp
);
3086 case CRUSH_BUCKET_STRAW
: {
3087 crush_bucket_straw
* cbs
= reinterpret_cast<crush_bucket_straw
*>(bucket
);
3088 cbs
->straws
= (__u32
*)calloc(1, bucket
->size
* sizeof(__u32
));
3089 cbs
->item_weights
= (__u32
*)calloc(1, bucket
->size
* sizeof(__u32
));
3090 for (unsigned j
= 0; j
< bucket
->size
; ++j
) {
3091 ::decode(cbs
->item_weights
[j
], blp
);
3092 ::decode(cbs
->straws
[j
], blp
);
3097 case CRUSH_BUCKET_STRAW2
: {
3098 crush_bucket_straw2
* cbs
= reinterpret_cast<crush_bucket_straw2
*>(bucket
);
3099 cbs
->item_weights
= (__u32
*)calloc(1, bucket
->size
* sizeof(__u32
));
3100 for (unsigned j
= 0; j
< bucket
->size
; ++j
) {
3101 ::decode(cbs
->item_weights
[j
], blp
);
3107 // We should have handled this case in the first switch statement
3114 void CrushWrapper::dump(Formatter
*f
) const
3116 f
->open_array_section("devices");
3117 for (int i
=0; i
<get_max_devices(); i
++) {
3118 f
->open_object_section("device");
3119 f
->dump_int("id", i
);
3120 const char *n
= get_item_name(i
);
3122 f
->dump_string("name", n
);
3125 sprintf(name
, "device%d", i
);
3126 f
->dump_string("name", name
);
3128 const char *device_class
= get_item_class(i
);
3129 if (device_class
!= NULL
)
3130 f
->dump_string("class", device_class
);
3135 f
->open_array_section("types");
3136 int n
= get_num_type_names();
3137 for (int i
=0; n
; i
++) {
3138 const char *name
= get_type_name(i
);
3141 f
->open_object_section("type");
3142 f
->dump_int("type_id", 0);
3143 f
->dump_string("name", "device");
3149 f
->open_object_section("type");
3150 f
->dump_int("type_id", i
);
3151 f
->dump_string("name", name
);
3156 f
->open_array_section("buckets");
3157 for (int bucket
= -1; bucket
> -1-get_max_buckets(); --bucket
) {
3158 if (!bucket_exists(bucket
))
3160 f
->open_object_section("bucket");
3161 f
->dump_int("id", bucket
);
3162 if (get_item_name(bucket
))
3163 f
->dump_string("name", get_item_name(bucket
));
3164 f
->dump_int("type_id", get_bucket_type(bucket
));
3165 if (get_type_name(get_bucket_type(bucket
)))
3166 f
->dump_string("type_name", get_type_name(get_bucket_type(bucket
)));
3167 f
->dump_int("weight", get_bucket_weight(bucket
));
3168 f
->dump_string("alg", crush_bucket_alg_name(get_bucket_alg(bucket
)));
3169 f
->dump_string("hash", crush_hash_name(get_bucket_hash(bucket
)));
3170 f
->open_array_section("items");
3171 for (int j
=0; j
<get_bucket_size(bucket
); j
++) {
3172 f
->open_object_section("item");
3173 f
->dump_int("id", get_bucket_item(bucket
, j
));
3174 f
->dump_int("weight", get_bucket_item_weight(bucket
, j
));
3175 f
->dump_int("pos", j
);
3183 f
->open_array_section("rules");
3187 f
->open_object_section("tunables");
3191 dump_choose_args(f
);
3195 // depth first walker
3197 typedef CrushTreeDumper::Item Item
;
3198 const CrushWrapper
*crush
;
3199 const CrushTreeDumper::name_map_t
& weight_set_names
;
3201 explicit TreeDumper(const CrushWrapper
*crush
,
3202 const CrushTreeDumper::name_map_t
& wsnames
)
3203 : crush(crush
), weight_set_names(wsnames
) {}
3205 void dump(Formatter
*f
) {
3207 crush
->find_roots(&roots
);
3208 for (set
<int>::iterator root
= roots
.begin(); root
!= roots
.end(); ++root
) {
3209 dump_item(Item(*root
, 0, 0, crush
->get_bucket_weightf(*root
)), f
);
3214 void dump_item(const Item
& qi
, Formatter
* f
) {
3215 if (qi
.is_bucket()) {
3216 f
->open_object_section("bucket");
3217 CrushTreeDumper::dump_item_fields(crush
, weight_set_names
, qi
, f
);
3218 dump_bucket_children(qi
, f
);
3221 f
->open_object_section("device");
3222 CrushTreeDumper::dump_item_fields(crush
, weight_set_names
, qi
, f
);
3227 void dump_bucket_children(const Item
& parent
, Formatter
* f
) {
3228 f
->open_array_section("items");
3229 const int max_pos
= crush
->get_bucket_size(parent
.id
);
3230 for (int pos
= 0; pos
< max_pos
; pos
++) {
3231 int id
= crush
->get_bucket_item(parent
.id
, pos
);
3232 float weight
= crush
->get_bucket_item_weightf(parent
.id
, pos
);
3233 dump_item(Item(id
, parent
.id
, parent
.depth
+ 1, weight
), f
);
3240 void CrushWrapper::dump_tree(
3242 const CrushTreeDumper::name_map_t
& weight_set_names
) const
3245 TreeDumper(this, weight_set_names
).dump(f
);
3248 void CrushWrapper::dump_tunables(Formatter
*f
) const
3250 f
->dump_int("choose_local_tries", get_choose_local_tries());
3251 f
->dump_int("choose_local_fallback_tries", get_choose_local_fallback_tries());
3252 f
->dump_int("choose_total_tries", get_choose_total_tries());
3253 f
->dump_int("chooseleaf_descend_once", get_chooseleaf_descend_once());
3254 f
->dump_int("chooseleaf_vary_r", get_chooseleaf_vary_r());
3255 f
->dump_int("chooseleaf_stable", get_chooseleaf_stable());
3256 f
->dump_int("straw_calc_version", get_straw_calc_version());
3257 f
->dump_int("allowed_bucket_algs", get_allowed_bucket_algs());
3259 // be helpful about it
3260 if (has_jewel_tunables())
3261 f
->dump_string("profile", "jewel");
3262 else if (has_hammer_tunables())
3263 f
->dump_string("profile", "hammer");
3264 else if (has_firefly_tunables())
3265 f
->dump_string("profile", "firefly");
3266 else if (has_bobtail_tunables())
3267 f
->dump_string("profile", "bobtail");
3268 else if (has_argonaut_tunables())
3269 f
->dump_string("profile", "argonaut");
3271 f
->dump_string("profile", "unknown");
3272 f
->dump_int("optimal_tunables", (int)has_optimal_tunables());
3273 f
->dump_int("legacy_tunables", (int)has_legacy_tunables());
3275 // be helpful about minimum version required
3276 f
->dump_string("minimum_required_version", get_min_required_version());
3278 f
->dump_int("require_feature_tunables", (int)has_nondefault_tunables());
3279 f
->dump_int("require_feature_tunables2", (int)has_nondefault_tunables2());
3280 f
->dump_int("has_v2_rules", (int)has_v2_rules());
3281 f
->dump_int("require_feature_tunables3", (int)has_nondefault_tunables3());
3282 f
->dump_int("has_v3_rules", (int)has_v3_rules());
3283 f
->dump_int("has_v4_buckets", (int)has_v4_buckets());
3284 f
->dump_int("require_feature_tunables5", (int)has_nondefault_tunables5());
3285 f
->dump_int("has_v5_rules", (int)has_v5_rules());
3288 void CrushWrapper::dump_choose_args(Formatter
*f
) const
3290 f
->open_object_section("choose_args");
3291 for (auto c
: choose_args
) {
3292 crush_choose_arg_map arg_map
= c
.second
;
3293 f
->open_array_section(stringify(c
.first
).c_str());
3294 for (__u32 i
= 0; i
< arg_map
.size
; i
++) {
3295 crush_choose_arg
*arg
= &arg_map
.args
[i
];
3296 if (arg
->weight_set_positions
== 0 &&
3299 f
->open_object_section("choose_args");
3300 int bucket_index
= i
;
3301 f
->dump_int("bucket_id", -1-bucket_index
);
3302 if (arg
->weight_set_positions
> 0) {
3303 f
->open_array_section("weight_set");
3304 for (__u32 j
= 0; j
< arg
->weight_set_positions
; j
++) {
3305 f
->open_array_section("weights");
3306 __u32
*weights
= arg
->weight_set
[j
].weights
;
3307 __u32 size
= arg
->weight_set
[j
].size
;
3308 for (__u32 k
= 0; k
< size
; k
++) {
3309 f
->dump_float("weight", (float)weights
[k
]/(float)0x10000);
3315 if (arg
->ids_size
> 0) {
3316 f
->open_array_section("ids");
3317 for (__u32 j
= 0; j
< arg
->ids_size
; j
++)
3318 f
->dump_int("id", arg
->ids
[j
]);
3328 void CrushWrapper::dump_rules(Formatter
*f
) const
3330 for (int i
=0; i
<get_max_rules(); i
++) {
3331 if (!rule_exists(i
))
3337 void CrushWrapper::dump_rule(int ruleset
, Formatter
*f
) const
3339 f
->open_object_section("rule");
3340 f
->dump_int("rule_id", ruleset
);
3341 if (get_rule_name(ruleset
))
3342 f
->dump_string("rule_name", get_rule_name(ruleset
));
3343 f
->dump_int("ruleset", get_rule_mask_ruleset(ruleset
));
3344 f
->dump_int("type", get_rule_mask_type(ruleset
));
3345 f
->dump_int("min_size", get_rule_mask_min_size(ruleset
));
3346 f
->dump_int("max_size", get_rule_mask_max_size(ruleset
));
3347 f
->open_array_section("steps");
3348 for (int j
=0; j
<get_rule_len(ruleset
); j
++) {
3349 f
->open_object_section("step");
3350 switch (get_rule_op(ruleset
, j
)) {
3351 case CRUSH_RULE_NOOP
:
3352 f
->dump_string("op", "noop");
3354 case CRUSH_RULE_TAKE
:
3355 f
->dump_string("op", "take");
3357 int item
= get_rule_arg1(ruleset
, j
);
3358 f
->dump_int("item", item
);
3360 const char *name
= get_item_name(item
);
3361 f
->dump_string("item_name", name
? name
: "");
3364 case CRUSH_RULE_EMIT
:
3365 f
->dump_string("op", "emit");
3367 case CRUSH_RULE_CHOOSE_FIRSTN
:
3368 f
->dump_string("op", "choose_firstn");
3369 f
->dump_int("num", get_rule_arg1(ruleset
, j
));
3370 f
->dump_string("type", get_type_name(get_rule_arg2(ruleset
, j
)));
3372 case CRUSH_RULE_CHOOSE_INDEP
:
3373 f
->dump_string("op", "choose_indep");
3374 f
->dump_int("num", get_rule_arg1(ruleset
, j
));
3375 f
->dump_string("type", get_type_name(get_rule_arg2(ruleset
, j
)));
3377 case CRUSH_RULE_CHOOSELEAF_FIRSTN
:
3378 f
->dump_string("op", "chooseleaf_firstn");
3379 f
->dump_int("num", get_rule_arg1(ruleset
, j
));
3380 f
->dump_string("type", get_type_name(get_rule_arg2(ruleset
, j
)));
3382 case CRUSH_RULE_CHOOSELEAF_INDEP
:
3383 f
->dump_string("op", "chooseleaf_indep");
3384 f
->dump_int("num", get_rule_arg1(ruleset
, j
));
3385 f
->dump_string("type", get_type_name(get_rule_arg2(ruleset
, j
)));
3387 case CRUSH_RULE_SET_CHOOSE_TRIES
:
3388 f
->dump_string("op", "set_choose_tries");
3389 f
->dump_int("num", get_rule_arg1(ruleset
, j
));
3391 case CRUSH_RULE_SET_CHOOSELEAF_TRIES
:
3392 f
->dump_string("op", "set_chooseleaf_tries");
3393 f
->dump_int("num", get_rule_arg1(ruleset
, j
));
3396 f
->dump_int("opcode", get_rule_op(ruleset
, j
));
3397 f
->dump_int("arg1", get_rule_arg1(ruleset
, j
));
3398 f
->dump_int("arg2", get_rule_arg2(ruleset
, j
));
3406 void CrushWrapper::list_rules(Formatter
*f
) const
3408 for (int rule
= 0; rule
< get_max_rules(); rule
++) {
3409 if (!rule_exists(rule
))
3411 f
->dump_string("name", get_rule_name(rule
));
3415 void CrushWrapper::list_rules(ostream
*ss
) const
3417 for (int rule
= 0; rule
< get_max_rules(); rule
++) {
3418 if (!rule_exists(rule
))
3420 *ss
<< get_rule_name(rule
) << "\n";
3424 class CrushTreePlainDumper
: public CrushTreeDumper::Dumper
<TextTable
> {
3426 typedef CrushTreeDumper::Dumper
<TextTable
> Parent
;
3428 explicit CrushTreePlainDumper(const CrushWrapper
*crush
,
3429 const CrushTreeDumper::name_map_t
& wsnames
)
3430 : Parent(crush
, wsnames
) {}
3431 explicit CrushTreePlainDumper(const CrushWrapper
*crush
,
3432 const CrushTreeDumper::name_map_t
& wsnames
,
3434 : Parent(crush
, wsnames
, show_shadow
) {}
3437 void dump(TextTable
*tbl
) {
3438 tbl
->define_column("ID", TextTable::LEFT
, TextTable::RIGHT
);
3439 tbl
->define_column("CLASS", TextTable::LEFT
, TextTable::RIGHT
);
3440 tbl
->define_column("WEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
3441 for (auto& p
: crush
->choose_args
) {
3442 if (p
.first
== CrushWrapper::DEFAULT_CHOOSE_ARGS
) {
3443 tbl
->define_column("(compat)", TextTable::LEFT
, TextTable::RIGHT
);
3446 auto q
= weight_set_names
.find(p
.first
);
3447 name
= q
!= weight_set_names
.end() ? q
->second
:
3449 tbl
->define_column(name
.c_str(), TextTable::LEFT
, TextTable::RIGHT
);
3452 tbl
->define_column("TYPE NAME", TextTable::LEFT
, TextTable::LEFT
);
3457 void dump_item(const CrushTreeDumper::Item
&qi
, TextTable
*tbl
) override
{
3458 const char *c
= crush
->get_item_class(qi
.id
);
3463 << weightf_t(qi
.weight
);
3464 for (auto& p
: crush
->choose_args
) {
3465 if (qi
.parent
< 0) {
3466 const crush_choose_arg_map cmap
= crush
->choose_args_get(p
.first
);
3467 int bidx
= -1 - qi
.parent
;
3468 const crush_bucket
*b
= crush
->get_bucket(qi
.parent
);
3470 bidx
< (int)cmap
.size
&&
3471 cmap
.args
[bidx
].weight_set
&&
3472 cmap
.args
[bidx
].weight_set_positions
>= 1) {
3475 pos
< (int)cmap
.args
[bidx
].weight_set
[0].size
&&
3476 b
->items
[pos
] != qi
.id
;
3478 *tbl
<< weightf_t((float)cmap
.args
[bidx
].weight_set
[0].weights
[pos
] /
3486 for (int k
=0; k
< qi
.depth
; k
++) {
3489 if (qi
.is_bucket()) {
3490 ss
<< crush
->get_type_name(crush
->get_bucket_type(qi
.id
)) << " "
3491 << crush
->get_item_name(qi
.id
);
3493 ss
<< "osd." << qi
.id
;
3496 *tbl
<< TextTable::endrow
;
3501 class CrushTreeFormattingDumper
: public CrushTreeDumper::FormattingDumper
{
3503 typedef CrushTreeDumper::FormattingDumper Parent
;
3505 explicit CrushTreeFormattingDumper(
3506 const CrushWrapper
*crush
,
3507 const CrushTreeDumper::name_map_t
& wsnames
)
3508 : Parent(crush
, wsnames
) {}
3510 explicit CrushTreeFormattingDumper(
3511 const CrushWrapper
*crush
,
3512 const CrushTreeDumper::name_map_t
& wsnames
,
3514 : Parent(crush
, wsnames
, show_shadow
) {}
3516 void dump(Formatter
*f
) {
3517 f
->open_array_section("nodes");
3521 // There is no stray bucket whose id is a negative number, so just get
3522 // the max_id and iterate from 0 to max_id to dump stray osds.
3523 f
->open_array_section("stray");
3524 int32_t max_id
= -1;
3525 if (!crush
->name_map
.empty()) {
3526 max_id
= crush
->name_map
.rbegin()->first
;
3528 for (int32_t i
= 0; i
<= max_id
; i
++) {
3529 if (crush
->item_exists(i
) && !is_touched(i
) && should_dump(i
)) {
3530 dump_item(CrushTreeDumper::Item(i
, 0, 0, 0), f
);
3538 void CrushWrapper::dump_tree(
3541 const CrushTreeDumper::name_map_t
& weight_set_names
,
3542 bool show_shadow
) const
3546 CrushTreePlainDumper(this, weight_set_names
, show_shadow
).dump(&tbl
);
3550 CrushTreeFormattingDumper(this, weight_set_names
, show_shadow
).dump(f
);
3554 void CrushWrapper::generate_test_instances(list
<CrushWrapper
*>& o
)
3556 o
.push_back(new CrushWrapper
);
3561 * Determine the default CRUSH ruleset ID to be used with
3562 * newly created replicated pools.
3564 * @returns a ruleset ID (>=0) or -1 if no suitable ruleset found
3566 int CrushWrapper::get_osd_pool_default_crush_replicated_ruleset(CephContext
*cct
)
3568 int crush_ruleset
= cct
->_conf
->osd_pool_default_crush_rule
;
3569 if (crush_ruleset
< 0) {
3570 crush_ruleset
= find_first_ruleset(pg_pool_t::TYPE_REPLICATED
);
3571 } else if (!ruleset_exists(crush_ruleset
)) {
3572 crush_ruleset
= -1; // match find_first_ruleset() retval
3574 return crush_ruleset
;
3577 bool CrushWrapper::is_valid_crush_name(const string
& s
)
3581 for (string::const_iterator p
= s
.begin(); p
!= s
.end(); ++p
) {
3585 !(*p
>= '0' && *p
<= '9') &&
3586 !(*p
>= 'A' && *p
<= 'Z') &&
3587 !(*p
>= 'a' && *p
<= 'z'))
3593 bool CrushWrapper::is_valid_crush_loc(CephContext
*cct
,
3594 const map
<string
,string
>& loc
)
3596 for (map
<string
,string
>::const_iterator l
= loc
.begin(); l
!= loc
.end(); ++l
) {
3597 if (!is_valid_crush_name(l
->first
) ||
3598 !is_valid_crush_name(l
->second
)) {
3599 ldout(cct
, 1) << "loc["
3600 << l
->first
<< "] = '"
3601 << l
->second
<< "' not a valid crush name ([A-Za-z0-9_-.]+)"
3609 int CrushWrapper::_choose_type_stack(
3611 const vector
<pair
<int,int>>& stack
,
3612 const set
<int>& overfull
,
3613 const vector
<int>& underfull
,
3614 const vector
<int>& orig
,
3615 vector
<int>::const_iterator
& i
,
3617 vector
<int> *pw
) const
3619 vector
<int> w
= *pw
;
3622 ldout(cct
, 10) << __func__
<< " stack " << stack
3628 vector
<int> cumulative_fanout(stack
.size());
3630 for (int j
= (int)stack
.size() - 1; j
>= 0; --j
) {
3631 cumulative_fanout
[j
] = f
;
3632 f
*= stack
[j
].second
;
3634 ldout(cct
, 10) << __func__
<< " cumulative_fanout " << cumulative_fanout
3637 // identify underful targets for each intermediate level.
3638 // this serves two purposes:
3639 // 1. we can tell when we are selecting a bucket that does not have any underfull
3640 // devices beneath it. that means that if the current input includes an overfull
3641 // device, we won't be able to find an underfull device with this parent to
3643 // 2. when we decide we should reject a bucket due to the above, this list gives us
3644 // a list of peers to consider that *do* have underfull devices available.. (we
3645 // are careful to pick one that has the same parent.)
3646 vector
<set
<int>> underfull_buckets
; // level -> set of buckets with >0 underfull item(s)
3647 underfull_buckets
.resize(stack
.size() - 1);
3648 for (auto osd
: underfull
) {
3650 for (int j
= (int)stack
.size() - 2; j
>= 0; --j
) {
3651 int type
= stack
[j
].first
;
3652 item
= get_parent_of_type(item
, type
);
3653 ldout(cct
, 10) << __func__
<< " underfull " << osd
<< " type " << type
3654 << " is " << item
<< dendl
;
3655 underfull_buckets
[j
].insert(item
);
3658 ldout(cct
, 20) << __func__
<< " underfull_buckets " << underfull_buckets
<< dendl
;
3660 for (unsigned j
= 0; j
< stack
.size(); ++j
) {
3661 int type
= stack
[j
].first
;
3662 int fanout
= stack
[j
].second
;
3663 int cum_fanout
= cumulative_fanout
[j
];
3664 ldout(cct
, 10) << " level " << j
<< ": type " << type
<< " fanout " << fanout
3665 << " cumulative " << cum_fanout
3666 << " w " << w
<< dendl
;
3669 if (i
== orig
.end()) {
3670 ldout(cct
, 10) << __func__
<< " end of orig, break 0" << dendl
;
3673 for (auto from
: w
) {
3674 ldout(cct
, 10) << " from " << from
<< dendl
;
3675 // identify leaves under each choice. we use this to check whether any of these
3676 // leaves are overfull. (if so, we need to make sure there are underfull candidates
3677 // to swap for them.)
3678 vector
<set
<int>> leaves
;
3679 leaves
.resize(fanout
);
3680 for (int pos
= 0; pos
< fanout
; ++pos
) {
3683 int item
= get_parent_of_type(*tmpi
, type
);
3686 while (n
-- && tmpi
!= orig
.end()) {
3687 leaves
[pos
].insert(*tmpi
++);
3689 ldout(cct
, 10) << __func__
<< " from " << *tmpi
<< " got " << item
3690 << " of type " << type
<< " over leaves " << leaves
[pos
] << dendl
;
3693 bool replaced
= false;
3694 if (overfull
.count(*i
)) {
3695 for (auto item
: underfull
) {
3696 ldout(cct
, 10) << __func__
<< " pos " << pos
3697 << " was " << *i
<< " considering " << item
3699 if (used
.count(item
)) {
3700 ldout(cct
, 20) << __func__
<< " in used " << used
<< dendl
;
3703 if (!subtree_contains(from
, item
)) {
3704 ldout(cct
, 20) << __func__
<< " not in subtree " << from
<< dendl
;
3707 if (std::find(orig
.begin(), orig
.end(), item
) != orig
.end()) {
3708 ldout(cct
, 20) << __func__
<< " in orig " << orig
<< dendl
;
3713 ldout(cct
, 10) << __func__
<< " pos " << pos
<< " replace "
3714 << *i
<< " -> " << item
<< dendl
;
3716 assert(i
!= orig
.end());
3722 ldout(cct
, 10) << __func__
<< " pos " << pos
<< " keep " << *i
3724 assert(i
!= orig
.end());
3728 if (i
== orig
.end()) {
3729 ldout(cct
, 10) << __func__
<< " end of orig, break 1" << dendl
;
3734 if (j
+ 1 < stack
.size()) {
3735 // check if any buckets have overfull leaves but no underfull candidates
3736 for (int pos
= 0; pos
< fanout
; ++pos
) {
3737 if (underfull_buckets
[j
].count(o
[pos
]) == 0) {
3738 // are any leaves overfull?
3739 bool any_overfull
= false;
3740 for (auto osd
: leaves
[pos
]) {
3741 if (overfull
.count(osd
)) {
3742 any_overfull
= true;
3746 ldout(cct
, 10) << " bucket " << o
[pos
] << " has no underfull targets and "
3747 << ">0 leaves " << leaves
[pos
] << " is overfull; alts "
3748 << underfull_buckets
[j
]
3750 for (auto alt
: underfull_buckets
[j
]) {
3751 if (std::find(o
.begin(), o
.end(), alt
) == o
.end()) {
3752 // see if alt has the same parent
3754 get_parent_of_type(o
[pos
], stack
[j
-1].first
) ==
3755 get_parent_of_type(alt
, stack
[j
-1].first
)) {
3757 ldout(cct
, 10) << " replacing " << o
[pos
]
3758 << " (which has no underfull leaves) with " << alt
3760 << get_parent_of_type(alt
, stack
[j
-1].first
) << " type "
3761 << type
<< ")" << dendl
;
3763 ldout(cct
, 10) << " replacing " << o
[pos
]
3764 << " (which has no underfull leaves) with " << alt
3765 << " (first level)" << dendl
;
3769 ldout(cct
, 30) << " alt " << alt
<< " for " << o
[pos
]
3770 << " has different parent, skipping" << dendl
;
3778 if (i
== orig
.end()) {
3779 ldout(cct
, 10) << __func__
<< " end of orig, break 2" << dendl
;
3783 ldout(cct
, 10) << __func__
<< " w <- " << o
<< " was " << w
<< dendl
;
3790 int CrushWrapper::try_remap_rule(
3794 const set
<int>& overfull
,
3795 const vector
<int>& underfull
,
3796 const vector
<int>& orig
,
3797 vector
<int> *out
) const
3799 const crush_map
*map
= crush
;
3800 const crush_rule
*rule
= get_rule(ruleno
);
3803 ldout(cct
, 10) << __func__
<< " ruleno " << ruleno
3804 << " numrep " << maxout
<< " overfull " << overfull
3805 << " underfull " << underfull
<< " orig " << orig
3807 vector
<int> w
; // working set
3810 auto i
= orig
.begin();
3813 vector
<pair
<int,int>> type_stack
; // (type, fan-out)
3815 for (unsigned step
= 0; step
< rule
->len
; ++step
) {
3816 const crush_rule_step
*curstep
= &rule
->steps
[step
];
3817 ldout(cct
, 10) << __func__
<< " step " << step
<< " w " << w
<< dendl
;
3818 switch (curstep
->op
) {
3819 case CRUSH_RULE_TAKE
:
3820 if ((curstep
->arg1
>= 0 && curstep
->arg1
< map
->max_devices
) ||
3821 (-1-curstep
->arg1
>= 0 && -1-curstep
->arg1
< map
->max_buckets
&&
3822 map
->buckets
[-1-curstep
->arg1
])) {
3824 w
.push_back(curstep
->arg1
);
3825 ldout(cct
, 10) << __func__
<< " take " << w
<< dendl
;
3827 ldout(cct
, 1) << " bad take value " << curstep
->arg1
<< dendl
;
3831 case CRUSH_RULE_CHOOSELEAF_FIRSTN
:
3832 case CRUSH_RULE_CHOOSELEAF_INDEP
:
3834 int numrep
= curstep
->arg1
;
3835 int type
= curstep
->arg2
;
3838 type_stack
.push_back(make_pair(type
, numrep
));
3840 type_stack
.push_back(make_pair(0, 1));
3841 int r
= _choose_type_stack(cct
, type_stack
, overfull
, underfull
, orig
,
3849 case CRUSH_RULE_CHOOSE_FIRSTN
:
3850 case CRUSH_RULE_CHOOSE_INDEP
:
3852 int numrep
= curstep
->arg1
;
3853 int type
= curstep
->arg2
;
3856 type_stack
.push_back(make_pair(type
, numrep
));
3860 case CRUSH_RULE_EMIT
:
3861 ldout(cct
, 10) << " emit " << w
<< dendl
;
3862 if (!type_stack
.empty()) {
3863 int r
= _choose_type_stack(cct
, type_stack
, overfull
, underfull
, orig
,
3869 for (auto item
: w
) {
3870 out
->push_back(item
);
3885 int CrushWrapper::_choose_args_adjust_item_weight_in_bucket(
3887 crush_choose_arg_map cmap
,
3890 const vector
<int>& weight
,
3894 int bidx
= -1 - bucketid
;
3895 crush_bucket
*b
= crush
->buckets
[bidx
];
3896 if (bidx
>= (int)cmap
.size
) {
3898 *ss
<< "no weight-set for bucket " << b
->id
;
3899 ldout(cct
, 10) << __func__
<< " no crush_choose_arg for bucket " << b
->id
3903 crush_choose_arg
*carg
= &cmap
.args
[bidx
];
3904 if (carg
->weight_set
== NULL
) {
3905 // create a weight-set for this bucket and populate it with the
3907 unsigned positions
= get_choose_args_positions(cmap
);
3908 carg
->weight_set_positions
= positions
;
3909 carg
->weight_set
= static_cast<crush_weight_set
*>(
3910 calloc(sizeof(crush_weight_set
), positions
));
3911 for (unsigned p
= 0; p
< positions
; ++p
) {
3912 carg
->weight_set
[p
].size
= b
->size
;
3913 carg
->weight_set
[p
].weights
= (__u32
*)calloc(b
->size
, sizeof(__u32
));
3914 for (unsigned i
= 0; i
< b
->size
; ++i
) {
3915 carg
->weight_set
[p
].weights
[i
] = crush_get_bucket_item_weight(b
, i
);
3920 if (carg
->weight_set_positions
!= weight
.size()) {
3922 *ss
<< "weight_set_positions != " << weight
.size() << " for bucket " << b
->id
;
3923 ldout(cct
, 10) << __func__
<< " weight_set_positions != " << weight
.size()
3924 << " for bucket " << b
->id
<< dendl
;
3927 for (unsigned i
= 0; i
< b
->size
; i
++) {
3928 if (b
->items
[i
] == id
) {
3929 for (unsigned j
= 0; j
< weight
.size(); ++j
) {
3930 carg
->weight_set
[j
].weights
[i
] = weight
[j
];
3932 ldout(cct
, 5) << __func__
<< " set " << id
<< " to " << weight
3933 << " in bucket " << b
->id
<< dendl
;
3938 vector
<int> bucket_weight(weight
.size(), 0);
3939 for (unsigned i
= 0; i
< b
->size
; i
++) {
3940 for (unsigned j
= 0; j
< weight
.size(); ++j
) {
3941 bucket_weight
[j
] += carg
->weight_set
[j
].weights
[i
];
3944 choose_args_adjust_item_weight(cct
, cmap
, b
->id
, bucket_weight
, nullptr);
3949 int CrushWrapper::choose_args_adjust_item_weight(
3951 crush_choose_arg_map cmap
,
3953 const vector
<int>& weight
,
3956 ldout(cct
, 5) << __func__
<< " " << id
<< " weight " << weight
<< dendl
;
3958 for (int bidx
= 0; bidx
< crush
->max_buckets
; bidx
++) {
3959 crush_bucket
*b
= crush
->buckets
[bidx
];
3963 changed
+= _choose_args_adjust_item_weight_in_bucket(
3964 cct
, cmap
, b
->id
, id
, weight
, ss
);
3968 *ss
<< "item " << id
<< " not found in crush map";