1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "osd/osd_types.h"
5 #include "common/debug.h"
6 #include "common/Formatter.h"
7 #include "common/errno.h"
8 #include "include/stringify.h"
10 #include "CrushWrapper.h"
11 #include "CrushTreeDumper.h"
13 #define dout_subsys ceph_subsys_crush
15 bool CrushWrapper::has_legacy_rulesets() const
17 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
18 crush_rule
*r
= crush
->rules
[i
];
20 r
->mask
.ruleset
!= i
) {
27 int CrushWrapper::renumber_rules_by_ruleset()
30 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
31 crush_rule
*r
= crush
->rules
[i
];
32 if (r
&& r
->mask
.ruleset
>= max_ruleset
) {
33 max_ruleset
= r
->mask
.ruleset
+ 1;
36 struct crush_rule
**newrules
=
37 (crush_rule
**)calloc(1, max_ruleset
* sizeof(crush_rule
*));
38 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
39 crush_rule
*r
= crush
->rules
[i
];
42 if (newrules
[r
->mask
.ruleset
]) {
43 // collision, we can't do it.
47 newrules
[r
->mask
.ruleset
] = r
;
52 crush
->rules
= newrules
;
53 crush
->max_rules
= max_ruleset
;
57 bool CrushWrapper::has_multirule_rulesets() const
59 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
60 crush_rule
*r
= crush
->rules
[i
];
63 for (unsigned j
=i
+1; j
<crush
->max_rules
; j
++) {
64 crush_rule
*s
= crush
->rules
[j
];
67 if (r
->mask
.ruleset
== s
->mask
.ruleset
)
74 bool CrushWrapper::has_v2_rules() const
76 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
84 bool CrushWrapper::is_v2_rule(unsigned ruleid
) const
86 // check rule for use of indep or new SET_* rule steps
87 if (ruleid
>= crush
->max_rules
)
89 crush_rule
*r
= crush
->rules
[ruleid
];
92 for (unsigned j
=0; j
<r
->len
; j
++) {
93 if (r
->steps
[j
].op
== CRUSH_RULE_CHOOSE_INDEP
||
94 r
->steps
[j
].op
== CRUSH_RULE_CHOOSELEAF_INDEP
||
95 r
->steps
[j
].op
== CRUSH_RULE_SET_CHOOSE_TRIES
||
96 r
->steps
[j
].op
== CRUSH_RULE_SET_CHOOSELEAF_TRIES
) {
103 bool CrushWrapper::has_v3_rules() const
105 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
113 bool CrushWrapper::is_v3_rule(unsigned ruleid
) const
115 // check rule for use of SET_CHOOSELEAF_VARY_R step
116 if (ruleid
>= crush
->max_rules
)
118 crush_rule
*r
= crush
->rules
[ruleid
];
121 for (unsigned j
=0; j
<r
->len
; j
++) {
122 if (r
->steps
[j
].op
== CRUSH_RULE_SET_CHOOSELEAF_VARY_R
) {
129 bool CrushWrapper::has_v4_buckets() const
131 for (int i
=0; i
<crush
->max_buckets
; ++i
) {
132 crush_bucket
*b
= crush
->buckets
[i
];
135 if (b
->alg
== CRUSH_BUCKET_STRAW2
)
141 bool CrushWrapper::has_v5_rules() const
143 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
151 bool CrushWrapper::is_v5_rule(unsigned ruleid
) const
153 // check rule for use of SET_CHOOSELEAF_STABLE step
154 if (ruleid
>= crush
->max_rules
)
156 crush_rule
*r
= crush
->rules
[ruleid
];
159 for (unsigned j
=0; j
<r
->len
; j
++) {
160 if (r
->steps
[j
].op
== CRUSH_RULE_SET_CHOOSELEAF_STABLE
) {
167 bool CrushWrapper::has_choose_args() const
169 return !choose_args
.empty();
172 bool CrushWrapper::has_incompat_choose_args() const
174 if (choose_args
.empty())
176 if (choose_args
.size() > 1)
178 crush_choose_arg_map arg_map
= choose_args
.begin()->second
;
179 for (__u32 i
= 0; i
< arg_map
.size
; i
++) {
180 crush_choose_arg
*arg
= &arg_map
.args
[i
];
181 if (arg
->weight_set_size
== 0 &&
184 if (arg
->weight_set_size
!= 1)
186 if (arg
->ids_size
!= 0)
192 int CrushWrapper::split_id_class(int i
, int *idout
, int *classout
) const
196 string name
= get_item_name(i
);
197 size_t pos
= name
.find("~");
198 if (pos
== string::npos
) {
203 string name_no_class
= name
.substr(0, pos
);
204 if (!name_exists(name_no_class
))
206 string class_name
= name
.substr(pos
+ 1);
207 if (!class_exists(class_name
))
209 *idout
= get_item_id(name_no_class
);
210 *classout
= get_class_id(class_name
);
214 int CrushWrapper::can_rename_item(const string
& srcname
,
215 const string
& dstname
,
218 if (name_exists(srcname
)) {
219 if (name_exists(dstname
)) {
220 *ss
<< "dstname = '" << dstname
<< "' already exists";
223 if (is_valid_crush_name(dstname
)) {
226 *ss
<< "dstname = '" << dstname
<< "' does not match [-_.0-9a-zA-Z]+";
230 if (name_exists(dstname
)) {
231 *ss
<< "srcname = '" << srcname
<< "' does not exist "
232 << "and dstname = '" << dstname
<< "' already exists";
235 *ss
<< "srcname = '" << srcname
<< "' does not exist";
241 int CrushWrapper::rename_item(const string
& srcname
,
242 const string
& dstname
,
245 int ret
= can_rename_item(srcname
, dstname
, ss
);
248 int oldid
= get_item_id(srcname
);
249 return set_item_name(oldid
, dstname
);
252 int CrushWrapper::can_rename_bucket(const string
& srcname
,
253 const string
& dstname
,
256 int ret
= can_rename_item(srcname
, dstname
, ss
);
259 int srcid
= get_item_id(srcname
);
261 *ss
<< "srcname = '" << srcname
<< "' is not a bucket "
262 << "because its id = " << srcid
<< " is >= 0";
268 int CrushWrapper::rename_bucket(const string
& srcname
,
269 const string
& dstname
,
272 int ret
= can_rename_bucket(srcname
, dstname
, ss
);
275 int oldid
= get_item_id(srcname
);
276 return set_item_name(oldid
, dstname
);
279 void CrushWrapper::find_takes(set
<int>& roots
) const
281 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
282 crush_rule
*r
= crush
->rules
[i
];
285 for (unsigned j
=0; j
<r
->len
; j
++) {
286 if (r
->steps
[j
].op
== CRUSH_RULE_TAKE
)
287 roots
.insert(r
->steps
[j
].arg1
);
292 void CrushWrapper::find_roots(set
<int>& roots
) const
294 for (int i
= 0; i
< crush
->max_buckets
; i
++) {
295 if (!crush
->buckets
[i
])
297 crush_bucket
*b
= crush
->buckets
[i
];
298 if (!_search_item_exists(b
->id
))
303 void CrushWrapper::find_nonshadow_roots(set
<int>& roots
) const
305 for (int i
= 0; i
< crush
->max_buckets
; i
++) {
306 if (!crush
->buckets
[i
])
308 crush_bucket
*b
= crush
->buckets
[i
];
309 if (_search_item_exists(b
->id
))
311 const char *name
= get_item_name(b
->id
);
312 if (name
&& !is_valid_crush_name(name
))
318 bool CrushWrapper::subtree_contains(int root
, int item
) const
324 return false; // root is a leaf
326 const crush_bucket
*b
= get_bucket(root
);
330 for (unsigned j
=0; j
<b
->size
; j
++) {
331 if (subtree_contains(b
->items
[j
], item
))
337 bool CrushWrapper::_maybe_remove_last_instance(CephContext
*cct
, int item
, bool unlink_only
)
340 if (_search_item_exists(item
)) {
343 if (item
< 0 && _bucket_is_in_use(item
)) {
347 if (item
< 0 && !unlink_only
) {
348 crush_bucket
*t
= get_bucket(item
);
349 ldout(cct
, 5) << "_maybe_remove_last_instance removing bucket " << item
<< dendl
;
350 crush_remove_bucket(crush
, t
);
351 if (class_bucket
.count(item
) != 0)
352 class_bucket
.erase(item
);
353 class_remove_item(item
);
355 if ((item
>= 0 || !unlink_only
) && name_map
.count(item
)) {
356 ldout(cct
, 5) << "_maybe_remove_last_instance removing name for item " << item
<< dendl
;
357 name_map
.erase(item
);
359 class_remove_item(item
);
364 int CrushWrapper::remove_root(int item
, bool unused
)
366 if (unused
&& _bucket_is_in_use(item
))
369 crush_bucket
*b
= get_bucket(item
);
373 for (unsigned n
= 0; n
< b
->size
; n
++) {
374 if (b
->items
[n
] >= 0)
376 int r
= remove_root(b
->items
[n
], unused
);
381 crush_remove_bucket(crush
, b
);
382 if (name_map
.count(item
) != 0) {
383 name_map
.erase(item
);
386 if (class_bucket
.count(item
) != 0)
387 class_bucket
.erase(item
);
391 int CrushWrapper::remove_item(CephContext
*cct
, int item
, bool unlink_only
)
393 ldout(cct
, 5) << "remove_item " << item
<< (unlink_only
? " unlink_only":"") << dendl
;
397 if (item
< 0 && !unlink_only
) {
398 crush_bucket
*t
= get_bucket(item
);
400 ldout(cct
, 1) << "remove_item bucket " << item
<< " does not exist" << dendl
;
405 ldout(cct
, 1) << "remove_item bucket " << item
<< " has " << t
->size
406 << " items, not empty" << dendl
;
409 if (_bucket_is_in_use(item
)) {
414 for (int i
= 0; i
< crush
->max_buckets
; i
++) {
415 if (!crush
->buckets
[i
])
417 crush_bucket
*b
= crush
->buckets
[i
];
419 for (unsigned i
=0; i
<b
->size
; ++i
) {
420 int id
= b
->items
[i
];
422 ldout(cct
, 5) << "remove_item removing item " << item
423 << " from bucket " << b
->id
<< dendl
;
424 bucket_remove_item(b
, item
);
425 adjust_item_weight(cct
, b
->id
, b
->weight
);
431 if (_maybe_remove_last_instance(cct
, item
, unlink_only
))
437 bool CrushWrapper::_search_item_exists(int item
) const
439 for (int i
= 0; i
< crush
->max_buckets
; i
++) {
440 if (!crush
->buckets
[i
])
442 crush_bucket
*b
= crush
->buckets
[i
];
443 for (unsigned j
=0; j
<b
->size
; ++j
) {
444 if (b
->items
[j
] == item
)
451 bool CrushWrapper::_bucket_is_in_use(int item
)
453 for (auto &i
: class_bucket
)
454 for (auto &j
: i
.second
)
455 if (j
.second
== item
)
457 for (unsigned i
= 0; i
< crush
->max_rules
; ++i
) {
458 crush_rule
*r
= crush
->rules
[i
];
461 for (unsigned j
= 0; j
< r
->len
; ++j
) {
462 if (r
->steps
[j
].op
== CRUSH_RULE_TAKE
) {
463 int step_item
= r
->steps
[j
].arg1
;
466 int res
= split_id_class(step_item
, &original_item
, &c
);
469 if (step_item
== item
|| original_item
== item
)
477 int CrushWrapper::_remove_item_under(CephContext
*cct
, int item
, int ancestor
, bool unlink_only
)
479 ldout(cct
, 5) << "_remove_item_under " << item
<< " under " << ancestor
480 << (unlink_only
? " unlink_only":"") << dendl
;
486 if (!bucket_exists(ancestor
))
491 crush_bucket
*b
= get_bucket(ancestor
);
492 for (unsigned i
=0; i
<b
->size
; ++i
) {
493 int id
= b
->items
[i
];
495 ldout(cct
, 5) << "_remove_item_under removing item " << item
<< " from bucket " << b
->id
<< dendl
;
496 bucket_remove_item(b
, item
);
497 adjust_item_weight(cct
, b
->id
, b
->weight
);
500 int r
= remove_item_under(cct
, item
, id
, unlink_only
);
508 int CrushWrapper::remove_item_under(CephContext
*cct
, int item
, int ancestor
, bool unlink_only
)
510 ldout(cct
, 5) << "remove_item_under " << item
<< " under " << ancestor
511 << (unlink_only
? " unlink_only":"") << dendl
;
513 if (!unlink_only
&& _bucket_is_in_use(item
)) {
517 int ret
= _remove_item_under(cct
, item
, ancestor
, unlink_only
);
521 if (item
< 0 && !unlink_only
) {
522 crush_bucket
*t
= get_bucket(item
);
524 ldout(cct
, 1) << "remove_item_under bucket " << item
525 << " does not exist" << dendl
;
530 ldout(cct
, 1) << "remove_item_under bucket " << item
<< " has " << t
->size
531 << " items, not empty" << dendl
;
536 if (_maybe_remove_last_instance(cct
, item
, unlink_only
))
542 int CrushWrapper::get_common_ancestor_distance(CephContext
*cct
, int id
,
543 const std::multimap
<string
,string
>& loc
)
545 ldout(cct
, 5) << __func__
<< " " << id
<< " " << loc
<< dendl
;
546 if (!item_exists(id
))
548 map
<string
,string
> id_loc
= get_full_location(id
);
549 ldout(cct
, 20) << " id is at " << id_loc
<< dendl
;
551 for (map
<int,string
>::const_iterator p
= type_map
.begin();
554 map
<string
,string
>::iterator ip
= id_loc
.find(p
->second
);
555 if (ip
== id_loc
.end())
557 for (std::multimap
<string
,string
>::const_iterator q
= loc
.find(p
->second
);
560 if (q
->first
!= p
->second
)
562 if (q
->second
== ip
->second
)
569 int CrushWrapper::parse_loc_map(const std::vector
<string
>& args
,
570 std::map
<string
,string
> *ploc
)
573 for (unsigned i
= 0; i
< args
.size(); ++i
) {
574 const char *s
= args
[i
].c_str();
575 const char *pos
= strchr(s
, '=');
578 string
key(s
, 0, pos
-s
);
581 (*ploc
)[key
] = value
;
588 int CrushWrapper::parse_loc_multimap(const std::vector
<string
>& args
,
589 std::multimap
<string
,string
> *ploc
)
592 for (unsigned i
= 0; i
< args
.size(); ++i
) {
593 const char *s
= args
[i
].c_str();
594 const char *pos
= strchr(s
, '=');
597 string
key(s
, 0, pos
-s
);
600 ploc
->insert(make_pair(key
, value
));
607 bool CrushWrapper::check_item_loc(CephContext
*cct
, int item
, const map
<string
,string
>& loc
,
610 ldout(cct
, 5) << "check_item_loc item " << item
<< " loc " << loc
<< dendl
;
612 for (map
<int,string
>::const_iterator p
= type_map
.begin(); p
!= type_map
.end(); ++p
) {
617 // ignore types that aren't specified in loc
618 map
<string
,string
>::const_iterator q
= loc
.find(p
->second
);
619 if (q
== loc
.end()) {
620 ldout(cct
, 2) << "warning: did not specify location for '" << p
->second
<< "' level (levels are "
621 << type_map
<< ")" << dendl
;
625 if (!name_exists(q
->second
)) {
626 ldout(cct
, 5) << "check_item_loc bucket " << q
->second
<< " dne" << dendl
;
630 int id
= get_item_id(q
->second
);
632 ldout(cct
, 5) << "check_item_loc requested " << q
->second
<< " for type " << p
->second
633 << " is a device, not bucket" << dendl
;
637 assert(bucket_exists(id
));
638 crush_bucket
*b
= get_bucket(id
);
640 // see if item exists in this bucket
641 for (unsigned j
=0; j
<b
->size
; j
++) {
642 if (b
->items
[j
] == item
) {
643 ldout(cct
, 2) << "check_item_loc " << item
<< " exists in bucket " << b
->id
<< dendl
;
645 *weight
= crush_get_bucket_item_weight(b
, j
);
652 ldout(cct
, 1) << "check_item_loc item " << item
<< " loc " << loc
<< dendl
;
656 map
<string
, string
> CrushWrapper::get_full_location(int id
)
658 vector
<pair
<string
, string
> > full_location_ordered
;
659 map
<string
,string
> full_location
;
661 get_full_location_ordered(id
, full_location_ordered
);
663 std::copy(full_location_ordered
.begin(),
664 full_location_ordered
.end(),
665 std::inserter(full_location
, full_location
.begin()));
667 return full_location
;
670 int CrushWrapper::get_full_location_ordered(int id
, vector
<pair
<string
, string
> >& path
)
672 if (!item_exists(id
))
677 pair
<string
, string
> parent_coord
= get_immediate_parent(cur
, &ret
);
680 path
.push_back(parent_coord
);
681 cur
= get_item_id(parent_coord
.second
);
686 string
CrushWrapper::get_full_location_ordered_string(int id
)
688 vector
<pair
<string
, string
> > full_location_ordered
;
689 string full_location
;
690 get_full_location_ordered(id
, full_location_ordered
);
691 reverse(begin(full_location_ordered
), end(full_location_ordered
));
692 for(auto i
= full_location_ordered
.begin(); i
!= full_location_ordered
.end(); i
++) {
693 full_location
= full_location
+ i
->first
+ "=" + i
->second
;
694 if (i
!= full_location_ordered
.end() - 1) {
695 full_location
= full_location
+ ",";
698 return full_location
;
701 map
<int, string
> CrushWrapper::get_parent_hierarchy(int id
)
703 map
<int,string
> parent_hierarchy
;
704 pair
<string
, string
> parent_coord
= get_immediate_parent(id
);
707 // get the integer type for id and create a counter from there
708 int type_counter
= get_bucket_type(id
);
710 // if we get a negative type then we can assume that we have an OSD
711 // change behavior in get_item_type FIXME
712 if (type_counter
< 0)
715 // read the type map and get the name of the type with the largest ID
717 for (map
<int, string
>::iterator it
= type_map
.begin(); it
!= type_map
.end(); ++it
){
718 if ( (*it
).first
> high_type
)
719 high_type
= (*it
).first
;
722 parent_id
= get_item_id(parent_coord
.second
);
724 while (type_counter
< high_type
) {
726 parent_hierarchy
[ type_counter
] = parent_coord
.first
;
728 if (type_counter
< high_type
){
729 // get the coordinate information for the next parent
730 parent_coord
= get_immediate_parent(parent_id
);
731 parent_id
= get_item_id(parent_coord
.second
);
735 return parent_hierarchy
;
738 int CrushWrapper::get_children(int id
, list
<int> *children
)
745 crush_bucket
*b
= get_bucket(id
);
750 for (unsigned n
=0; n
<b
->size
; n
++) {
751 children
->push_back(b
->items
[n
]);
756 int CrushWrapper::_get_leaves(int id
, list
<int> *leaves
)
762 leaves
->push_back(id
);
766 crush_bucket
*b
= get_bucket(id
);
771 for (unsigned n
= 0; n
< b
->size
; n
++) {
772 if (b
->items
[n
] >= 0) {
773 leaves
->push_back(b
->items
[n
]);
775 // is a bucket, do recursive call
776 int r
= _get_leaves(b
->items
[n
], leaves
);
783 return 0; // all is well
786 int CrushWrapper::get_leaves(const string
&name
, set
<int> *leaves
)
791 if (!name_exists(name
)) {
795 int id
= get_item_id(name
);
803 int r
= _get_leaves(id
, &unordered
);
808 for (auto &p
: unordered
) {
815 int CrushWrapper::insert_item(CephContext
*cct
, int item
, float weight
, string name
,
816 const map
<string
,string
>& loc
) // typename -> bucketname
818 ldout(cct
, 5) << "insert_item item " << item
<< " weight " << weight
819 << " name " << name
<< " loc " << loc
<< dendl
;
821 if (!is_valid_crush_name(name
))
824 if (!is_valid_crush_loc(cct
, loc
))
827 int r
= validate_weightf(weight
);
832 if (name_exists(name
)) {
833 if (get_item_id(name
) != item
) {
834 ldout(cct
, 10) << "device name '" << name
<< "' already exists as id "
835 << get_item_id(name
) << dendl
;
839 set_item_name(item
, name
);
844 // create locations if locations don't exist and add child in location with 0 weight
845 // the more detail in the insert_item method declaration in CrushWrapper.h
846 for (map
<int,string
>::iterator p
= type_map
.begin(); p
!= type_map
.end(); ++p
) {
847 // ignore device type
851 // skip types that are unspecified
852 map
<string
,string
>::const_iterator q
= loc
.find(p
->second
);
853 if (q
== loc
.end()) {
854 ldout(cct
, 2) << "warning: did not specify location for '" << p
->second
<< "' level (levels are "
855 << type_map
<< ")" << dendl
;
859 if (!name_exists(q
->second
)) {
860 ldout(cct
, 5) << "insert_item creating bucket " << q
->second
<< dendl
;
861 int empty
= 0, newid
;
862 int r
= add_bucket(0, 0,
863 CRUSH_HASH_DEFAULT
, p
->first
, 1, &cur
, &empty
, &newid
);
865 ldout(cct
, 1) << "add_bucket failure error: " << cpp_strerror(r
) << dendl
;
868 set_item_name(newid
, q
->second
);
874 // add to an existing bucket
875 int id
= get_item_id(q
->second
);
876 if (!bucket_exists(id
)) {
877 ldout(cct
, 1) << "insert_item doesn't have bucket " << id
<< dendl
;
881 // check that we aren't creating a cycle.
882 if (subtree_contains(id
, cur
)) {
883 ldout(cct
, 1) << "insert_item item " << cur
<< " already exists beneath " << id
<< dendl
;
887 // we have done sanity check above
888 crush_bucket
*b
= get_bucket(id
);
890 if (p
->first
!= b
->type
) {
891 ldout(cct
, 1) << "insert_item existing bucket has type "
892 << "'" << type_map
[b
->type
] << "' != "
893 << "'" << type_map
[p
->first
] << "'" << dendl
;
897 // are we forming a loop?
898 if (subtree_contains(cur
, b
->id
)) {
899 ldout(cct
, 1) << "insert_item " << cur
<< " already contains " << b
->id
900 << "; cannot form loop" << dendl
;
904 ldout(cct
, 5) << "insert_item adding " << cur
<< " weight " << weight
905 << " to bucket " << id
<< dendl
;
906 int r
= bucket_add_item(b
, cur
, 0);
911 // adjust the item's weight in location
912 if(adjust_item_weightf_in_loc(cct
, item
, weight
, loc
) > 0) {
913 if (item
>= crush
->max_devices
) {
914 crush
->max_devices
= item
+ 1;
915 ldout(cct
, 5) << "insert_item max_devices now " << crush
->max_devices
<< dendl
;
920 ldout(cct
, 1) << "error: didn't find anywhere to add item " << item
<< " in " << loc
<< dendl
;
924 int CrushWrapper::move_bucket(CephContext
*cct
, int id
, const map
<string
,string
>& loc
)
926 // sorry this only works for buckets
930 if (!item_exists(id
))
933 // get the name of the bucket we are trying to move for later
934 string id_name
= get_item_name(id
);
937 int bucket_weight
= detach_bucket(cct
, id
);
939 // insert the bucket back into the hierarchy
940 return insert_item(cct
, id
, bucket_weight
/ (float)0x10000, id_name
, loc
);
943 int CrushWrapper::swap_bucket(CephContext
*cct
, int src
, int dst
)
945 if (src
>= 0 || dst
>= 0)
947 if (!item_exists(src
) || !item_exists(dst
))
949 crush_bucket
*a
= get_bucket(src
);
950 crush_bucket
*b
= get_bucket(dst
);
951 unsigned aw
= a
->weight
;
952 unsigned bw
= b
->weight
;
955 adjust_item_weight(cct
, a
->id
, bw
);
956 adjust_item_weight(cct
, b
->id
, aw
);
959 map
<int,unsigned> tmp
;
960 unsigned as
= a
->size
;
961 unsigned bs
= b
->size
;
962 for (unsigned i
= 0; i
< as
; ++i
) {
963 int item
= a
->items
[0];
964 int itemw
= crush_get_bucket_item_weight(a
, 0);
966 bucket_remove_item(a
, item
);
968 assert(a
->size
== 0);
969 assert(b
->size
== bs
);
970 for (unsigned i
= 0; i
< bs
; ++i
) {
971 int item
= b
->items
[0];
972 int itemw
= crush_get_bucket_item_weight(b
, 0);
973 bucket_remove_item(b
, item
);
974 bucket_add_item(a
, item
, itemw
);
976 assert(a
->size
== bs
);
977 assert(b
->size
== 0);
979 bucket_add_item(b
, t
.first
, t
.second
);
981 assert(a
->size
== bs
);
982 assert(b
->size
== as
);
985 swap_names(src
, dst
);
989 int CrushWrapper::link_bucket(CephContext
*cct
, int id
, const map
<string
,string
>& loc
)
991 // sorry this only works for buckets
995 if (!item_exists(id
))
998 // get the name of the bucket we are trying to move for later
999 string id_name
= get_item_name(id
);
1001 crush_bucket
*b
= get_bucket(id
);
1002 unsigned bucket_weight
= b
->weight
;
1004 return insert_item(cct
, id
, bucket_weight
/ (float)0x10000, id_name
, loc
);
1007 int CrushWrapper::create_or_move_item(CephContext
*cct
, int item
, float weight
, string name
,
1008 const map
<string
,string
>& loc
) // typename -> bucketname
1013 if (!is_valid_crush_name(name
))
1016 if (check_item_loc(cct
, item
, loc
, &old_iweight
)) {
1017 ldout(cct
, 5) << "create_or_move_item " << item
<< " already at " << loc
<< dendl
;
1019 if (_search_item_exists(item
)) {
1020 weight
= get_item_weightf(item
);
1021 ldout(cct
, 10) << "create_or_move_item " << item
<< " exists with weight " << weight
<< dendl
;
1022 remove_item(cct
, item
, true);
1024 ldout(cct
, 5) << "create_or_move_item adding " << item
<< " weight " << weight
1025 << " at " << loc
<< dendl
;
1026 ret
= insert_item(cct
, item
, weight
, name
, loc
);
1033 int CrushWrapper::update_item(CephContext
*cct
, int item
, float weight
, string name
,
1034 const map
<string
,string
>& loc
) // typename -> bucketname
1036 ldout(cct
, 5) << "update_item item " << item
<< " weight " << weight
1037 << " name " << name
<< " loc " << loc
<< dendl
;
1040 if (!is_valid_crush_name(name
))
1043 if (!is_valid_crush_loc(cct
, loc
))
1046 ret
= validate_weightf(weight
);
1051 // compare quantized (fixed-point integer) weights!
1052 int iweight
= (int)(weight
* (float)0x10000);
1054 if (check_item_loc(cct
, item
, loc
, &old_iweight
)) {
1055 ldout(cct
, 5) << "update_item " << item
<< " already at " << loc
<< dendl
;
1056 if (old_iweight
!= iweight
) {
1057 ldout(cct
, 5) << "update_item " << item
<< " adjusting weight "
1058 << ((float)old_iweight
/(float)0x10000) << " -> " << weight
<< dendl
;
1059 adjust_item_weight_in_loc(cct
, item
, iweight
, loc
);
1062 if (get_item_name(item
) != name
) {
1063 ldout(cct
, 5) << "update_item setting " << item
<< " name to " << name
<< dendl
;
1064 set_item_name(item
, name
);
1068 if (item_exists(item
)) {
1069 remove_item(cct
, item
, true);
1071 ldout(cct
, 5) << "update_item adding " << item
<< " weight " << weight
1072 << " at " << loc
<< dendl
;
1073 ret
= insert_item(cct
, item
, weight
, name
, loc
);
1080 int CrushWrapper::get_item_weight(int id
) const
1082 for (int bidx
= 0; bidx
< crush
->max_buckets
; bidx
++) {
1083 crush_bucket
*b
= crush
->buckets
[bidx
];
1088 for (unsigned i
= 0; i
< b
->size
; i
++)
1089 if (b
->items
[i
] == id
)
1090 return crush_get_bucket_item_weight(b
, i
);
1095 int CrushWrapper::get_item_weight_in_loc(int id
, const map
<string
,string
> &loc
)
1097 for (map
<string
,string
>::const_iterator l
= loc
.begin(); l
!= loc
.end(); ++l
) {
1099 int bid
= get_item_id(l
->second
);
1100 if (!bucket_exists(bid
))
1102 crush_bucket
*b
= get_bucket(bid
);
1103 for (unsigned int i
= 0; i
< b
->size
; i
++) {
1104 if (b
->items
[i
] == id
) {
1105 return crush_get_bucket_item_weight(b
, i
);
1112 int CrushWrapper::adjust_item_weight(CephContext
*cct
, int id
, int weight
)
1114 ldout(cct
, 5) << "adjust_item_weight " << id
<< " weight " << weight
<< dendl
;
1116 for (int bidx
= 0; bidx
< crush
->max_buckets
; bidx
++) {
1117 crush_bucket
*b
= crush
->buckets
[bidx
];
1120 for (unsigned i
= 0; i
< b
->size
; i
++) {
1121 if (b
->items
[i
] == id
) {
1122 int diff
= bucket_adjust_item_weight(cct
, b
, id
, weight
);
1123 ldout(cct
, 5) << "adjust_item_weight " << id
<< " diff " << diff
<< " in bucket " << bidx
<< dendl
;
1124 adjust_item_weight(cct
, -1 - bidx
, b
->weight
);
1134 int CrushWrapper::adjust_item_weight_in_loc(CephContext
*cct
, int id
, int weight
, const map
<string
,string
>& loc
)
1136 ldout(cct
, 5) << "adjust_item_weight_in_loc " << id
<< " weight " << weight
<< " in " << loc
<< dendl
;
1139 for (map
<string
,string
>::const_iterator l
= loc
.begin(); l
!= loc
.end(); ++l
) {
1140 int bid
= get_item_id(l
->second
);
1141 if (!bucket_exists(bid
))
1143 crush_bucket
*b
= get_bucket(bid
);
1144 for (unsigned int i
= 0; i
< b
->size
; i
++) {
1145 if (b
->items
[i
] == id
) {
1146 int diff
= bucket_adjust_item_weight(cct
, b
, id
, weight
);
1147 ldout(cct
, 5) << "adjust_item_weight_in_loc " << id
<< " diff " << diff
<< " in bucket " << bid
<< dendl
;
1148 adjust_item_weight(cct
, bid
, b
->weight
);
1158 int CrushWrapper::adjust_subtree_weight(CephContext
*cct
, int id
, int weight
)
1160 ldout(cct
, 5) << __func__
<< " " << id
<< " weight " << weight
<< dendl
;
1161 crush_bucket
*b
= get_bucket(id
);
1165 list
<crush_bucket
*> q
;
1167 while (!q
.empty()) {
1170 int local_changed
= 0;
1171 for (unsigned i
=0; i
<b
->size
; ++i
) {
1172 int n
= b
->items
[i
];
1174 bucket_adjust_item_weight(cct
, b
, n
, weight
);
1178 crush_bucket
*sub
= get_bucket(n
);
1184 if (local_changed
) {
1185 adjust_item_weight(cct
, b
->id
, b
->weight
);
1191 bool CrushWrapper::check_item_present(int id
) const
1195 for (int bidx
= 0; bidx
< crush
->max_buckets
; bidx
++) {
1196 crush_bucket
*b
= crush
->buckets
[bidx
];
1199 for (unsigned i
= 0; i
< b
->size
; i
++)
1200 if (b
->items
[i
] == id
)
1207 pair
<string
,string
> CrushWrapper::get_immediate_parent(int id
, int *_ret
)
1210 for (int bidx
= 0; bidx
< crush
->max_buckets
; bidx
++) {
1211 crush_bucket
*b
= crush
->buckets
[bidx
];
1214 const char *n
= get_item_name(b
->id
);
1215 if (n
&& !is_valid_crush_name(n
))
1217 for (unsigned i
= 0; i
< b
->size
; i
++)
1218 if (b
->items
[i
] == id
) {
1219 string parent_id
= name_map
[b
->id
];
1220 string parent_bucket_type
= type_map
[b
->type
];
1223 return make_pair(parent_bucket_type
, parent_id
);
1230 return pair
<string
, string
>();
1233 int CrushWrapper::get_immediate_parent_id(int id
, int *parent
) const
1235 for (int bidx
= 0; bidx
< crush
->max_buckets
; bidx
++) {
1236 crush_bucket
*b
= crush
->buckets
[bidx
];
1239 const char *n
= get_item_name(b
->id
);
1240 if (n
&& !is_valid_crush_name(n
))
1242 for (unsigned i
= 0; i
< b
->size
; i
++) {
1243 if (b
->items
[i
] == id
) {
1252 int CrushWrapper::get_parent_of_type(int item
, int type
) const
1255 int r
= get_immediate_parent_id(item
, &item
);
1259 } while (get_bucket_type(item
) != type
);
1263 bool CrushWrapper::class_is_in_use(int class_id
)
1265 for (auto &i
: class_bucket
)
1266 for (auto &j
: i
.second
)
1267 if (j
.first
== class_id
)
1270 for (auto &i
: class_map
)
1271 if (i
.second
== class_id
)
1277 int CrushWrapper::populate_classes()
1281 for (auto &r
: roots
) {
1284 if (id_has_class(r
))
1286 for (auto &c
: class_name
) {
1288 int res
= device_class_clone(r
, c
.first
, &clone
);
1296 int CrushWrapper::cleanup_classes()
1298 return trim_roots_with_class(true);
1301 int CrushWrapper::trim_roots_with_class(bool unused
)
1305 for (auto &r
: roots
) {
1308 if (!id_has_class(r
))
1310 int res
= remove_root(r
, unused
);
1314 // there is no need to reweight because we only remove from the
1319 int32_t CrushWrapper::_alloc_class_id() const {
1320 if (class_name
.empty()) {
1323 int32_t class_id
= class_name
.rbegin()->first
+ 1;
1324 if (class_id
>= 0) {
1327 // wrapped, pick a random start and do exhaustive search
1328 uint32_t upperlimit
= numeric_limits
<int32_t>::max();
1330 class_id
= rand() % upperlimit
;
1331 const auto start
= class_id
;
1333 if (!class_name
.count(class_id
)) {
1341 } while (class_id
!= start
);
1342 assert(0 == "no available class id");
1345 void CrushWrapper::reweight(CephContext
*cct
)
1349 for (set
<int>::iterator p
= roots
.begin(); p
!= roots
.end(); ++p
) {
1352 crush_bucket
*b
= get_bucket(*p
);
1353 ldout(cct
, 5) << "reweight bucket " << *p
<< dendl
;
1354 int r
= crush_reweight_bucket(crush
, b
);
1359 int CrushWrapper::add_simple_rule_at(
1360 string name
, string root_name
,
1361 string failure_domain_name
,
1362 string device_class
,
1363 string mode
, int rule_type
,
1367 if (rule_exists(name
)) {
1369 *err
<< "rule " << name
<< " exists";
1373 if (rule_exists(rno
)) {
1375 *err
<< "rule with ruleno " << rno
<< " exists";
1378 if (ruleset_exists(rno
)) {
1380 *err
<< "ruleset " << rno
<< " exists";
1384 for (rno
= 0; rno
< get_max_rules(); rno
++) {
1385 if (!rule_exists(rno
) && !ruleset_exists(rno
))
1389 if (!name_exists(root_name
)) {
1391 *err
<< "root item " << root_name
<< " does not exist";
1394 int root
= get_item_id(root_name
);
1396 if (failure_domain_name
.length()) {
1397 type
= get_type_id(failure_domain_name
);
1400 *err
<< "unknown type " << failure_domain_name
;
1404 if (device_class
.size()) {
1405 if (!class_exists(device_class
)) {
1407 *err
<< "device class " << device_class
<< " does not exist";
1410 int c
= get_class_id(device_class
);
1411 if (class_bucket
.count(root
) == 0 ||
1412 class_bucket
[root
].count(c
) == 0) {
1414 *err
<< "root " << root_name
<< " has no devices with class "
1418 root
= class_bucket
[root
][c
];
1420 if (mode
!= "firstn" && mode
!= "indep") {
1422 *err
<< "unknown mode " << mode
;
1427 if (mode
== "indep")
1429 int min_rep
= mode
== "firstn" ? 1 : 3;
1430 int max_rep
= mode
== "firstn" ? 10 : 20;
1431 //set the ruleset the same as rule_id(rno)
1432 crush_rule
*rule
= crush_make_rule(steps
, rno
, rule_type
, min_rep
, max_rep
);
1435 if (mode
== "indep") {
1436 crush_rule_set_step(rule
, step
++, CRUSH_RULE_SET_CHOOSELEAF_TRIES
, 5, 0);
1437 crush_rule_set_step(rule
, step
++, CRUSH_RULE_SET_CHOOSE_TRIES
, 100, 0);
1439 crush_rule_set_step(rule
, step
++, CRUSH_RULE_TAKE
, root
, 0);
1441 crush_rule_set_step(rule
, step
++,
1442 mode
== "firstn" ? CRUSH_RULE_CHOOSELEAF_FIRSTN
:
1443 CRUSH_RULE_CHOOSELEAF_INDEP
,
1447 crush_rule_set_step(rule
, step
++,
1448 mode
== "firstn" ? CRUSH_RULE_CHOOSE_FIRSTN
:
1449 CRUSH_RULE_CHOOSE_INDEP
,
1452 crush_rule_set_step(rule
, step
++, CRUSH_RULE_EMIT
, 0, 0);
1454 int ret
= crush_add_rule(crush
, rule
, rno
);
1456 *err
<< "failed to add rule " << rno
<< " because " << cpp_strerror(ret
);
1459 set_rule_name(rno
, name
);
1464 int CrushWrapper::add_simple_rule(
1465 string name
, string root_name
,
1466 string failure_domain_name
,
1467 string device_class
,
1468 string mode
, int rule_type
,
1471 return add_simple_rule_at(name
, root_name
, failure_domain_name
, device_class
,
1473 rule_type
, -1, err
);
1476 int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno
, map
<int,float> *pmap
)
1478 if (ruleno
>= crush
->max_rules
)
1480 if (crush
->rules
[ruleno
] == NULL
)
1482 crush_rule
*rule
= crush
->rules
[ruleno
];
1484 // build a weight map for each TAKE in the rule, and then merge them
1486 // FIXME: if there are multiple takes that place a different number of
1487 // objects we do not take that into account. (Also, note that doing this
1488 // right is also a function of the pool, since the crush rule
1489 // might choose 2 + choose 2 but pool size may only be 3.)
1490 for (unsigned i
=0; i
<rule
->len
; ++i
) {
1493 if (rule
->steps
[i
].op
== CRUSH_RULE_TAKE
) {
1494 int n
= rule
->steps
[i
].arg1
;
1501 //breadth first iterate the OSD tree
1502 while (!q
.empty()) {
1503 int bno
= q
.front();
1505 crush_bucket
*b
= crush
->buckets
[-1-bno
];
1507 for (unsigned j
=0; j
<b
->size
; ++j
) {
1508 int item_id
= b
->items
[j
];
1509 if (item_id
>= 0) { //it's an OSD
1510 float w
= crush_get_bucket_item_weight(b
, j
);
1513 } else { //not an OSD, expand the child later
1514 q
.push_back(item_id
);
1520 for (map
<int,float>::iterator p
= m
.begin(); p
!= m
.end(); ++p
) {
1521 map
<int,float>::iterator q
= pmap
->find(p
->first
);
1522 if (q
== pmap
->end()) {
1523 (*pmap
)[p
->first
] = p
->second
/ sum
;
1525 q
->second
+= p
->second
/ sum
;
1533 int CrushWrapper::remove_rule(int ruleno
)
1535 if (ruleno
>= (int)crush
->max_rules
)
1537 if (crush
->rules
[ruleno
] == NULL
)
1539 crush_destroy_rule(crush
->rules
[ruleno
]);
1540 crush
->rules
[ruleno
] = NULL
;
1541 rule_name_map
.erase(ruleno
);
1546 int CrushWrapper::bucket_adjust_item_weight(CephContext
*cct
, crush_bucket
*bucket
, int item
, int weight
)
1548 if (cct
->_conf
->osd_crush_update_weight_set
) {
1550 for (position
= 0; position
< bucket
->size
; position
++)
1551 if (bucket
->items
[position
] == item
)
1553 assert(position
!= bucket
->size
);
1554 for (auto w
: choose_args
) {
1555 crush_choose_arg_map arg_map
= w
.second
;
1556 crush_choose_arg
*arg
= &arg_map
.args
[-1-bucket
->id
];
1557 for (__u32 j
= 0; j
< arg
->weight_set_size
; j
++) {
1558 crush_weight_set
*weight_set
= &arg
->weight_set
[j
];
1559 weight_set
->weights
[position
] = weight
;
1563 return crush_bucket_adjust_item_weight(crush
, bucket
, item
, weight
);
1566 int CrushWrapper::bucket_add_item(crush_bucket
*bucket
, int item
, int weight
)
1568 __u32 new_size
= bucket
->size
+ 1;
1569 for (auto w
: choose_args
) {
1570 crush_choose_arg_map arg_map
= w
.second
;
1571 crush_choose_arg
*arg
= &arg_map
.args
[-1-bucket
->id
];
1572 for (__u32 j
= 0; j
< arg
->weight_set_size
; j
++) {
1573 crush_weight_set
*weight_set
= &arg
->weight_set
[j
];
1574 weight_set
->weights
= (__u32
*)realloc(weight_set
->weights
, new_size
* sizeof(__u32
));
1575 assert(weight_set
->size
+ 1 == new_size
);
1576 weight_set
->weights
[weight_set
->size
] = weight
;
1577 weight_set
->size
= new_size
;
1579 if (arg
->ids_size
) {
1580 arg
->ids
= (__s32
*)realloc(arg
->ids
, new_size
* sizeof(__s32
));
1581 assert(arg
->ids_size
+ 1 == new_size
);
1582 arg
->ids
[arg
->ids_size
] = item
;
1583 arg
->ids_size
= new_size
;
1586 return crush_bucket_add_item(crush
, bucket
, item
, weight
);
1589 int CrushWrapper::bucket_remove_item(crush_bucket
*bucket
, int item
)
1591 __u32 new_size
= bucket
->size
- 1;
1593 for (position
= 0; position
< bucket
->size
; position
++)
1594 if (bucket
->items
[position
] == item
)
1596 assert(position
!= bucket
->size
);
1597 for (auto w
: choose_args
) {
1598 crush_choose_arg_map arg_map
= w
.second
;
1599 crush_choose_arg
*arg
= &arg_map
.args
[-1-bucket
->id
];
1600 for (__u32 j
= 0; j
< arg
->weight_set_size
; j
++) {
1601 crush_weight_set
*weight_set
= &arg
->weight_set
[j
];
1602 assert(weight_set
->size
- 1 == new_size
);
1603 for (__u32 k
= position
; k
< new_size
; k
++)
1604 weight_set
->weights
[k
] = weight_set
->weights
[k
+1];
1605 weight_set
->weights
= (__u32
*)realloc(weight_set
->weights
, new_size
* sizeof(__u32
));
1606 weight_set
->size
= new_size
;
1608 if (arg
->ids_size
) {
1609 assert(arg
->ids_size
- 1 == new_size
);
1610 for (__u32 k
= position
; k
< new_size
; k
++)
1611 arg
->ids
[k
] = arg
->ids
[k
+1];
1612 arg
->ids
= (__s32
*)realloc(arg
->ids
, new_size
* sizeof(__s32
));
1613 arg
->ids_size
= new_size
;
1616 return crush_bucket_remove_item(crush
, bucket
, item
);
1619 int CrushWrapper::update_device_class(int id
,
1620 const string
& class_name
,
1624 int class_id
= get_or_create_class_id(class_name
);
1626 *ss
<< name
<< " id " << id
<< " is negative";
1629 assert(item_exists(id
));
1631 if (class_map
.count(id
) != 0 && class_map
[id
] == class_id
) {
1632 *ss
<< name
<< " already set to class " << class_name
;
1636 set_item_class(id
, class_id
);
1638 int r
= rebuild_roots_with_classes();
1644 int CrushWrapper::device_class_clone(int original_id
, int device_class
, int *clone
)
1646 const char *item_name
= get_item_name(original_id
);
1647 if (item_name
== NULL
)
1649 const char *class_name
= get_class_name(device_class
);
1650 if (class_name
== NULL
)
1652 string copy_name
= item_name
+ string("~") + class_name
;
1653 if (name_exists(copy_name
)) {
1654 *clone
= get_item_id(copy_name
);
1657 crush_bucket
*original
= get_bucket(original_id
);
1658 if (IS_ERR(original
))
1660 crush_bucket
*copy
= crush_make_bucket(crush
,
1667 for (unsigned i
= 0; i
< original
->size
; i
++) {
1668 int item
= original
->items
[i
];
1669 int weight
= crush_get_bucket_item_weight(original
, i
);
1671 if (class_map
.count(item
) != 0 && class_map
[item
] == device_class
) {
1672 int res
= bucket_add_item(copy
, item
, weight
);
1678 int res
= device_class_clone(item
, device_class
, &child_copy_id
);
1681 crush_bucket
*child_copy
= get_bucket(child_copy_id
);
1682 if (IS_ERR(child_copy
))
1684 res
= bucket_add_item(copy
, child_copy_id
, child_copy
->weight
);
1689 int res
= crush_add_bucket(crush
, 0, copy
, clone
);
1692 res
= set_item_class(*clone
, device_class
);
1695 // we do not use set_item_name because the name is intentionally invalid
1696 name_map
[*clone
] = copy_name
;
1698 name_rmap
[copy_name
] = *clone
;
1699 class_bucket
[original_id
][device_class
] = *clone
;
1703 int CrushWrapper::rebuild_roots_with_classes()
1705 int r
= trim_roots_with_class(false);
1708 r
= populate_classes();
1711 return trim_roots_with_class(true);
1714 void CrushWrapper::encode(bufferlist
& bl
, uint64_t features
) const
1718 __u32 magic
= CRUSH_MAGIC
;
1719 ::encode(magic
, bl
);
1721 ::encode(crush
->max_buckets
, bl
);
1722 ::encode(crush
->max_rules
, bl
);
1723 ::encode(crush
->max_devices
, bl
);
1725 bool encode_compat_choose_args
= false;
1726 crush_choose_arg_map arg_map
;
1727 memset(&arg_map
, '\0', sizeof(arg_map
));
1728 if (has_choose_args() &&
1729 !HAVE_FEATURE(features
, CRUSH_CHOOSE_ARGS
)) {
1730 assert(!has_incompat_choose_args());
1731 encode_compat_choose_args
= true;
1732 arg_map
= choose_args
.begin()->second
;
1736 for (int i
=0; i
<crush
->max_buckets
; i
++) {
1738 if (crush
->buckets
[i
]) alg
= crush
->buckets
[i
]->alg
;
1743 ::encode(crush
->buckets
[i
]->id
, bl
);
1744 ::encode(crush
->buckets
[i
]->type
, bl
);
1745 ::encode(crush
->buckets
[i
]->alg
, bl
);
1746 ::encode(crush
->buckets
[i
]->hash
, bl
);
1747 ::encode(crush
->buckets
[i
]->weight
, bl
);
1748 ::encode(crush
->buckets
[i
]->size
, bl
);
1749 for (unsigned j
=0; j
<crush
->buckets
[i
]->size
; j
++)
1750 ::encode(crush
->buckets
[i
]->items
[j
], bl
);
1752 switch (crush
->buckets
[i
]->alg
) {
1753 case CRUSH_BUCKET_UNIFORM
:
1754 ::encode((reinterpret_cast<crush_bucket_uniform
*>(crush
->buckets
[i
]))->item_weight
, bl
);
1757 case CRUSH_BUCKET_LIST
:
1758 for (unsigned j
=0; j
<crush
->buckets
[i
]->size
; j
++) {
1759 ::encode((reinterpret_cast<crush_bucket_list
*>(crush
->buckets
[i
]))->item_weights
[j
], bl
);
1760 ::encode((reinterpret_cast<crush_bucket_list
*>(crush
->buckets
[i
]))->sum_weights
[j
], bl
);
1764 case CRUSH_BUCKET_TREE
:
1765 ::encode((reinterpret_cast<crush_bucket_tree
*>(crush
->buckets
[i
]))->num_nodes
, bl
);
1766 for (unsigned j
=0; j
<(reinterpret_cast<crush_bucket_tree
*>(crush
->buckets
[i
]))->num_nodes
; j
++)
1767 ::encode((reinterpret_cast<crush_bucket_tree
*>(crush
->buckets
[i
]))->node_weights
[j
], bl
);
1770 case CRUSH_BUCKET_STRAW
:
1771 for (unsigned j
=0; j
<crush
->buckets
[i
]->size
; j
++) {
1772 ::encode((reinterpret_cast<crush_bucket_straw
*>(crush
->buckets
[i
]))->item_weights
[j
], bl
);
1773 ::encode((reinterpret_cast<crush_bucket_straw
*>(crush
->buckets
[i
]))->straws
[j
], bl
);
1777 case CRUSH_BUCKET_STRAW2
:
1780 if (encode_compat_choose_args
&&
1781 arg_map
.args
[i
].weight_set_size
> 0) {
1782 weights
= arg_map
.args
[i
].weight_set
[0].weights
;
1784 weights
= (reinterpret_cast<crush_bucket_straw2
*>(crush
->buckets
[i
]))->item_weights
;
1786 for (unsigned j
=0; j
<crush
->buckets
[i
]->size
; j
++) {
1787 ::encode(weights
[j
], bl
);
1799 for (unsigned i
=0; i
<crush
->max_rules
; i
++) {
1800 __u32 yes
= crush
->rules
[i
] ? 1:0;
1805 ::encode(crush
->rules
[i
]->len
, bl
);
1806 ::encode(crush
->rules
[i
]->mask
, bl
);
1807 for (unsigned j
=0; j
<crush
->rules
[i
]->len
; j
++)
1808 ::encode(crush
->rules
[i
]->steps
[j
], bl
);
1812 ::encode(type_map
, bl
);
1813 ::encode(name_map
, bl
);
1814 ::encode(rule_name_map
, bl
);
1817 ::encode(crush
->choose_local_tries
, bl
);
1818 ::encode(crush
->choose_local_fallback_tries
, bl
);
1819 ::encode(crush
->choose_total_tries
, bl
);
1820 ::encode(crush
->chooseleaf_descend_once
, bl
);
1821 ::encode(crush
->chooseleaf_vary_r
, bl
);
1822 ::encode(crush
->straw_calc_version
, bl
);
1823 ::encode(crush
->allowed_bucket_algs
, bl
);
1824 if (features
& CEPH_FEATURE_CRUSH_TUNABLES5
) {
1825 ::encode(crush
->chooseleaf_stable
, bl
);
1828 if (HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
1830 ::encode(class_map
, bl
);
1831 ::encode(class_name
, bl
);
1832 ::encode(class_bucket
, bl
);
1834 __u32 size
= (__u32
)choose_args
.size();
1836 for (auto c
: choose_args
) {
1837 ::encode(c
.first
, bl
);
1838 crush_choose_arg_map arg_map
= c
.second
;
1840 for (__u32 i
= 0; i
< arg_map
.size
; i
++) {
1841 crush_choose_arg
*arg
= &arg_map
.args
[i
];
1842 if (arg
->weight_set_size
== 0 &&
1848 for (__u32 i
= 0; i
< arg_map
.size
; i
++) {
1849 crush_choose_arg
*arg
= &arg_map
.args
[i
];
1850 if (arg
->weight_set_size
== 0 &&
1854 ::encode(arg
->weight_set_size
, bl
);
1855 for (__u32 j
= 0; j
< arg
->weight_set_size
; j
++) {
1856 crush_weight_set
*weight_set
= &arg
->weight_set
[j
];
1857 ::encode(weight_set
->size
, bl
);
1858 for (__u32 k
= 0; k
< weight_set
->size
; k
++)
1859 ::encode(weight_set
->weights
[k
], bl
);
1861 ::encode(arg
->ids_size
, bl
);
1862 for (__u32 j
= 0; j
< arg
->ids_size
; j
++)
1863 ::encode(arg
->ids
[j
], bl
);
1869 static void decode_32_or_64_string_map(map
<int32_t,string
>& m
, bufferlist::iterator
& blp
)
1879 ::decode(strlen
, blp
);
1881 // der, key was actually 64-bits!
1882 ::decode(strlen
, blp
);
1884 ::decode_nohead(strlen
, m
[key
], blp
);
1888 void CrushWrapper::decode(bufferlist::iterator
& blp
)
1893 ::decode(magic
, blp
);
1894 if (magic
!= CRUSH_MAGIC
)
1895 throw buffer::malformed_input("bad magic number");
1897 ::decode(crush
->max_buckets
, blp
);
1898 ::decode(crush
->max_rules
, blp
);
1899 ::decode(crush
->max_devices
, blp
);
1901 // legacy tunables, unless we decode something newer
1902 set_tunables_legacy();
1906 crush
->buckets
= (crush_bucket
**)calloc(1, crush
->max_buckets
* sizeof(crush_bucket
*));
1907 for (int i
=0; i
<crush
->max_buckets
; i
++) {
1908 decode_crush_bucket(&crush
->buckets
[i
], blp
);
1912 crush
->rules
= (crush_rule
**)calloc(1, crush
->max_rules
* sizeof(crush_rule
*));
1913 for (unsigned i
= 0; i
< crush
->max_rules
; ++i
) {
1917 crush
->rules
[i
] = NULL
;
1923 crush
->rules
[i
] = reinterpret_cast<crush_rule
*>(calloc(1, crush_rule_size(len
)));
1924 crush
->rules
[i
]->len
= len
;
1925 ::decode(crush
->rules
[i
]->mask
, blp
);
1926 for (unsigned j
=0; j
<crush
->rules
[i
]->len
; j
++)
1927 ::decode(crush
->rules
[i
]->steps
[j
], blp
);
1931 // NOTE: we had a bug where we were incoding int instead of int32, which means the
1932 // 'key' field for these maps may be either 32 or 64 bits, depending. tolerate
1933 // both by assuming the string is always non-empty.
1934 decode_32_or_64_string_map(type_map
, blp
);
1935 decode_32_or_64_string_map(name_map
, blp
);
1936 decode_32_or_64_string_map(rule_name_map
, blp
);
1940 ::decode(crush
->choose_local_tries
, blp
);
1941 ::decode(crush
->choose_local_fallback_tries
, blp
);
1942 ::decode(crush
->choose_total_tries
, blp
);
1945 ::decode(crush
->chooseleaf_descend_once
, blp
);
1948 ::decode(crush
->chooseleaf_vary_r
, blp
);
1951 ::decode(crush
->straw_calc_version
, blp
);
1954 ::decode(crush
->allowed_bucket_algs
, blp
);
1957 ::decode(crush
->chooseleaf_stable
, blp
);
1960 ::decode(class_map
, blp
);
1961 ::decode(class_name
, blp
);
1962 for (auto &c
: class_name
)
1963 class_rname
[c
.second
] = c
.first
;
1964 ::decode(class_bucket
, blp
);
1968 __u32 choose_args_size
;
1969 ::decode(choose_args_size
, blp
);
1970 for (__u32 i
= 0; i
< choose_args_size
; i
++) {
1971 uint64_t choose_args_index
;
1972 ::decode(choose_args_index
, blp
);
1973 crush_choose_arg_map arg_map
;
1974 arg_map
.size
= crush
->max_buckets
;
1975 arg_map
.args
= (crush_choose_arg
*)calloc(arg_map
.size
, sizeof(crush_choose_arg
));
1977 ::decode(size
, blp
);
1978 for (__u32 j
= 0; j
< size
; j
++) {
1980 ::decode(bucket_index
, blp
);
1981 assert(bucket_index
< arg_map
.size
);
1982 crush_choose_arg
*arg
= &arg_map
.args
[bucket_index
];
1983 ::decode(arg
->weight_set_size
, blp
);
1984 arg
->weight_set
= (crush_weight_set
*)calloc(arg
->weight_set_size
, sizeof(crush_weight_set
));
1985 for (__u32 k
= 0; k
< arg
->weight_set_size
; k
++) {
1986 crush_weight_set
*weight_set
= &arg
->weight_set
[k
];
1987 ::decode(weight_set
->size
, blp
);
1988 weight_set
->weights
= (__u32
*)calloc(weight_set
->size
, sizeof(__u32
));
1989 for (__u32 l
= 0; l
< weight_set
->size
; l
++)
1990 ::decode(weight_set
->weights
[l
], blp
);
1992 ::decode(arg
->ids_size
, blp
);
1993 arg
->ids
= (__s32
*)calloc(arg
->ids_size
, sizeof(__s32
));
1994 for (__u32 k
= 0; k
< arg
->ids_size
; k
++)
1995 ::decode(arg
->ids
[k
], blp
);
1997 choose_args
[choose_args_index
] = arg_map
;
2003 crush_destroy(crush
);
2008 void CrushWrapper::decode_crush_bucket(crush_bucket
** bptr
, bufferlist::iterator
&blp
)
2019 case CRUSH_BUCKET_UNIFORM
:
2020 size
= sizeof(crush_bucket_uniform
);
2022 case CRUSH_BUCKET_LIST
:
2023 size
= sizeof(crush_bucket_list
);
2025 case CRUSH_BUCKET_TREE
:
2026 size
= sizeof(crush_bucket_tree
);
2028 case CRUSH_BUCKET_STRAW
:
2029 size
= sizeof(crush_bucket_straw
);
2031 case CRUSH_BUCKET_STRAW2
:
2032 size
= sizeof(crush_bucket_straw2
);
2037 snprintf(str
, sizeof(str
), "unsupported bucket algorithm: %d", alg
);
2038 throw buffer::malformed_input(str
);
2041 crush_bucket
*bucket
= reinterpret_cast<crush_bucket
*>(calloc(1, size
));
2044 ::decode(bucket
->id
, blp
);
2045 ::decode(bucket
->type
, blp
);
2046 ::decode(bucket
->alg
, blp
);
2047 ::decode(bucket
->hash
, blp
);
2048 ::decode(bucket
->weight
, blp
);
2049 ::decode(bucket
->size
, blp
);
2051 bucket
->items
= (__s32
*)calloc(1, bucket
->size
* sizeof(__s32
));
2052 for (unsigned j
= 0; j
< bucket
->size
; ++j
) {
2053 ::decode(bucket
->items
[j
], blp
);
2056 switch (bucket
->alg
) {
2057 case CRUSH_BUCKET_UNIFORM
:
2058 ::decode((reinterpret_cast<crush_bucket_uniform
*>(bucket
))->item_weight
, blp
);
2061 case CRUSH_BUCKET_LIST
: {
2062 crush_bucket_list
* cbl
= reinterpret_cast<crush_bucket_list
*>(bucket
);
2063 cbl
->item_weights
= (__u32
*)calloc(1, bucket
->size
* sizeof(__u32
));
2064 cbl
->sum_weights
= (__u32
*)calloc(1, bucket
->size
* sizeof(__u32
));
2066 for (unsigned j
= 0; j
< bucket
->size
; ++j
) {
2067 ::decode(cbl
->item_weights
[j
], blp
);
2068 ::decode(cbl
->sum_weights
[j
], blp
);
2073 case CRUSH_BUCKET_TREE
: {
2074 crush_bucket_tree
* cbt
= reinterpret_cast<crush_bucket_tree
*>(bucket
);
2075 ::decode(cbt
->num_nodes
, blp
);
2076 cbt
->node_weights
= (__u32
*)calloc(1, cbt
->num_nodes
* sizeof(__u32
));
2077 for (unsigned j
=0; j
<cbt
->num_nodes
; j
++) {
2078 ::decode(cbt
->node_weights
[j
], blp
);
2083 case CRUSH_BUCKET_STRAW
: {
2084 crush_bucket_straw
* cbs
= reinterpret_cast<crush_bucket_straw
*>(bucket
);
2085 cbs
->straws
= (__u32
*)calloc(1, bucket
->size
* sizeof(__u32
));
2086 cbs
->item_weights
= (__u32
*)calloc(1, bucket
->size
* sizeof(__u32
));
2087 for (unsigned j
= 0; j
< bucket
->size
; ++j
) {
2088 ::decode(cbs
->item_weights
[j
], blp
);
2089 ::decode(cbs
->straws
[j
], blp
);
2094 case CRUSH_BUCKET_STRAW2
: {
2095 crush_bucket_straw2
* cbs
= reinterpret_cast<crush_bucket_straw2
*>(bucket
);
2096 cbs
->item_weights
= (__u32
*)calloc(1, bucket
->size
* sizeof(__u32
));
2097 for (unsigned j
= 0; j
< bucket
->size
; ++j
) {
2098 ::decode(cbs
->item_weights
[j
], blp
);
2104 // We should have handled this case in the first switch statement
2111 void CrushWrapper::dump(Formatter
*f
) const
2113 f
->open_array_section("devices");
2114 for (int i
=0; i
<get_max_devices(); i
++) {
2115 f
->open_object_section("device");
2116 f
->dump_int("id", i
);
2117 const char *n
= get_item_name(i
);
2119 f
->dump_string("name", n
);
2122 sprintf(name
, "device%d", i
);
2123 f
->dump_string("name", name
);
2125 const char *device_class
= get_item_class(i
);
2126 if (device_class
!= NULL
)
2127 f
->dump_string("class", device_class
);
2132 f
->open_array_section("types");
2133 int n
= get_num_type_names();
2134 for (int i
=0; n
; i
++) {
2135 const char *name
= get_type_name(i
);
2138 f
->open_object_section("type");
2139 f
->dump_int("type_id", 0);
2140 f
->dump_string("name", "device");
2146 f
->open_object_section("type");
2147 f
->dump_int("type_id", i
);
2148 f
->dump_string("name", name
);
2153 f
->open_array_section("buckets");
2154 for (int bucket
= -1; bucket
> -1-get_max_buckets(); --bucket
) {
2155 if (!bucket_exists(bucket
))
2157 f
->open_object_section("bucket");
2158 f
->dump_int("id", bucket
);
2159 if (get_item_name(bucket
))
2160 f
->dump_string("name", get_item_name(bucket
));
2161 f
->dump_int("type_id", get_bucket_type(bucket
));
2162 if (get_type_name(get_bucket_type(bucket
)))
2163 f
->dump_string("type_name", get_type_name(get_bucket_type(bucket
)));
2164 f
->dump_int("weight", get_bucket_weight(bucket
));
2165 f
->dump_string("alg", crush_bucket_alg_name(get_bucket_alg(bucket
)));
2166 f
->dump_string("hash", crush_hash_name(get_bucket_hash(bucket
)));
2167 f
->open_array_section("items");
2168 for (int j
=0; j
<get_bucket_size(bucket
); j
++) {
2169 f
->open_object_section("item");
2170 f
->dump_int("id", get_bucket_item(bucket
, j
));
2171 f
->dump_int("weight", get_bucket_item_weight(bucket
, j
));
2172 f
->dump_int("pos", j
);
2180 f
->open_array_section("rules");
2184 f
->open_object_section("tunables");
2188 dump_choose_args(f
);
2192 // depth first walker
2194 typedef CrushTreeDumper::Item Item
;
2195 const CrushWrapper
*crush
;
2197 explicit TreeDumper(const CrushWrapper
*crush
)
2200 void dump(Formatter
*f
) {
2202 crush
->find_roots(roots
);
2203 for (set
<int>::iterator root
= roots
.begin(); root
!= roots
.end(); ++root
) {
2204 dump_item(Item(*root
, 0, crush
->get_bucket_weightf(*root
)), f
);
2209 void dump_item(const Item
& qi
, Formatter
* f
) {
2210 if (qi
.is_bucket()) {
2211 f
->open_object_section("bucket");
2212 CrushTreeDumper::dump_item_fields(crush
, qi
, f
);
2213 dump_bucket_children(qi
, f
);
2216 f
->open_object_section("device");
2217 CrushTreeDumper::dump_item_fields(crush
, qi
, f
);
2222 void dump_bucket_children(const Item
& parent
, Formatter
* f
) {
2223 f
->open_array_section("items");
2224 const int max_pos
= crush
->get_bucket_size(parent
.id
);
2225 for (int pos
= 0; pos
< max_pos
; pos
++) {
2226 int id
= crush
->get_bucket_item(parent
.id
, pos
);
2227 float weight
= crush
->get_bucket_item_weightf(parent
.id
, pos
);
2228 dump_item(Item(id
, parent
.depth
+ 1, weight
), f
);
2235 void CrushWrapper::dump_tree(Formatter
*f
) const
2238 TreeDumper(this).dump(f
);
2241 void CrushWrapper::dump_tunables(Formatter
*f
) const
2243 f
->dump_int("choose_local_tries", get_choose_local_tries());
2244 f
->dump_int("choose_local_fallback_tries", get_choose_local_fallback_tries());
2245 f
->dump_int("choose_total_tries", get_choose_total_tries());
2246 f
->dump_int("chooseleaf_descend_once", get_chooseleaf_descend_once());
2247 f
->dump_int("chooseleaf_vary_r", get_chooseleaf_vary_r());
2248 f
->dump_int("chooseleaf_stable", get_chooseleaf_stable());
2249 f
->dump_int("straw_calc_version", get_straw_calc_version());
2250 f
->dump_int("allowed_bucket_algs", get_allowed_bucket_algs());
2252 // be helpful about it
2253 if (has_jewel_tunables())
2254 f
->dump_string("profile", "jewel");
2255 else if (has_hammer_tunables())
2256 f
->dump_string("profile", "hammer");
2257 else if (has_firefly_tunables())
2258 f
->dump_string("profile", "firefly");
2259 else if (has_bobtail_tunables())
2260 f
->dump_string("profile", "bobtail");
2261 else if (has_argonaut_tunables())
2262 f
->dump_string("profile", "argonaut");
2264 f
->dump_string("profile", "unknown");
2265 f
->dump_int("optimal_tunables", (int)has_optimal_tunables());
2266 f
->dump_int("legacy_tunables", (int)has_legacy_tunables());
2268 // be helpful about minimum version required
2269 f
->dump_string("minimum_required_version", get_min_required_version());
2271 f
->dump_int("require_feature_tunables", (int)has_nondefault_tunables());
2272 f
->dump_int("require_feature_tunables2", (int)has_nondefault_tunables2());
2273 f
->dump_int("has_v2_rules", (int)has_v2_rules());
2274 f
->dump_int("require_feature_tunables3", (int)has_nondefault_tunables3());
2275 f
->dump_int("has_v3_rules", (int)has_v3_rules());
2276 f
->dump_int("has_v4_buckets", (int)has_v4_buckets());
2277 f
->dump_int("require_feature_tunables5", (int)has_nondefault_tunables5());
2278 f
->dump_int("has_v5_rules", (int)has_v5_rules());
2281 void CrushWrapper::dump_choose_args(Formatter
*f
) const
2283 f
->open_object_section("choose_args");
2284 for (auto c
: choose_args
) {
2285 crush_choose_arg_map arg_map
= c
.second
;
2286 f
->open_array_section(stringify(c
.first
).c_str());
2287 for (__u32 i
= 0; i
< arg_map
.size
; i
++) {
2288 crush_choose_arg
*arg
= &arg_map
.args
[i
];
2289 if (arg
->weight_set_size
== 0 &&
2292 f
->open_object_section("choose_args");
2293 int bucket_index
= i
;
2294 f
->dump_int("bucket_id", -1-bucket_index
);
2295 if (arg
->weight_set_size
> 0) {
2296 f
->open_array_section("weight_set");
2297 for (__u32 j
= 0; j
< arg
->weight_set_size
; j
++) {
2298 f
->open_array_section("weights");
2299 __u32
*weights
= arg
->weight_set
[j
].weights
;
2300 __u32 size
= arg
->weight_set
[j
].size
;
2301 for (__u32 k
= 0; k
< size
; k
++) {
2302 f
->dump_float("weight", (float)weights
[k
]/(float)0x10000);
2308 if (arg
->ids_size
> 0) {
2309 f
->open_array_section("ids");
2310 for (__u32 j
= 0; j
< arg
->ids_size
; j
++)
2311 f
->dump_int("id", arg
->ids
[j
]);
2321 void CrushWrapper::dump_rules(Formatter
*f
) const
2323 for (int i
=0; i
<get_max_rules(); i
++) {
2324 if (!rule_exists(i
))
2330 void CrushWrapper::dump_rule(int ruleset
, Formatter
*f
) const
2332 f
->open_object_section("rule");
2333 f
->dump_int("rule_id", ruleset
);
2334 if (get_rule_name(ruleset
))
2335 f
->dump_string("rule_name", get_rule_name(ruleset
));
2336 f
->dump_int("ruleset", get_rule_mask_ruleset(ruleset
));
2337 f
->dump_int("type", get_rule_mask_type(ruleset
));
2338 f
->dump_int("min_size", get_rule_mask_min_size(ruleset
));
2339 f
->dump_int("max_size", get_rule_mask_max_size(ruleset
));
2340 f
->open_array_section("steps");
2341 for (int j
=0; j
<get_rule_len(ruleset
); j
++) {
2342 f
->open_object_section("step");
2343 switch (get_rule_op(ruleset
, j
)) {
2344 case CRUSH_RULE_NOOP
:
2345 f
->dump_string("op", "noop");
2347 case CRUSH_RULE_TAKE
:
2348 f
->dump_string("op", "take");
2350 int item
= get_rule_arg1(ruleset
, j
);
2351 f
->dump_int("item", item
);
2353 const char *name
= get_item_name(item
);
2354 f
->dump_string("item_name", name
? name
: "");
2357 case CRUSH_RULE_EMIT
:
2358 f
->dump_string("op", "emit");
2360 case CRUSH_RULE_CHOOSE_FIRSTN
:
2361 f
->dump_string("op", "choose_firstn");
2362 f
->dump_int("num", get_rule_arg1(ruleset
, j
));
2363 f
->dump_string("type", get_type_name(get_rule_arg2(ruleset
, j
)));
2365 case CRUSH_RULE_CHOOSE_INDEP
:
2366 f
->dump_string("op", "choose_indep");
2367 f
->dump_int("num", get_rule_arg1(ruleset
, j
));
2368 f
->dump_string("type", get_type_name(get_rule_arg2(ruleset
, j
)));
2370 case CRUSH_RULE_CHOOSELEAF_FIRSTN
:
2371 f
->dump_string("op", "chooseleaf_firstn");
2372 f
->dump_int("num", get_rule_arg1(ruleset
, j
));
2373 f
->dump_string("type", get_type_name(get_rule_arg2(ruleset
, j
)));
2375 case CRUSH_RULE_CHOOSELEAF_INDEP
:
2376 f
->dump_string("op", "chooseleaf_indep");
2377 f
->dump_int("num", get_rule_arg1(ruleset
, j
));
2378 f
->dump_string("type", get_type_name(get_rule_arg2(ruleset
, j
)));
2380 case CRUSH_RULE_SET_CHOOSE_TRIES
:
2381 f
->dump_string("op", "set_choose_tries");
2382 f
->dump_int("num", get_rule_arg1(ruleset
, j
));
2384 case CRUSH_RULE_SET_CHOOSELEAF_TRIES
:
2385 f
->dump_string("op", "set_chooseleaf_tries");
2386 f
->dump_int("num", get_rule_arg1(ruleset
, j
));
2389 f
->dump_int("opcode", get_rule_op(ruleset
, j
));
2390 f
->dump_int("arg1", get_rule_arg1(ruleset
, j
));
2391 f
->dump_int("arg2", get_rule_arg2(ruleset
, j
));
2399 void CrushWrapper::list_rules(Formatter
*f
) const
2401 for (int rule
= 0; rule
< get_max_rules(); rule
++) {
2402 if (!rule_exists(rule
))
2404 f
->dump_string("name", get_rule_name(rule
));
2408 class CrushTreePlainDumper
: public CrushTreeDumper::Dumper
<ostream
> {
2410 typedef CrushTreeDumper::Dumper
<ostream
> Parent
;
2412 explicit CrushTreePlainDumper(const CrushWrapper
*crush
)
2415 void dump(ostream
*out
) {
2416 *out
<< "ID\tWEIGHT\tTYPE NAME\n";
2421 void dump_item(const CrushTreeDumper::Item
&qi
, ostream
*out
) override
{
2422 *out
<< qi
.id
<< "\t"
2423 << weightf_t(qi
.weight
) << "\t";
2425 for (int k
=0; k
< qi
.depth
; k
++)
2430 *out
<< crush
->get_type_name(crush
->get_bucket_type(qi
.id
)) << " "
2431 << crush
->get_item_name(qi
.id
);
2435 *out
<< "osd." << qi
.id
;
2442 class CrushTreeFormattingDumper
: public CrushTreeDumper::FormattingDumper
{
2444 typedef CrushTreeDumper::FormattingDumper Parent
;
2446 explicit CrushTreeFormattingDumper(const CrushWrapper
*crush
)
2449 void dump(Formatter
*f
) {
2450 f
->open_array_section("nodes");
2453 f
->open_array_section("stray");
2459 void CrushWrapper::dump_tree(ostream
*out
, Formatter
*f
) const
2462 CrushTreePlainDumper(this).dump(out
);
2464 CrushTreeFormattingDumper(this).dump(f
);
2467 void CrushWrapper::generate_test_instances(list
<CrushWrapper
*>& o
)
2469 o
.push_back(new CrushWrapper
);
2474 * Determine the default CRUSH ruleset ID to be used with
2475 * newly created replicated pools.
2477 * @returns a ruleset ID (>=0) or -1 if no suitable ruleset found
2479 int CrushWrapper::get_osd_pool_default_crush_replicated_ruleset(CephContext
*cct
)
2481 int crush_ruleset
= cct
->_conf
->osd_pool_default_crush_rule
;
2482 if (crush_ruleset
< 0) {
2483 crush_ruleset
= find_first_ruleset(pg_pool_t::TYPE_REPLICATED
);
2484 } else if (!ruleset_exists(crush_ruleset
)) {
2485 crush_ruleset
= -1; // match find_first_ruleset() retval
2487 return crush_ruleset
;
2490 bool CrushWrapper::is_valid_crush_name(const string
& s
)
2494 for (string::const_iterator p
= s
.begin(); p
!= s
.end(); ++p
) {
2498 !(*p
>= '0' && *p
<= '9') &&
2499 !(*p
>= 'A' && *p
<= 'Z') &&
2500 !(*p
>= 'a' && *p
<= 'z'))
2506 bool CrushWrapper::is_valid_crush_loc(CephContext
*cct
,
2507 const map
<string
,string
>& loc
)
2509 for (map
<string
,string
>::const_iterator l
= loc
.begin(); l
!= loc
.end(); ++l
) {
2510 if (!is_valid_crush_name(l
->first
) ||
2511 !is_valid_crush_name(l
->second
)) {
2512 ldout(cct
, 1) << "loc["
2513 << l
->first
<< "] = '"
2514 << l
->second
<< "' not a valid crush name ([A-Za-z0-9_-.]+)"
2522 int CrushWrapper::_choose_type_stack(
2524 const vector
<pair
<int,int>>& stack
,
2525 const set
<int>& overfull
,
2526 const vector
<int>& underfull
,
2527 const vector
<int>& orig
,
2528 vector
<int>::const_iterator
& i
,
2530 vector
<int> *pw
) const
2532 vector
<int> w
= *pw
;
2535 ldout(cct
, 10) << __func__
<< " stack " << stack
2541 vector
<int> cumulative_fanout(stack
.size());
2543 for (int j
= (int)stack
.size() - 1; j
>= 0; --j
) {
2544 cumulative_fanout
[j
] = f
;
2545 f
*= stack
[j
].second
;
2547 ldout(cct
, 10) << __func__
<< " cumulative_fanout " << cumulative_fanout
2550 // identify underful targets for each intermediate level.
2551 // this serves two purposes:
2552 // 1. we can tell when we are selecting a bucket that does not have any underfull
2553 // devices beneath it. that means that if the current input includes an overfull
2554 // device, we won't be able to find an underfull device with this parent to
2556 // 2. when we decide we should reject a bucket due to the above, this list gives us
2557 // a list of peers to consider that *do* have underfull devices available.. (we
2558 // are careful to pick one that has the same parent.)
2559 vector
<set
<int>> underfull_buckets
; // level -> set of buckets with >0 underfull item(s)
2560 underfull_buckets
.resize(stack
.size() - 1);
2561 for (auto osd
: underfull
) {
2563 for (int j
= (int)stack
.size() - 2; j
>= 0; --j
) {
2564 int type
= stack
[j
].first
;
2565 item
= get_parent_of_type(item
, type
);
2566 ldout(cct
, 10) << __func__
<< " underfull " << osd
<< " type " << type
2567 << " is " << item
<< dendl
;
2568 underfull_buckets
[j
].insert(item
);
2571 ldout(cct
, 20) << __func__
<< " underfull_buckets " << underfull_buckets
<< dendl
;
2573 for (unsigned j
= 0; j
< stack
.size(); ++j
) {
2574 int type
= stack
[j
].first
;
2575 int fanout
= stack
[j
].second
;
2576 int cum_fanout
= cumulative_fanout
[j
];
2577 ldout(cct
, 10) << " level " << j
<< ": type " << type
<< " fanout " << fanout
2578 << " cumulative " << cum_fanout
2579 << " w " << w
<< dendl
;
2582 for (auto from
: w
) {
2583 ldout(cct
, 10) << " from " << from
<< dendl
;
2584 // identify leaves under each choice. we use this to check whether any of these
2585 // leaves are overfull. (if so, we need to make sure there are underfull candidates
2586 // to swap for them.)
2587 vector
<set
<int>> leaves
;
2588 leaves
.resize(fanout
);
2589 for (int pos
= 0; pos
< fanout
; ++pos
) {
2592 int item
= get_parent_of_type(*tmpi
, type
);
2595 while (n
-- && tmpi
!= orig
.end()) {
2596 leaves
[pos
].insert(*tmpi
++);
2598 ldout(cct
, 10) << __func__
<< " from " << *tmpi
<< " got " << item
2599 << " of type " << type
<< " over leaves " << leaves
[pos
] << dendl
;
2602 bool replaced
= false;
2603 if (overfull
.count(*i
)) {
2604 for (auto item
: underfull
) {
2605 ldout(cct
, 10) << __func__
<< " pos " << pos
2606 << " was " << *i
<< " considering " << item
2608 if (used
.count(item
)) {
2609 ldout(cct
, 20) << __func__
<< " in used " << used
<< dendl
;
2612 if (!subtree_contains(from
, item
)) {
2613 ldout(cct
, 20) << __func__
<< " not in subtree " << from
<< dendl
;
2616 if (std::find(orig
.begin(), orig
.end(), item
) != orig
.end()) {
2617 ldout(cct
, 20) << __func__
<< " in orig " << orig
<< dendl
;
2622 ldout(cct
, 10) << __func__
<< " pos " << pos
<< " replace "
2623 << *i
<< " -> " << item
<< dendl
;
2630 ldout(cct
, 10) << __func__
<< " pos " << pos
<< " keep " << *i
2635 if (i
== orig
.end()) {
2636 ldout(cct
, 10) << __func__
<< " end of orig, break 1" << dendl
;
2641 if (j
+ 1 < stack
.size()) {
2642 // check if any buckets have overfull leaves but no underfull candidates
2643 for (int pos
= 0; pos
< fanout
; ++pos
) {
2644 if (underfull_buckets
[j
].count(o
[pos
]) == 0) {
2645 // are any leaves overfull?
2646 bool any_overfull
= false;
2647 for (auto osd
: leaves
[pos
]) {
2648 if (overfull
.count(osd
)) {
2649 any_overfull
= true;
2653 ldout(cct
, 10) << " bucket " << o
[pos
] << " has no underfull targets and "
2654 << ">0 leaves " << leaves
[pos
] << " is overfull; alts "
2655 << underfull_buckets
[j
]
2657 for (auto alt
: underfull_buckets
[j
]) {
2658 if (std::find(o
.begin(), o
.end(), alt
) == o
.end()) {
2659 // see if alt has the same parent
2661 get_parent_of_type(o
[pos
], stack
[j
-1].first
) ==
2662 get_parent_of_type(alt
, stack
[j
-1].first
)) {
2664 ldout(cct
, 10) << " replacing " << o
[pos
]
2665 << " (which has no underfull leaves) with " << alt
2667 << get_parent_of_type(alt
, stack
[j
-1].first
) << " type "
2668 << type
<< ")" << dendl
;
2670 ldout(cct
, 10) << " replacing " << o
[pos
]
2671 << " (which has no underfull leaves) with " << alt
2672 << " (first level)" << dendl
;
2676 ldout(cct
, 30) << " alt " << alt
<< " for " << o
[pos
]
2677 << " has different parent, skipping" << dendl
;
2685 if (i
== orig
.end()) {
2686 ldout(cct
, 10) << __func__
<< " end of orig, break 2" << dendl
;
2690 ldout(cct
, 10) << __func__
<< " w <- " << o
<< " was " << w
<< dendl
;
2697 int CrushWrapper::try_remap_rule(
2701 const set
<int>& overfull
,
2702 const vector
<int>& underfull
,
2703 const vector
<int>& orig
,
2704 vector
<int> *out
) const
2706 const crush_map
*map
= crush
;
2707 const crush_rule
*rule
= get_rule(ruleno
);
2710 ldout(cct
, 10) << __func__
<< " ruleno " << ruleno
2711 << " numrep " << maxout
<< " overfull " << overfull
2712 << " underfull " << underfull
<< " orig " << orig
2714 vector
<int> w
; // working set
2717 auto i
= orig
.begin();
2720 vector
<pair
<int,int>> type_stack
; // (type, fan-out)
2722 for (unsigned step
= 0; step
< rule
->len
; ++step
) {
2723 const crush_rule_step
*curstep
= &rule
->steps
[step
];
2724 ldout(cct
, 10) << __func__
<< " step " << step
<< " w " << w
<< dendl
;
2725 switch (curstep
->op
) {
2726 case CRUSH_RULE_TAKE
:
2727 if ((curstep
->arg1
>= 0 && curstep
->arg1
< map
->max_devices
) ||
2728 (-1-curstep
->arg1
>= 0 && -1-curstep
->arg1
< map
->max_buckets
&&
2729 map
->buckets
[-1-curstep
->arg1
])) {
2731 w
.push_back(curstep
->arg1
);
2732 ldout(cct
, 10) << __func__
<< " take " << w
<< dendl
;
2734 ldout(cct
, 1) << " bad take value " << curstep
->arg1
<< dendl
;
2738 case CRUSH_RULE_CHOOSELEAF_FIRSTN
:
2739 case CRUSH_RULE_CHOOSELEAF_INDEP
:
2741 int numrep
= curstep
->arg1
;
2742 int type
= curstep
->arg2
;
2745 type_stack
.push_back(make_pair(type
, numrep
));
2746 type_stack
.push_back(make_pair(0, 1));
2747 int r
= _choose_type_stack(cct
, type_stack
, overfull
, underfull
, orig
,
2755 case CRUSH_RULE_CHOOSE_FIRSTN
:
2756 case CRUSH_RULE_CHOOSE_INDEP
:
2758 int numrep
= curstep
->arg1
;
2759 int type
= curstep
->arg2
;
2762 type_stack
.push_back(make_pair(type
, numrep
));
2766 case CRUSH_RULE_EMIT
:
2767 ldout(cct
, 10) << " emit " << w
<< dendl
;
2768 if (!type_stack
.empty()) {
2769 int r
= _choose_type_stack(cct
, type_stack
, overfull
, underfull
, orig
,
2775 for (auto item
: w
) {
2776 out
->push_back(item
);