1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #ifndef CEPH_CRUSH_WRAPPER_H
5 #define CEPH_CRUSH_WRAPPER_H
14 #include "include/types.h"
23 #include "include/assert.h"
24 #include "include/err.h"
25 #include "include/encoding.h"
26 #include "include/mempool.h"
28 #include "common/Mutex.h"
30 #define BUG_ON(x) assert(!(x))
36 namespace CrushTreeDumper
{
37 typedef mempool::osdmap::map
<int64_t,string
> name_map_t
;
40 WRITE_RAW_ENCODER(crush_rule_mask
) // it's all u8's
42 inline static void encode(const crush_rule_step
&s
, bufferlist
&bl
)
48 inline static void decode(crush_rule_step
&s
, bufferlist::iterator
&p
)
58 // magic value used by OSDMap for a "default" fallback choose_args, used if
59 // the choose_arg_map passed to do_rule does not exist. if this also
60 // doesn't exist, fall back to canonical weights.
62 DEFAULT_CHOOSE_ARGS
= -1
65 std::map
<int32_t, string
> type_map
; /* bucket/device type names */
66 std::map
<int32_t, string
> name_map
; /* bucket/device names */
67 std::map
<int32_t, string
> rule_name_map
;
69 std::map
<int32_t, int32_t> class_map
; /* item id -> class id */
70 std::map
<int32_t, string
> class_name
; /* class id -> class name */
71 std::map
<string
, int32_t> class_rname
; /* class name -> class id */
72 std::map
<int32_t, map
<int32_t, int32_t> > class_bucket
; /* bucket[id][class] == id */
73 std::map
<int64_t, crush_choose_arg_map
> choose_args
;
76 struct crush_map
*crush
;
78 bool have_uniform_rules
= false;
81 mutable bool have_rmaps
;
82 mutable std::map
<string
, int> type_rmap
, name_rmap
, rule_name_rmap
;
83 void build_rmaps() const {
84 if (have_rmaps
) return;
85 build_rmap(type_map
, type_rmap
);
86 build_rmap(name_map
, name_rmap
);
87 build_rmap(rule_name_map
, rule_name_rmap
);
90 void build_rmap(const map
<int, string
> &f
, std::map
<string
, int> &r
) const {
92 for (std::map
<int, string
>::const_iterator p
= f
.begin(); p
!= f
.end(); ++p
)
93 r
[p
->second
] = p
->first
;
97 CrushWrapper(const CrushWrapper
& other
);
98 const CrushWrapper
& operator=(const CrushWrapper
& other
);
100 CrushWrapper() : crush(0), have_rmaps(false) {
105 crush_destroy(crush
);
109 crush_map
*get_crush_map() { return crush
; }
114 crush_destroy(crush
);
115 crush
= crush_create();
120 set_tunables_default();
123 /// true if any rule has a ruleset != the rule id
124 bool has_legacy_rulesets() const;
126 /// fix rules whose ruleid != ruleset
127 int renumber_rules_by_ruleset();
129 /// true if any ruleset has more than 1 rule
130 bool has_multirule_rulesets() const;
132 /// true if any buckets that aren't straw2
133 bool has_non_straw2_buckets() const;
136 void set_tunables_argonaut() {
137 crush
->choose_local_tries
= 2;
138 crush
->choose_local_fallback_tries
= 5;
139 crush
->choose_total_tries
= 19;
140 crush
->chooseleaf_descend_once
= 0;
141 crush
->chooseleaf_vary_r
= 0;
142 crush
->chooseleaf_stable
= 0;
143 crush
->allowed_bucket_algs
= CRUSH_LEGACY_ALLOWED_BUCKET_ALGS
;
145 void set_tunables_bobtail() {
146 crush
->choose_local_tries
= 0;
147 crush
->choose_local_fallback_tries
= 0;
148 crush
->choose_total_tries
= 50;
149 crush
->chooseleaf_descend_once
= 1;
150 crush
->chooseleaf_vary_r
= 0;
151 crush
->chooseleaf_stable
= 0;
152 crush
->allowed_bucket_algs
= CRUSH_LEGACY_ALLOWED_BUCKET_ALGS
;
154 void set_tunables_firefly() {
155 crush
->choose_local_tries
= 0;
156 crush
->choose_local_fallback_tries
= 0;
157 crush
->choose_total_tries
= 50;
158 crush
->chooseleaf_descend_once
= 1;
159 crush
->chooseleaf_vary_r
= 1;
160 crush
->chooseleaf_stable
= 0;
161 crush
->allowed_bucket_algs
= CRUSH_LEGACY_ALLOWED_BUCKET_ALGS
;
163 void set_tunables_hammer() {
164 crush
->choose_local_tries
= 0;
165 crush
->choose_local_fallback_tries
= 0;
166 crush
->choose_total_tries
= 50;
167 crush
->chooseleaf_descend_once
= 1;
168 crush
->chooseleaf_vary_r
= 1;
169 crush
->chooseleaf_stable
= 0;
170 crush
->allowed_bucket_algs
=
171 (1 << CRUSH_BUCKET_UNIFORM
) |
172 (1 << CRUSH_BUCKET_LIST
) |
173 (1 << CRUSH_BUCKET_STRAW
) |
174 (1 << CRUSH_BUCKET_STRAW2
);
176 void set_tunables_jewel() {
177 crush
->choose_local_tries
= 0;
178 crush
->choose_local_fallback_tries
= 0;
179 crush
->choose_total_tries
= 50;
180 crush
->chooseleaf_descend_once
= 1;
181 crush
->chooseleaf_vary_r
= 1;
182 crush
->chooseleaf_stable
= 1;
183 crush
->allowed_bucket_algs
=
184 (1 << CRUSH_BUCKET_UNIFORM
) |
185 (1 << CRUSH_BUCKET_LIST
) |
186 (1 << CRUSH_BUCKET_STRAW
) |
187 (1 << CRUSH_BUCKET_STRAW2
);
190 void set_tunables_legacy() {
191 set_tunables_argonaut();
192 crush
->straw_calc_version
= 0;
194 void set_tunables_optimal() {
195 set_tunables_jewel();
196 crush
->straw_calc_version
= 1;
198 void set_tunables_default() {
199 set_tunables_jewel();
200 crush
->straw_calc_version
= 1;
203 int get_choose_local_tries() const {
204 return crush
->choose_local_tries
;
206 void set_choose_local_tries(int n
) {
207 crush
->choose_local_tries
= n
;
210 int get_choose_local_fallback_tries() const {
211 return crush
->choose_local_fallback_tries
;
213 void set_choose_local_fallback_tries(int n
) {
214 crush
->choose_local_fallback_tries
= n
;
217 int get_choose_total_tries() const {
218 return crush
->choose_total_tries
;
220 void set_choose_total_tries(int n
) {
221 crush
->choose_total_tries
= n
;
224 int get_chooseleaf_descend_once() const {
225 return crush
->chooseleaf_descend_once
;
227 void set_chooseleaf_descend_once(int n
) {
228 crush
->chooseleaf_descend_once
= !!n
;
231 int get_chooseleaf_vary_r() const {
232 return crush
->chooseleaf_vary_r
;
234 void set_chooseleaf_vary_r(int n
) {
235 crush
->chooseleaf_vary_r
= n
;
238 int get_chooseleaf_stable() const {
239 return crush
->chooseleaf_stable
;
241 void set_chooseleaf_stable(int n
) {
242 crush
->chooseleaf_stable
= n
;
245 int get_straw_calc_version() const {
246 return crush
->straw_calc_version
;
248 void set_straw_calc_version(int n
) {
249 crush
->straw_calc_version
= n
;
252 unsigned get_allowed_bucket_algs() const {
253 return crush
->allowed_bucket_algs
;
255 void set_allowed_bucket_algs(unsigned n
) {
256 crush
->allowed_bucket_algs
= n
;
259 bool has_argonaut_tunables() const {
261 crush
->choose_local_tries
== 2 &&
262 crush
->choose_local_fallback_tries
== 5 &&
263 crush
->choose_total_tries
== 19 &&
264 crush
->chooseleaf_descend_once
== 0 &&
265 crush
->chooseleaf_vary_r
== 0 &&
266 crush
->chooseleaf_stable
== 0 &&
267 crush
->allowed_bucket_algs
== CRUSH_LEGACY_ALLOWED_BUCKET_ALGS
;
269 bool has_bobtail_tunables() const {
271 crush
->choose_local_tries
== 0 &&
272 crush
->choose_local_fallback_tries
== 0 &&
273 crush
->choose_total_tries
== 50 &&
274 crush
->chooseleaf_descend_once
== 1 &&
275 crush
->chooseleaf_vary_r
== 0 &&
276 crush
->chooseleaf_stable
== 0 &&
277 crush
->allowed_bucket_algs
== CRUSH_LEGACY_ALLOWED_BUCKET_ALGS
;
279 bool has_firefly_tunables() const {
281 crush
->choose_local_tries
== 0 &&
282 crush
->choose_local_fallback_tries
== 0 &&
283 crush
->choose_total_tries
== 50 &&
284 crush
->chooseleaf_descend_once
== 1 &&
285 crush
->chooseleaf_vary_r
== 1 &&
286 crush
->chooseleaf_stable
== 0 &&
287 crush
->allowed_bucket_algs
== CRUSH_LEGACY_ALLOWED_BUCKET_ALGS
;
289 bool has_hammer_tunables() const {
291 crush
->choose_local_tries
== 0 &&
292 crush
->choose_local_fallback_tries
== 0 &&
293 crush
->choose_total_tries
== 50 &&
294 crush
->chooseleaf_descend_once
== 1 &&
295 crush
->chooseleaf_vary_r
== 1 &&
296 crush
->chooseleaf_stable
== 0 &&
297 crush
->allowed_bucket_algs
== ((1 << CRUSH_BUCKET_UNIFORM
) |
298 (1 << CRUSH_BUCKET_LIST
) |
299 (1 << CRUSH_BUCKET_STRAW
) |
300 (1 << CRUSH_BUCKET_STRAW2
));
302 bool has_jewel_tunables() const {
304 crush
->choose_local_tries
== 0 &&
305 crush
->choose_local_fallback_tries
== 0 &&
306 crush
->choose_total_tries
== 50 &&
307 crush
->chooseleaf_descend_once
== 1 &&
308 crush
->chooseleaf_vary_r
== 1 &&
309 crush
->chooseleaf_stable
== 1 &&
310 crush
->allowed_bucket_algs
== ((1 << CRUSH_BUCKET_UNIFORM
) |
311 (1 << CRUSH_BUCKET_LIST
) |
312 (1 << CRUSH_BUCKET_STRAW
) |
313 (1 << CRUSH_BUCKET_STRAW2
));
316 bool has_optimal_tunables() const {
317 return has_jewel_tunables();
319 bool has_legacy_tunables() const {
320 return has_argonaut_tunables();
323 bool has_nondefault_tunables() const {
325 (crush
->choose_local_tries
!= 2 ||
326 crush
->choose_local_fallback_tries
!= 5 ||
327 crush
->choose_total_tries
!= 19);
329 bool has_nondefault_tunables2() const {
331 crush
->chooseleaf_descend_once
!= 0;
333 bool has_nondefault_tunables3() const {
335 crush
->chooseleaf_vary_r
!= 0;
337 bool has_nondefault_tunables5() const {
339 crush
->chooseleaf_stable
!= 0;
342 bool has_v2_rules() const;
343 bool has_v3_rules() const;
344 bool has_v4_buckets() const;
345 bool has_v5_rules() const;
346 bool has_choose_args() const; // any choose_args
347 bool has_incompat_choose_args() const; // choose_args that can't be made compat
349 bool is_v2_rule(unsigned ruleid
) const;
350 bool is_v3_rule(unsigned ruleid
) const;
351 bool is_v5_rule(unsigned ruleid
) const;
353 string
get_min_required_version() const {
354 if (has_v5_rules() || has_nondefault_tunables5())
356 else if (has_v4_buckets())
358 else if (has_nondefault_tunables3())
360 else if (has_nondefault_tunables2() || has_nondefault_tunables())
366 // default bucket types
367 unsigned get_default_bucket_alg() const {
368 // in order of preference
369 if (crush
->allowed_bucket_algs
& (1 << CRUSH_BUCKET_STRAW2
))
370 return CRUSH_BUCKET_STRAW2
;
371 if (crush
->allowed_bucket_algs
& (1 << CRUSH_BUCKET_STRAW
))
372 return CRUSH_BUCKET_STRAW
;
373 if (crush
->allowed_bucket_algs
& (1 << CRUSH_BUCKET_TREE
))
374 return CRUSH_BUCKET_TREE
;
375 if (crush
->allowed_bucket_algs
& (1 << CRUSH_BUCKET_LIST
))
376 return CRUSH_BUCKET_LIST
;
377 if (crush
->allowed_bucket_algs
& (1 << CRUSH_BUCKET_UNIFORM
))
378 return CRUSH_BUCKET_UNIFORM
;
383 int get_num_type_names() const {
384 return type_map
.size();
386 int get_max_type_id() const {
387 if (type_map
.empty())
389 return type_map
.rbegin()->first
;
391 int get_type_id(const string
& name
) const {
393 if (type_rmap
.count(name
))
394 return type_rmap
[name
];
397 const char *get_type_name(int t
) const {
398 std::map
<int,string
>::const_iterator p
= type_map
.find(t
);
399 if (p
!= type_map
.end())
400 return p
->second
.c_str();
403 void set_type_name(int i
, const string
& name
) {
410 bool name_exists(const string
& name
) const {
412 return name_rmap
.count(name
);
414 bool item_exists(int i
) const {
415 return name_map
.count(i
);
417 int get_item_id(const string
& name
) const {
419 if (name_rmap
.count(name
))
420 return name_rmap
[name
];
423 const char *get_item_name(int t
) const {
424 std::map
<int,string
>::const_iterator p
= name_map
.find(t
);
425 if (p
!= name_map
.end())
426 return p
->second
.c_str();
429 int set_item_name(int i
, const string
& name
) {
430 if (!is_valid_crush_name(name
))
437 void swap_names(int a
, int b
) {
438 string an
= name_map
[a
];
439 string bn
= name_map
[b
];
447 int split_id_class(int i
, int *idout
, int *classout
) const;
449 bool class_exists(const string
& name
) const {
450 return class_rname
.count(name
);
452 const char *get_class_name(int i
) const {
453 auto p
= class_name
.find(i
);
454 if (p
!= class_name
.end())
455 return p
->second
.c_str();
458 int get_class_id(const string
& name
) const {
459 auto p
= class_rname
.find(name
);
460 if (p
!= class_rname
.end())
465 int remove_class_name(const string
& name
) {
466 auto p
= class_rname
.find(name
);
467 if (p
== class_rname
.end())
469 int class_id
= p
->second
;
470 auto q
= class_name
.find(class_id
);
471 if (q
== class_name
.end())
473 class_rname
.erase(name
);
474 class_name
.erase(class_id
);
478 int32_t _alloc_class_id() const;
480 int get_or_create_class_id(const string
& name
) {
481 int c
= get_class_id(name
);
483 int i
= _alloc_class_id();
484 class_name
[i
] = name
;
485 class_rname
[name
] = i
;
492 const char *get_item_class(int t
) const {
493 std::map
<int,int>::const_iterator p
= class_map
.find(t
);
494 if (p
== class_map
.end())
496 return get_class_name(p
->second
);
498 int set_item_class(int i
, const string
& name
) {
499 if (!is_valid_crush_name(name
))
501 class_map
[i
] = get_or_create_class_id(name
);
504 int set_item_class(int i
, int c
) {
508 void get_devices_by_class(const string
&name
, set
<int> *devices
) const {
511 if (!class_exists(name
)) {
514 auto cid
= get_class_id(name
);
515 for (auto& p
: class_map
) {
516 if (p
.first
>= 0 && p
.second
== cid
) {
517 devices
->insert(p
.first
);
521 void class_remove_item(int i
) {
522 auto it
= class_map
.find(i
);
523 if (it
== class_map
.end()) {
528 int can_rename_item(const string
& srcname
,
529 const string
& dstname
,
531 int rename_item(const string
& srcname
,
532 const string
& dstname
,
534 int can_rename_bucket(const string
& srcname
,
535 const string
& dstname
,
537 int rename_bucket(const string
& srcname
,
538 const string
& dstname
,
542 bool rule_exists(string name
) const {
544 return rule_name_rmap
.count(name
);
546 int get_rule_id(string name
) const {
548 if (rule_name_rmap
.count(name
))
549 return rule_name_rmap
[name
];
552 const char *get_rule_name(int t
) const {
553 std::map
<int,string
>::const_iterator p
= rule_name_map
.find(t
);
554 if (p
!= rule_name_map
.end())
555 return p
->second
.c_str();
558 void set_rule_name(int i
, const string
& name
) {
559 rule_name_map
[i
] = name
;
561 rule_name_rmap
[name
] = i
;
563 bool is_shadow_item(int id
) const {
564 const char *name
= get_item_name(id
);
565 return name
&& !is_valid_crush_name(name
);
570 * find tree nodes referenced by rules by a 'take' command
572 * Note that these may not be parentless roots.
574 void find_takes(set
<int>& roots
) const;
579 * These are parentless nodes in the map.
581 void find_roots(set
<int>& roots
) const;
585 * find tree roots that contain shadow (device class) items only
587 void find_shadow_roots(set
<int>& roots
) const {
591 if (is_shadow_item(p
)) {
598 * find tree roots that are not shadow (device class) items
600 * These are parentless nodes in the map that are not shadow
601 * items for device classes.
603 void find_nonshadow_roots(set
<int>& roots
) const {
607 if (!is_shadow_item(p
)) {
614 * see if an item is contained within a subtree
616 * @param root haystack
618 * @return true if the item is located beneath the given node
620 bool subtree_contains(int root
, int item
) const;
624 * search for an item in any bucket
627 * @return true if present
629 bool _search_item_exists(int i
) const;
633 * see if item is located where we think it is
635 * This verifies that the given item is located at a particular
636 * location in the hierarchy. However, that check is imprecise; we
637 * are actually verifying that the most specific location key/value
638 * is correct. For example, if loc specifies that rack=foo and
639 * host=bar, it will verify that host=bar is correct; any placement
640 * above that level in the hierarchy is ignored. This matches the
641 * semantics for insert_item().
644 * @param item item id
645 * @param loc location to check (map of type to bucket names)
646 * @param weight optional pointer to weight of item at that location
647 * @return true if item is at specified location
649 bool check_item_loc(CephContext
*cct
, int item
, const map
<string
,string
>& loc
, int *iweight
);
650 bool check_item_loc(CephContext
*cct
, int item
, const map
<string
,string
>& loc
, float *weight
) {
652 bool ret
= check_item_loc(cct
, item
, loc
, &iweight
);
654 *weight
= (float)iweight
/ (float)0x10000;
660 * returns the (type, name) of the parent bucket of id
662 * FIXME: ambiguous for items that occur multiple times in the map
664 pair
<string
,string
> get_immediate_parent(int id
, int *ret
= NULL
);
666 int get_immediate_parent_id(int id
, int *parent
) const;
669 * return ancestor of the given type, or 0 if none
670 * (parent is always a bucket and thus <0)
672 int get_parent_of_type(int id
, int type
) const;
675 * get the fully qualified location of a device by successively finding
676 * parents beginning at ID and ending at highest type number specified in
677 * the CRUSH map which assumes that if device foo is under device bar, the
678 * type_id of foo < bar where type_id is the integer specified in the CRUSH map
680 * returns the location in the form of (type=foo) where type is a type of bucket
681 * specified in the CRUSH map and foo is a name specified in the CRUSH map
683 map
<string
, string
> get_full_location(int id
);
686 * identical to get_full_location(int id) although it returns the type/name
687 * pairs in the order they occur in the hierarchy.
689 * returns -ENOENT if id is not found.
691 int get_full_location_ordered(int id
, vector
<pair
<string
, string
> >& path
);
694 * identical to get_full_location_ordered(int id, vector<pair<string, string> >& path),
695 * although it returns a concatenated string with the type/name pairs in descending
696 * hierarchical order with format key1=val1,key2=val2.
698 * returns the location in descending hierarchy as a string.
700 string
get_full_location_ordered_string(int id
);
703 * returns (type_id, type) of all parent buckets between id and
704 * default, can be used to check for anomolous CRUSH maps
706 map
<int, string
> get_parent_hierarchy(int id
);
709 * enumerate immediate children of given node
711 * @param id parent bucket or device id
712 * @return number of items, or error
714 int get_children(int id
, list
<int> *children
);
717 * enumerate leaves(devices) of given node
719 * @param name parent bucket name
720 * @return 0 on success or a negative errno on error.
722 int get_leaves(const string
&name
, set
<int> *leaves
);
723 int _get_leaves(int id
, list
<int> *leaves
); // worker
726 * insert an item into the map at a specific position
728 * Add an item as a specific location of the hierarchy.
729 * Specifically, we look for the most specific location constraint
730 * for which a bucket already exists, and then create intervening
731 * buckets beneath that in order to place the item.
733 * Note that any location specifiers *above* the most specific match
734 * are ignored. For example, if we specify that osd.12 goes in
735 * host=foo, rack=bar, and row=baz, and rack=bar is the most
736 * specific match, we will create host=foo beneath that point and
737 * put osd.12 inside it. However, we will not verify that rack=bar
738 * is beneath row=baz or move it.
740 * In short, we will build out a hierarchy, and move leaves around,
741 * but not adjust the hierarchy's internal structure. Yet.
743 * If the item is already present in the map, we will return EEXIST.
744 * If the location key/value pairs are nonsensical
745 * (rack=nameofdevice), or location specifies that do not attach us
746 * to any existing part of the hierarchy, we will return EINVAL.
750 * @param weight item weight
751 * @param name item name
752 * @param loc location (map of type to bucket names)
753 * @return 0 for success, negative on error
755 int insert_item(CephContext
*cct
, int id
, float weight
, string name
, const map
<string
,string
>& loc
);
758 * move a bucket in the hierarchy to the given location
760 * This has the same location and ancestor creation behavior as
761 * insert_item(), but will relocate the specified existing bucket.
764 * @param id bucket id
765 * @param loc location (map of type to bucket names)
766 * @return 0 for success, negative on error
768 int move_bucket(CephContext
*cct
, int id
, const map
<string
,string
>& loc
);
771 * swap bucket contents of two buckets without touching bucket ids
774 * @param src bucket a
775 * @param dst bucket b
776 * @return 0 for success, negative on error
778 int swap_bucket(CephContext
*cct
, int src
, int dst
);
781 * add a link to an existing bucket in the hierarchy to the new location
783 * This has the same location and ancestor creation behavior as
784 * insert_item(), but will add a new link to the specified existing
788 * @param id bucket id
789 * @param loc location (map of type to bucket names)
790 * @return 0 for success, negative on error
792 int link_bucket(CephContext
*cct
, int id
, const map
<string
,string
>& loc
);
795 * add or update an item's position in the map
797 * This is analogous to insert_item, except we will move an item if
798 * it is already present.
802 * @param weight item weight
803 * @param name item name
804 * @param loc location (map of type to bucket names)
805 * @return 0 for no change, 1 for successful change, negative on error
807 int update_item(CephContext
*cct
, int id
, float weight
, string name
, const map
<string
,string
>& loc
);
810 * create or move an item, but do not adjust its weight if it already exists
813 * @param item item id
814 * @param weight initial item weight (if we need to create it)
815 * @param name item name
816 * @param loc location (map of type to bucket names)
817 * @return 0 for no change, 1 for successful change, negative on error
819 int create_or_move_item(CephContext
*cct
, int item
, float weight
, string name
,
820 const map
<string
,string
>& loc
);
823 * remove all instances of an item from the map
826 * @param id item id to remove
827 * @param unlink_only unlink but do not remove bucket (useful if multiple links or not empty)
828 * @return 0 on success, negative on error
830 int remove_item(CephContext
*cct
, int id
, bool unlink_only
);
833 * recursively remove buckets starting at item and stop removing
834 * when a bucket is in use.
836 * @param item id to remove
837 * @param unused true if only unused items should be removed
838 * @return 0 on success, negative on error
840 int remove_root(int item
, bool unused
);
843 * remove all instances of an item nested beneath a certain point from the map
846 * @param id item id to remove
847 * @param ancestor ancestor item id under which to search for id
848 * @param unlink_only unlink but do not remove bucket (useful if bucket has multiple links or is not empty)
849 * @return 0 on success, negative on error
852 bool _maybe_remove_last_instance(CephContext
*cct
, int id
, bool unlink_only
);
853 int _remove_item_under(CephContext
*cct
, int id
, int ancestor
, bool unlink_only
);
854 bool _bucket_is_in_use(int id
);
856 int remove_item_under(CephContext
*cct
, int id
, int ancestor
, bool unlink_only
);
859 * calculate the locality/distance from a given id to a crush location map
861 * Specifically, we look for the lowest-valued type for which the
862 * location of id matches that described in loc.
865 * @param id the existing id in the map
866 * @param loc a set of key=value pairs describing a location in the hierarchy
868 int get_common_ancestor_distance(CephContext
*cct
, int id
,
869 const std::multimap
<string
,string
>& loc
);
872 * parse a set of key/value pairs out of a string vector
874 * These are used to describe a location in the CRUSH hierarchy.
876 * @param args list of strings (each key= or key=value)
877 * @param ploc pointer to a resulting location map or multimap
879 static int parse_loc_map(const std::vector
<string
>& args
,
880 std::map
<string
,string
> *ploc
);
881 static int parse_loc_multimap(const std::vector
<string
>& args
,
882 std::multimap
<string
,string
> *ploc
);
885 * get an item's weight
887 * Will return the weight for the first instance it finds.
889 * @param id item id to check
890 * @return weight of item
892 int get_item_weight(int id
) const;
893 float get_item_weightf(int id
) const {
894 return (float)get_item_weight(id
) / (float)0x10000;
896 int get_item_weight_in_loc(int id
, const map
<string
,string
> &loc
);
897 float get_item_weightf_in_loc(int id
, const map
<string
,string
> &loc
) {
898 return (float)get_item_weight_in_loc(id
, loc
) / (float)0x10000;
901 int validate_weightf(float weight
) {
902 uint64_t iweight
= weight
* 0x10000;
903 if (iweight
> std::numeric_limits
<int>::max()) {
908 int adjust_item_weight(CephContext
*cct
, int id
, int weight
);
909 int adjust_item_weightf(CephContext
*cct
, int id
, float weight
) {
910 int r
= validate_weightf(weight
);
914 return adjust_item_weight(cct
, id
, (int)(weight
* (float)0x10000));
916 int adjust_item_weight_in_loc(CephContext
*cct
, int id
, int weight
, const map
<string
,string
>& loc
);
917 int adjust_item_weightf_in_loc(CephContext
*cct
, int id
, float weight
, const map
<string
,string
>& loc
) {
918 int r
= validate_weightf(weight
);
922 return adjust_item_weight_in_loc(cct
, id
, (int)(weight
* (float)0x10000), loc
);
924 void reweight(CephContext
*cct
);
926 int adjust_subtree_weight(CephContext
*cct
, int id
, int weight
);
927 int adjust_subtree_weightf(CephContext
*cct
, int id
, float weight
) {
928 int r
= validate_weightf(weight
);
932 return adjust_subtree_weight(cct
, id
, (int)(weight
* (float)0x10000));
935 /// check if item id is present in the map hierarchy
936 bool check_item_present(int id
) const;
940 int get_max_devices() const {
941 if (!crush
) return 0;
942 return crush
->max_devices
;
948 crush_rule
*get_rule(unsigned ruleno
) const {
949 if (!crush
) return (crush_rule
*)(-ENOENT
);
950 if (ruleno
>= crush
->max_rules
)
952 return crush
->rules
[ruleno
];
954 crush_rule_step
*get_rule_step(unsigned ruleno
, unsigned step
) const {
955 crush_rule
*n
= get_rule(ruleno
);
956 if (IS_ERR(n
)) return (crush_rule_step
*)(-EINVAL
);
957 if (step
>= n
->len
) return (crush_rule_step
*)(-EINVAL
);
958 return &n
->steps
[step
];
963 int get_max_rules() const {
964 if (!crush
) return 0;
965 return crush
->max_rules
;
967 bool rule_exists(unsigned ruleno
) const {
968 if (!crush
) return false;
969 if (ruleno
< crush
->max_rules
&&
970 crush
->rules
[ruleno
] != NULL
)
974 int get_rule_len(unsigned ruleno
) const {
975 crush_rule
*r
= get_rule(ruleno
);
976 if (IS_ERR(r
)) return PTR_ERR(r
);
979 int get_rule_mask_ruleset(unsigned ruleno
) const {
980 crush_rule
*r
= get_rule(ruleno
);
981 if (IS_ERR(r
)) return -1;
982 return r
->mask
.ruleset
;
984 int get_rule_mask_type(unsigned ruleno
) const {
985 crush_rule
*r
= get_rule(ruleno
);
986 if (IS_ERR(r
)) return -1;
989 int get_rule_mask_min_size(unsigned ruleno
) const {
990 crush_rule
*r
= get_rule(ruleno
);
991 if (IS_ERR(r
)) return -1;
992 return r
->mask
.min_size
;
994 int get_rule_mask_max_size(unsigned ruleno
) const {
995 crush_rule
*r
= get_rule(ruleno
);
996 if (IS_ERR(r
)) return -1;
997 return r
->mask
.max_size
;
999 int get_rule_op(unsigned ruleno
, unsigned step
) const {
1000 crush_rule_step
*s
= get_rule_step(ruleno
, step
);
1001 if (IS_ERR(s
)) return PTR_ERR(s
);
1004 int get_rule_arg1(unsigned ruleno
, unsigned step
) const {
1005 crush_rule_step
*s
= get_rule_step(ruleno
, step
);
1006 if (IS_ERR(s
)) return PTR_ERR(s
);
1009 int get_rule_arg2(unsigned ruleno
, unsigned step
) const {
1010 crush_rule_step
*s
= get_rule_step(ruleno
, step
);
1011 if (IS_ERR(s
)) return PTR_ERR(s
);
1016 * calculate a map of osds to weights for a given rule
1018 * Generate a map of which OSDs get how much relative weight for a
1021 * @param ruleno [in] rule id
1022 * @param pmap [out] map of osd to weight
1023 * @return 0 for success, or negative error code
1025 int get_rule_weight_osd_map(unsigned ruleno
, map
<int,float> *pmap
);
1029 int add_rule(int ruleno
, int len
, int type
, int minsize
, int maxsize
) {
1030 if (!crush
) return -ENOENT
;
1031 crush_rule
*n
= crush_make_rule(len
, ruleno
, type
, minsize
, maxsize
);
1033 ruleno
= crush_add_rule(crush
, n
, ruleno
);
1036 int set_rule_mask_max_size(unsigned ruleno
, int max_size
) {
1037 crush_rule
*r
= get_rule(ruleno
);
1038 if (IS_ERR(r
)) return -1;
1039 return r
->mask
.max_size
= max_size
;
1041 int set_rule_step(unsigned ruleno
, unsigned step
, int op
, int arg1
, int arg2
) {
1042 if (!crush
) return -ENOENT
;
1043 crush_rule
*n
= get_rule(ruleno
);
1045 crush_rule_set_step(n
, step
, op
, arg1
, arg2
);
1048 int set_rule_step_take(unsigned ruleno
, unsigned step
, int val
) {
1049 return set_rule_step(ruleno
, step
, CRUSH_RULE_TAKE
, val
, 0);
1051 int set_rule_step_set_choose_tries(unsigned ruleno
, unsigned step
, int val
) {
1052 return set_rule_step(ruleno
, step
, CRUSH_RULE_SET_CHOOSE_TRIES
, val
, 0);
1054 int set_rule_step_set_choose_local_tries(unsigned ruleno
, unsigned step
, int val
) {
1055 return set_rule_step(ruleno
, step
, CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES
, val
, 0);
1057 int set_rule_step_set_choose_local_fallback_tries(unsigned ruleno
, unsigned step
, int val
) {
1058 return set_rule_step(ruleno
, step
, CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES
, val
, 0);
1060 int set_rule_step_set_chooseleaf_tries(unsigned ruleno
, unsigned step
, int val
) {
1061 return set_rule_step(ruleno
, step
, CRUSH_RULE_SET_CHOOSELEAF_TRIES
, val
, 0);
1063 int set_rule_step_set_chooseleaf_vary_r(unsigned ruleno
, unsigned step
, int val
) {
1064 return set_rule_step(ruleno
, step
, CRUSH_RULE_SET_CHOOSELEAF_VARY_R
, val
, 0);
1066 int set_rule_step_set_chooseleaf_stable(unsigned ruleno
, unsigned step
, int val
) {
1067 return set_rule_step(ruleno
, step
, CRUSH_RULE_SET_CHOOSELEAF_STABLE
, val
, 0);
1069 int set_rule_step_choose_firstn(unsigned ruleno
, unsigned step
, int val
, int type
) {
1070 return set_rule_step(ruleno
, step
, CRUSH_RULE_CHOOSE_FIRSTN
, val
, type
);
1072 int set_rule_step_choose_indep(unsigned ruleno
, unsigned step
, int val
, int type
) {
1073 return set_rule_step(ruleno
, step
, CRUSH_RULE_CHOOSE_INDEP
, val
, type
);
1075 int set_rule_step_choose_leaf_firstn(unsigned ruleno
, unsigned step
, int val
, int type
) {
1076 return set_rule_step(ruleno
, step
, CRUSH_RULE_CHOOSELEAF_FIRSTN
, val
, type
);
1078 int set_rule_step_choose_leaf_indep(unsigned ruleno
, unsigned step
, int val
, int type
) {
1079 return set_rule_step(ruleno
, step
, CRUSH_RULE_CHOOSELEAF_INDEP
, val
, type
);
1081 int set_rule_step_emit(unsigned ruleno
, unsigned step
) {
1082 return set_rule_step(ruleno
, step
, CRUSH_RULE_EMIT
, 0, 0);
1085 int add_simple_rule(
1086 string name
, string root_name
, string failure_domain_type
,
1087 string device_class
,
1088 string mode
, int rule_type
, ostream
*err
= 0);
1091 * @param rno rule[set] id to use, -1 to pick the lowest available
1093 int add_simple_rule_at(
1094 string name
, string root_name
,
1095 string failure_domain_type
, string device_class
, string mode
,
1096 int rule_type
, int rno
, ostream
*err
= 0);
1098 int remove_rule(int ruleno
);
1102 const crush_bucket
*get_bucket(int id
) const {
1104 return (crush_bucket
*)(-EINVAL
);
1105 unsigned int pos
= (unsigned int)(-1 - id
);
1106 unsigned int max_buckets
= crush
->max_buckets
;
1107 if (pos
>= max_buckets
)
1108 return (crush_bucket
*)(-ENOENT
);
1109 crush_bucket
*ret
= crush
->buckets
[pos
];
1111 return (crush_bucket
*)(-ENOENT
);
1115 crush_bucket
*get_bucket(int id
) {
1117 return (crush_bucket
*)(-EINVAL
);
1118 unsigned int pos
= (unsigned int)(-1 - id
);
1119 unsigned int max_buckets
= crush
->max_buckets
;
1120 if (pos
>= max_buckets
)
1121 return (crush_bucket
*)(-ENOENT
);
1122 crush_bucket
*ret
= crush
->buckets
[pos
];
1124 return (crush_bucket
*)(-ENOENT
);
1128 * detach a bucket from its parent and adjust the parent weight
1130 * returns the weight of the detached bucket
1132 int detach_bucket(CephContext
*cct
, int item
);
1135 int get_max_buckets() const {
1136 if (!crush
) return -EINVAL
;
1137 return crush
->max_buckets
;
1139 int get_next_bucket_id() const {
1140 if (!crush
) return -EINVAL
;
1141 return crush_get_next_bucket_id(crush
);
1143 bool bucket_exists(int id
) const {
1144 const crush_bucket
*b
= get_bucket(id
);
1149 int get_bucket_weight(int id
) const {
1150 const crush_bucket
*b
= get_bucket(id
);
1151 if (IS_ERR(b
)) return PTR_ERR(b
);
1154 float get_bucket_weightf(int id
) const {
1155 const crush_bucket
*b
= get_bucket(id
);
1156 if (IS_ERR(b
)) return 0;
1157 return b
->weight
/ (float)0x10000;
1159 int get_bucket_type(int id
) const {
1160 const crush_bucket
*b
= get_bucket(id
);
1161 if (IS_ERR(b
)) return PTR_ERR(b
);
1164 int get_bucket_alg(int id
) const {
1165 const crush_bucket
*b
= get_bucket(id
);
1166 if (IS_ERR(b
)) return PTR_ERR(b
);
1169 int get_bucket_hash(int id
) const {
1170 const crush_bucket
*b
= get_bucket(id
);
1171 if (IS_ERR(b
)) return PTR_ERR(b
);
1174 int get_bucket_size(int id
) const {
1175 const crush_bucket
*b
= get_bucket(id
);
1176 if (IS_ERR(b
)) return PTR_ERR(b
);
1179 int get_bucket_item(int id
, int pos
) const {
1180 const crush_bucket
*b
= get_bucket(id
);
1181 if (IS_ERR(b
)) return PTR_ERR(b
);
1182 if ((__u32
)pos
>= b
->size
)
1184 return b
->items
[pos
];
1186 int get_bucket_item_weight(int id
, int pos
) const {
1187 const crush_bucket
*b
= get_bucket(id
);
1188 if (IS_ERR(b
)) return PTR_ERR(b
);
1189 return crush_get_bucket_item_weight(b
, pos
);
1191 float get_bucket_item_weightf(int id
, int pos
) const {
1192 const crush_bucket
*b
= get_bucket(id
);
1193 if (IS_ERR(b
)) return 0;
1194 return (float)crush_get_bucket_item_weight(b
, pos
) / (float)0x10000;
1198 int add_bucket(int bucketno
, int alg
, int hash
, int type
, int size
,
1199 int *items
, int *weights
, int *idout
);
1200 int bucket_add_item(crush_bucket
*bucket
, int item
, int weight
);
1201 int bucket_remove_item(struct crush_bucket
*bucket
, int item
);
1202 int bucket_adjust_item_weight(CephContext
*cct
, struct crush_bucket
*bucket
, int item
, int weight
);
1206 crush_finalize(crush
);
1207 have_uniform_rules
= !has_legacy_rulesets();
1210 int update_device_class(int id
, const string
& class_name
, const string
& name
, ostream
*ss
);
1211 int remove_device_class(CephContext
*cct
, int id
, ostream
*ss
);
1212 int device_class_clone(
1213 int original
, int device_class
,
1214 const std::map
<int32_t, map
<int32_t, int32_t>>& old_class_bucket
,
1215 const std::set
<int32_t>& used_ids
,
1217 int populate_classes(
1218 const std::map
<int32_t, map
<int32_t, int32_t>>& old_class_bucket
);
1219 bool _class_is_dead(int class_id
);
1220 void cleanup_dead_classes();
1221 int rebuild_roots_with_classes();
1222 /* remove unused roots generated for class devices */
1223 int trim_roots_with_class(bool unused
);
1225 void start_choose_profile() {
1226 free(crush
->choose_tries
);
1228 * the original choose_total_tries value was off by one (it
1229 * counted "retries" and not "tries"). add one to alloc.
1231 crush
->choose_tries
= (__u32
*)calloc(sizeof(*crush
->choose_tries
),
1232 (crush
->choose_total_tries
+ 1));
1233 memset(crush
->choose_tries
, 0,
1234 sizeof(*crush
->choose_tries
) * (crush
->choose_total_tries
+ 1));
1236 void stop_choose_profile() {
1237 free(crush
->choose_tries
);
1238 crush
->choose_tries
= 0;
1241 int get_choose_profile(__u32
**vec
) {
1242 if (crush
->choose_tries
) {
1243 *vec
= crush
->choose_tries
;
1244 return crush
->choose_total_tries
;
1250 void set_max_devices(int m
) {
1251 crush
->max_devices
= m
;
1254 int find_rule(int ruleset
, int type
, int size
) const {
1255 if (!crush
) return -1;
1256 if (!have_uniform_rules
) {
1257 return crush_find_rule(crush
, ruleset
, type
, size
);
1259 if (ruleset
< (int)crush
->max_rules
&&
1260 crush
->rules
[ruleset
])
1266 bool ruleset_exists(const int ruleset
) const {
1267 for (size_t i
= 0; i
< crush
->max_rules
; ++i
) {
1268 if (rule_exists(i
) && crush
->rules
[i
]->mask
.ruleset
== ruleset
) {
1277 * Return the lowest numbered ruleset of type `type`
1279 * @returns a ruleset ID, or -1 if no matching rulesets found.
1281 int find_first_ruleset(int type
) const {
1284 for (size_t i
= 0; i
< crush
->max_rules
; ++i
) {
1286 && crush
->rules
[i
]->mask
.type
== type
1287 && (crush
->rules
[i
]->mask
.ruleset
< result
|| result
== -1)) {
1288 result
= crush
->rules
[i
]->mask
.ruleset
;
1295 bool have_choose_args(int64_t choose_args_index
) const {
1296 return choose_args
.count(choose_args_index
);
1299 crush_choose_arg_map
choose_args_get_with_fallback(
1300 int64_t choose_args_index
) const {
1301 auto i
= choose_args
.find(choose_args_index
);
1302 if (i
== choose_args
.end()) {
1303 i
= choose_args
.find(DEFAULT_CHOOSE_ARGS
);
1305 if (i
== choose_args
.end()) {
1306 crush_choose_arg_map arg_map
;
1307 arg_map
.args
= NULL
;
1314 crush_choose_arg_map
choose_args_get(int64_t choose_args_index
) const {
1315 auto i
= choose_args
.find(choose_args_index
);
1316 if (i
== choose_args
.end()) {
1317 crush_choose_arg_map arg_map
;
1318 arg_map
.args
= NULL
;
1326 void destroy_choose_args(crush_choose_arg_map arg_map
) {
1327 for (__u32 i
= 0; i
< arg_map
.size
; i
++) {
1328 crush_choose_arg
*arg
= &arg_map
.args
[i
];
1329 for (__u32 j
= 0; j
< arg
->weight_set_size
; j
++) {
1330 crush_weight_set
*weight_set
= &arg
->weight_set
[j
];
1331 free(weight_set
->weights
);
1333 if (arg
->weight_set
)
1334 free(arg
->weight_set
);
1341 void create_choose_args(int64_t id
, int positions
) {
1342 if (choose_args
.count(id
))
1345 auto &cmap
= choose_args
[id
];
1346 cmap
.args
= (crush_choose_arg
*)calloc(sizeof(crush_choose_arg
),
1347 crush
->max_buckets
);
1348 cmap
.size
= crush
->max_buckets
;
1349 for (int bidx
=0; bidx
< crush
->max_buckets
; ++bidx
) {
1350 crush_bucket
*b
= crush
->buckets
[bidx
];
1351 auto &carg
= cmap
.args
[bidx
];
1354 if (b
&& b
->alg
== CRUSH_BUCKET_STRAW2
) {
1355 crush_bucket_straw2
*sb
= (crush_bucket_straw2
*)b
;
1356 carg
.weight_set_size
= positions
;
1357 carg
.weight_set
= (crush_weight_set
*)calloc(sizeof(crush_weight_set
),
1358 carg
.weight_set_size
);
1359 // initialize with canonical weights
1360 for (int pos
= 0; pos
< positions
; ++pos
) {
1361 carg
.weight_set
[pos
].size
= b
->size
;
1362 carg
.weight_set
[pos
].weights
= (__u32
*)calloc(4, b
->size
);
1363 for (unsigned i
= 0; i
< b
->size
; ++i
) {
1364 carg
.weight_set
[pos
].weights
[i
] = sb
->item_weights
[i
];
1368 carg
.weight_set
= NULL
;
1369 carg
.weight_set_size
= 0;
1374 void rm_choose_args(int64_t id
) {
1375 auto p
= choose_args
.find(id
);
1376 if (p
!= choose_args
.end()) {
1377 destroy_choose_args(p
->second
);
1378 choose_args
.erase(p
);
1382 void choose_args_clear() {
1383 for (auto w
: choose_args
)
1384 destroy_choose_args(w
.second
);
1385 choose_args
.clear();
1388 // adjust choose_args_map weight, preserving the hierarchical summation
1389 // property. used by callers optimizing layouts by tweaking weights.
1390 int _choose_args_adjust_item_weight_in_bucket(
1392 crush_choose_arg_map cmap
,
1395 const vector
<int>& weight
,
1397 int choose_args_adjust_item_weight(
1399 crush_choose_arg_map cmap
,
1400 int id
, const vector
<int>& weight
,
1402 int choose_args_adjust_item_weightf(
1404 crush_choose_arg_map cmap
,
1405 int id
, const vector
<double>& weightf
,
1407 vector
<int> weight(weightf
.size());
1408 for (unsigned i
= 0; i
< weightf
.size(); ++i
) {
1409 weight
[i
] = (int)(weightf
[i
] * (float)0x10000);
1411 return choose_args_adjust_item_weight(cct
, cmap
, id
, weight
, ss
);
1414 int get_choose_args_positions(crush_choose_arg_map cmap
) {
1415 // infer positions from other buckets
1416 for (unsigned j
= 0; j
< cmap
.size
; ++j
) {
1417 if (cmap
.args
[j
].weight_set_size
) {
1418 return cmap
.args
[j
].weight_set_size
;
1424 template<typename WeightVector
>
1425 void do_rule(int rule
, int x
, vector
<int>& out
, int maxout
,
1426 const WeightVector
& weight
,
1427 uint64_t choose_args_index
) const {
1429 char work
[crush_work_size(crush
, maxout
)];
1430 crush_init_workspace(crush
, work
);
1431 crush_choose_arg_map arg_map
= choose_args_get_with_fallback(
1433 int numrep
= crush_do_rule(crush
, rule
, x
, rawout
, maxout
, &weight
[0],
1434 weight
.size(), work
, arg_map
.args
);
1438 for (int i
=0; i
<numrep
; i
++)
1442 int _choose_type_stack(
1444 const vector
<pair
<int,int>>& stack
,
1445 const set
<int>& overfull
,
1446 const vector
<int>& underfull
,
1447 const vector
<int>& orig
,
1448 vector
<int>::const_iterator
& i
,
1450 vector
<int> *pw
) const;
1456 const set
<int>& overfull
,
1457 const vector
<int>& underfull
,
1458 const vector
<int>& orig
,
1459 vector
<int> *out
) const;
1461 bool check_crush_rule(int ruleset
, int type
, int size
, ostream
& ss
) {
1465 for (i
= 0; i
< crush
->max_rules
; i
++) {
1466 if (crush
->rules
[i
] &&
1467 crush
->rules
[i
]->mask
.ruleset
== ruleset
&&
1468 crush
->rules
[i
]->mask
.type
== type
) {
1470 if (crush
->rules
[i
]->mask
.min_size
<= size
&&
1471 crush
->rules
[i
]->mask
.max_size
>= size
) {
1473 } else if (size
< crush
->rules
[i
]->mask
.min_size
) {
1474 ss
<< "pool size is smaller than the crush rule min size";
1477 ss
<< "pool size is bigger than the crush rule max size";
1486 void encode(bufferlist
&bl
, uint64_t features
) const;
1487 void decode(bufferlist::iterator
&blp
);
1488 void decode_crush_bucket(crush_bucket
** bptr
, bufferlist::iterator
&blp
);
1489 void dump(Formatter
*f
) const;
1490 void dump_rules(Formatter
*f
) const;
1491 void dump_rule(int ruleset
, Formatter
*f
) const;
1492 void dump_tunables(Formatter
*f
) const;
1493 void dump_choose_args(Formatter
*f
) const;
1494 void list_rules(Formatter
*f
) const;
1495 void list_rules(ostream
*ss
) const;
1496 void dump_tree(ostream
*out
,
1498 const CrushTreeDumper::name_map_t
& ws
,
1499 bool show_shadow
= false) const;
1500 void dump_tree(ostream
*out
, Formatter
*f
) {
1501 dump_tree(out
, f
, CrushTreeDumper::name_map_t());
1503 void dump_tree(Formatter
*f
,
1504 const CrushTreeDumper::name_map_t
& ws
) const;
1505 static void generate_test_instances(list
<CrushWrapper
*>& o
);
1507 int get_osd_pool_default_crush_replicated_ruleset(CephContext
*cct
);
1509 static bool is_valid_crush_name(const string
& s
);
1510 static bool is_valid_crush_loc(CephContext
*cct
,
1511 const map
<string
,string
>& loc
);
1513 WRITE_CLASS_ENCODER_FEATURES(CrushWrapper
)