1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #ifndef CEPH_CRUSH_WRAPPER_H
5 #define CEPH_CRUSH_WRAPPER_H
14 #include "include/types.h"
23 #include "include/assert.h"
24 #include "include/err.h"
25 #include "include/encoding.h"
28 #include "common/Mutex.h"
30 #define BUG_ON(x) assert(!(x))
36 WRITE_RAW_ENCODER(crush_rule_mask
) // it's all u8's
38 inline static void encode(const crush_rule_step
&s
, bufferlist
&bl
)
44 inline static void decode(crush_rule_step
&s
, bufferlist::iterator
&p
)
54 std::map
<int32_t, string
> type_map
; /* bucket/device type names */
55 std::map
<int32_t, string
> name_map
; /* bucket/device names */
56 std::map
<int32_t, string
> rule_name_map
;
57 std::map
<int32_t, int32_t> class_map
; /* item id -> class id */
58 std::map
<int32_t, string
> class_name
; /* class id -> class name */
59 std::map
<string
, int32_t> class_rname
; /* class name -> class id */
60 std::map
<int32_t, map
<int32_t, int32_t> > class_bucket
; /* bucket[id][class] == id */
61 std::map
<uint64_t, crush_choose_arg_map
> choose_args
;
64 struct crush_map
*crush
;
66 bool have_uniform_rules
= false;
69 mutable bool have_rmaps
;
70 mutable std::map
<string
, int> type_rmap
, name_rmap
, rule_name_rmap
;
71 void build_rmaps() const {
72 if (have_rmaps
) return;
73 build_rmap(type_map
, type_rmap
);
74 build_rmap(name_map
, name_rmap
);
75 build_rmap(rule_name_map
, rule_name_rmap
);
78 void build_rmap(const map
<int, string
> &f
, std::map
<string
, int> &r
) const {
80 for (std::map
<int, string
>::const_iterator p
= f
.begin(); p
!= f
.end(); ++p
)
81 r
[p
->second
] = p
->first
;
85 CrushWrapper(const CrushWrapper
& other
);
86 const CrushWrapper
& operator=(const CrushWrapper
& other
);
88 CrushWrapper() : crush(0), have_rmaps(false) {
97 crush_map
*get_crush_map() { return crush
; }
102 crush_destroy(crush
);
103 crush
= crush_create();
108 set_tunables_default();
111 /// true if any rule has a ruleset != the rule id
112 bool has_legacy_rulesets() const;
114 /// fix rules whose ruleid != ruleset
115 int renumber_rules_by_ruleset();
117 /// true if any ruleset has more than 1 rule
118 bool has_multirule_rulesets() const;
121 void set_tunables_argonaut() {
122 crush
->choose_local_tries
= 2;
123 crush
->choose_local_fallback_tries
= 5;
124 crush
->choose_total_tries
= 19;
125 crush
->chooseleaf_descend_once
= 0;
126 crush
->chooseleaf_vary_r
= 0;
127 crush
->chooseleaf_stable
= 0;
128 crush
->allowed_bucket_algs
= CRUSH_LEGACY_ALLOWED_BUCKET_ALGS
;
130 void set_tunables_bobtail() {
131 crush
->choose_local_tries
= 0;
132 crush
->choose_local_fallback_tries
= 0;
133 crush
->choose_total_tries
= 50;
134 crush
->chooseleaf_descend_once
= 1;
135 crush
->chooseleaf_vary_r
= 0;
136 crush
->chooseleaf_stable
= 0;
137 crush
->allowed_bucket_algs
= CRUSH_LEGACY_ALLOWED_BUCKET_ALGS
;
139 void set_tunables_firefly() {
140 crush
->choose_local_tries
= 0;
141 crush
->choose_local_fallback_tries
= 0;
142 crush
->choose_total_tries
= 50;
143 crush
->chooseleaf_descend_once
= 1;
144 crush
->chooseleaf_vary_r
= 1;
145 crush
->chooseleaf_stable
= 0;
146 crush
->allowed_bucket_algs
= CRUSH_LEGACY_ALLOWED_BUCKET_ALGS
;
148 void set_tunables_hammer() {
149 crush
->choose_local_tries
= 0;
150 crush
->choose_local_fallback_tries
= 0;
151 crush
->choose_total_tries
= 50;
152 crush
->chooseleaf_descend_once
= 1;
153 crush
->chooseleaf_vary_r
= 1;
154 crush
->chooseleaf_stable
= 0;
155 crush
->allowed_bucket_algs
=
156 (1 << CRUSH_BUCKET_UNIFORM
) |
157 (1 << CRUSH_BUCKET_LIST
) |
158 (1 << CRUSH_BUCKET_STRAW
) |
159 (1 << CRUSH_BUCKET_STRAW2
);
161 void set_tunables_jewel() {
162 crush
->choose_local_tries
= 0;
163 crush
->choose_local_fallback_tries
= 0;
164 crush
->choose_total_tries
= 50;
165 crush
->chooseleaf_descend_once
= 1;
166 crush
->chooseleaf_vary_r
= 1;
167 crush
->chooseleaf_stable
= 1;
168 crush
->allowed_bucket_algs
=
169 (1 << CRUSH_BUCKET_UNIFORM
) |
170 (1 << CRUSH_BUCKET_LIST
) |
171 (1 << CRUSH_BUCKET_STRAW
) |
172 (1 << CRUSH_BUCKET_STRAW2
);
175 void set_tunables_legacy() {
176 set_tunables_argonaut();
177 crush
->straw_calc_version
= 0;
179 void set_tunables_optimal() {
180 set_tunables_jewel();
181 crush
->straw_calc_version
= 1;
183 void set_tunables_default() {
184 set_tunables_jewel();
185 crush
->straw_calc_version
= 1;
188 int get_choose_local_tries() const {
189 return crush
->choose_local_tries
;
191 void set_choose_local_tries(int n
) {
192 crush
->choose_local_tries
= n
;
195 int get_choose_local_fallback_tries() const {
196 return crush
->choose_local_fallback_tries
;
198 void set_choose_local_fallback_tries(int n
) {
199 crush
->choose_local_fallback_tries
= n
;
202 int get_choose_total_tries() const {
203 return crush
->choose_total_tries
;
205 void set_choose_total_tries(int n
) {
206 crush
->choose_total_tries
= n
;
209 int get_chooseleaf_descend_once() const {
210 return crush
->chooseleaf_descend_once
;
212 void set_chooseleaf_descend_once(int n
) {
213 crush
->chooseleaf_descend_once
= !!n
;
216 int get_chooseleaf_vary_r() const {
217 return crush
->chooseleaf_vary_r
;
219 void set_chooseleaf_vary_r(int n
) {
220 crush
->chooseleaf_vary_r
= n
;
223 int get_chooseleaf_stable() const {
224 return crush
->chooseleaf_stable
;
226 void set_chooseleaf_stable(int n
) {
227 crush
->chooseleaf_stable
= n
;
230 int get_straw_calc_version() const {
231 return crush
->straw_calc_version
;
233 void set_straw_calc_version(int n
) {
234 crush
->straw_calc_version
= n
;
237 unsigned get_allowed_bucket_algs() const {
238 return crush
->allowed_bucket_algs
;
240 void set_allowed_bucket_algs(unsigned n
) {
241 crush
->allowed_bucket_algs
= n
;
244 bool has_argonaut_tunables() const {
246 crush
->choose_local_tries
== 2 &&
247 crush
->choose_local_fallback_tries
== 5 &&
248 crush
->choose_total_tries
== 19 &&
249 crush
->chooseleaf_descend_once
== 0 &&
250 crush
->chooseleaf_vary_r
== 0 &&
251 crush
->chooseleaf_stable
== 0 &&
252 crush
->allowed_bucket_algs
== CRUSH_LEGACY_ALLOWED_BUCKET_ALGS
;
254 bool has_bobtail_tunables() const {
256 crush
->choose_local_tries
== 0 &&
257 crush
->choose_local_fallback_tries
== 0 &&
258 crush
->choose_total_tries
== 50 &&
259 crush
->chooseleaf_descend_once
== 1 &&
260 crush
->chooseleaf_vary_r
== 0 &&
261 crush
->chooseleaf_stable
== 0 &&
262 crush
->allowed_bucket_algs
== CRUSH_LEGACY_ALLOWED_BUCKET_ALGS
;
264 bool has_firefly_tunables() const {
266 crush
->choose_local_tries
== 0 &&
267 crush
->choose_local_fallback_tries
== 0 &&
268 crush
->choose_total_tries
== 50 &&
269 crush
->chooseleaf_descend_once
== 1 &&
270 crush
->chooseleaf_vary_r
== 1 &&
271 crush
->chooseleaf_stable
== 0 &&
272 crush
->allowed_bucket_algs
== CRUSH_LEGACY_ALLOWED_BUCKET_ALGS
;
274 bool has_hammer_tunables() const {
276 crush
->choose_local_tries
== 0 &&
277 crush
->choose_local_fallback_tries
== 0 &&
278 crush
->choose_total_tries
== 50 &&
279 crush
->chooseleaf_descend_once
== 1 &&
280 crush
->chooseleaf_vary_r
== 1 &&
281 crush
->chooseleaf_stable
== 0 &&
282 crush
->allowed_bucket_algs
== ((1 << CRUSH_BUCKET_UNIFORM
) |
283 (1 << CRUSH_BUCKET_LIST
) |
284 (1 << CRUSH_BUCKET_STRAW
) |
285 (1 << CRUSH_BUCKET_STRAW2
));
287 bool has_jewel_tunables() const {
289 crush
->choose_local_tries
== 0 &&
290 crush
->choose_local_fallback_tries
== 0 &&
291 crush
->choose_total_tries
== 50 &&
292 crush
->chooseleaf_descend_once
== 1 &&
293 crush
->chooseleaf_vary_r
== 1 &&
294 crush
->chooseleaf_stable
== 1 &&
295 crush
->allowed_bucket_algs
== ((1 << CRUSH_BUCKET_UNIFORM
) |
296 (1 << CRUSH_BUCKET_LIST
) |
297 (1 << CRUSH_BUCKET_STRAW
) |
298 (1 << CRUSH_BUCKET_STRAW2
));
301 bool has_optimal_tunables() const {
302 return has_jewel_tunables();
304 bool has_legacy_tunables() const {
305 return has_argonaut_tunables();
308 bool has_nondefault_tunables() const {
310 (crush
->choose_local_tries
!= 2 ||
311 crush
->choose_local_fallback_tries
!= 5 ||
312 crush
->choose_total_tries
!= 19);
314 bool has_nondefault_tunables2() const {
316 crush
->chooseleaf_descend_once
!= 0;
318 bool has_nondefault_tunables3() const {
320 crush
->chooseleaf_vary_r
!= 0;
322 bool has_nondefault_tunables5() const {
324 crush
->chooseleaf_stable
!= 0;
327 bool has_v2_rules() const;
328 bool has_v3_rules() const;
329 bool has_v4_buckets() const;
330 bool has_v5_rules() const;
331 bool has_choose_args() const; // any choose_args
332 bool has_incompat_choose_args() const; // choose_args that can't be made compat
334 bool is_v2_rule(unsigned ruleid
) const;
335 bool is_v3_rule(unsigned ruleid
) const;
336 bool is_v5_rule(unsigned ruleid
) const;
338 string
get_min_required_version() const {
339 if (has_v5_rules() || has_nondefault_tunables5())
341 else if (has_v4_buckets())
343 else if (has_nondefault_tunables3())
345 else if (has_nondefault_tunables2() || has_nondefault_tunables())
351 // default bucket types
352 unsigned get_default_bucket_alg() const {
353 // in order of preference
354 if (crush
->allowed_bucket_algs
& (1 << CRUSH_BUCKET_STRAW2
))
355 return CRUSH_BUCKET_STRAW2
;
356 if (crush
->allowed_bucket_algs
& (1 << CRUSH_BUCKET_STRAW
))
357 return CRUSH_BUCKET_STRAW
;
358 if (crush
->allowed_bucket_algs
& (1 << CRUSH_BUCKET_TREE
))
359 return CRUSH_BUCKET_TREE
;
360 if (crush
->allowed_bucket_algs
& (1 << CRUSH_BUCKET_LIST
))
361 return CRUSH_BUCKET_LIST
;
362 if (crush
->allowed_bucket_algs
& (1 << CRUSH_BUCKET_UNIFORM
))
363 return CRUSH_BUCKET_UNIFORM
;
368 int get_num_type_names() const {
369 return type_map
.size();
371 int get_max_type_id() const {
372 if (type_map
.empty())
374 return type_map
.rbegin()->first
;
376 int get_type_id(const string
& name
) const {
378 if (type_rmap
.count(name
))
379 return type_rmap
[name
];
382 const char *get_type_name(int t
) const {
383 std::map
<int,string
>::const_iterator p
= type_map
.find(t
);
384 if (p
!= type_map
.end())
385 return p
->second
.c_str();
388 void set_type_name(int i
, const string
& name
) {
395 bool name_exists(const string
& name
) const {
397 return name_rmap
.count(name
);
399 bool item_exists(int i
) const {
400 return name_map
.count(i
);
402 int get_item_id(const string
& name
) const {
404 if (name_rmap
.count(name
))
405 return name_rmap
[name
];
408 const char *get_item_name(int t
) const {
409 std::map
<int,string
>::const_iterator p
= name_map
.find(t
);
410 if (p
!= name_map
.end())
411 return p
->second
.c_str();
414 int set_item_name(int i
, const string
& name
) {
415 if (!is_valid_crush_name(name
))
422 void swap_names(int a
, int b
) {
423 string an
= name_map
[a
];
424 string bn
= name_map
[b
];
432 bool id_has_class(int i
) {
435 if (split_id_class(i
, &idout
, &classout
) != 0)
437 return classout
!= -1;
439 int split_id_class(int i
, int *idout
, int *classout
) const;
441 bool class_exists(const string
& name
) const {
442 return class_rname
.count(name
);
444 const char *get_class_name(int i
) const {
445 std::map
<int,string
>::const_iterator p
= class_name
.find(i
);
446 if (p
!= class_name
.end())
447 return p
->second
.c_str();
450 int get_class_id(const string
& name
) const {
451 std::map
<string
,int>::const_iterator p
= class_rname
.find(name
);
452 if (p
!= class_rname
.end())
457 int remove_class_name(const string
& name
) {
458 std::map
<string
,int>::const_iterator p
= class_rname
.find(name
);
459 if (p
== class_rname
.end())
461 int class_id
= p
->second
;
462 std::map
<int,string
>::const_iterator q
= class_name
.find(class_id
);
463 if (q
== class_name
.end())
465 class_rname
.erase(name
);
466 class_name
.erase(class_id
);
469 int get_or_create_class_id(const string
& name
) {
470 int c
= get_class_id(name
);
472 int i
= class_name
.size();
473 class_name
[i
] = name
;
474 class_rname
[name
] = i
;
481 const char *get_item_class(int t
) const {
482 std::map
<int,int>::const_iterator p
= class_map
.find(t
);
483 if (p
== class_map
.end())
485 return get_class_name(p
->second
);
487 int set_item_class(int i
, const string
& name
) {
488 if (!is_valid_crush_name(name
))
490 class_map
[i
] = get_or_create_class_id(name
);
493 int set_item_class(int i
, int c
) {
498 int can_rename_item(const string
& srcname
,
499 const string
& dstname
,
501 int rename_item(const string
& srcname
,
502 const string
& dstname
,
504 int can_rename_bucket(const string
& srcname
,
505 const string
& dstname
,
507 int rename_bucket(const string
& srcname
,
508 const string
& dstname
,
512 bool rule_exists(string name
) const {
514 return rule_name_rmap
.count(name
);
516 int get_rule_id(string name
) const {
518 if (rule_name_rmap
.count(name
))
519 return rule_name_rmap
[name
];
522 const char *get_rule_name(int t
) const {
523 std::map
<int,string
>::const_iterator p
= rule_name_map
.find(t
);
524 if (p
!= rule_name_map
.end())
525 return p
->second
.c_str();
528 void set_rule_name(int i
, const string
& name
) {
529 rule_name_map
[i
] = name
;
531 rule_name_rmap
[name
] = i
;
536 * find tree nodes referenced by rules by a 'take' command
538 * Note that these may not be parentless roots.
540 void find_takes(set
<int>& roots
) const;
545 * These are parentless nodes in the map.
547 void find_roots(set
<int>& roots
) const;
550 * see if an item is contained within a subtree
552 * @param root haystack
554 * @return true if the item is located beneath the given node
556 bool subtree_contains(int root
, int item
) const;
560 * search for an item in any bucket
563 * @return true if present
565 bool _search_item_exists(int i
) const;
569 * see if item is located where we think it is
571 * This verifies that the given item is located at a particular
572 * location in the hierarchy. However, that check is imprecise; we
573 * are actually verifying that the most specific location key/value
574 * is correct. For example, if loc specifies that rack=foo and
575 * host=bar, it will verify that host=bar is correct; any placement
576 * above that level in the hierarchy is ignored. This matches the
577 * semantics for insert_item().
580 * @param item item id
581 * @param loc location to check (map of type to bucket names)
582 * @param weight optional pointer to weight of item at that location
583 * @return true if item is at specified location
585 bool check_item_loc(CephContext
*cct
, int item
, const map
<string
,string
>& loc
, int *iweight
);
586 bool check_item_loc(CephContext
*cct
, int item
, const map
<string
,string
>& loc
, float *weight
) {
588 bool ret
= check_item_loc(cct
, item
, loc
, &iweight
);
590 *weight
= (float)iweight
/ (float)0x10000;
596 * returns the (type, name) of the parent bucket of id
598 * FIXME: ambiguous for items that occur multiple times in the map
600 pair
<string
,string
> get_immediate_parent(int id
, int *ret
= NULL
);
601 int get_immediate_parent_id(int id
, int *parent
) const;
604 * return ancestor of the given type, or 0 if none
605 * (parent is always a bucket and thus <0)
607 int get_parent_of_type(int id
, int type
) const;
610 * get the fully qualified location of a device by successively finding
611 * parents beginning at ID and ending at highest type number specified in
612 * the CRUSH map which assumes that if device foo is under device bar, the
613 * type_id of foo < bar where type_id is the integer specified in the CRUSH map
615 * returns the location in the form of (type=foo) where type is a type of bucket
616 * specified in the CRUSH map and foo is a name specified in the CRUSH map
618 map
<string
, string
> get_full_location(int id
);
621 * identical to get_full_location(int id) although it returns the type/name
622 * pairs in the order they occur in the hierarchy.
624 * returns -ENOENT if id is not found.
626 int get_full_location_ordered(int id
, vector
<pair
<string
, string
> >& path
);
629 * identical to get_full_location_ordered(int id, vector<pair<string, string> >& path),
630 * although it returns a concatenated string with the type/name pairs in descending
631 * hierarchical order with format key1=val1,key2=val2.
633 * returns the location in descending hierarchy as a string.
635 string
get_full_location_ordered_string(int id
);
638 * returns (type_id, type) of all parent buckets between id and
639 * default, can be used to check for anomalous CRUSH maps
641 map
<int, string
> get_parent_hierarchy(int id
);
644 * enumerate immediate children of given node
646 * @param id parent bucket or device id
647 * @return number of items, or error
649 int get_children(int id
, list
<int> *children
);
652 * enumerate leaves(devices) of given node
654 * @param name parent bucket name
655 * @return 0 on success or a negative errno on error.
657 int get_leaves(const string
&name
, set
<int> *leaves
);
658 int _get_leaves(int id
, list
<int> *leaves
); // worker
661 * insert an item into the map at a specific position
663 * Add an item at a specific location of the hierarchy.
664 * Specifically, we look for the most specific location constraint
665 * for which a bucket already exists, and then create intervening
666 * buckets beneath that in order to place the item.
668 * Note that any location specifiers *above* the most specific match
669 * are ignored. For example, if we specify that osd.12 goes in
670 * host=foo, rack=bar, and row=baz, and rack=bar is the most
671 * specific match, we will create host=foo beneath that point and
672 * put osd.12 inside it. However, we will not verify that rack=bar
673 * is beneath row=baz or move it.
675 * In short, we will build out a hierarchy, and move leaves around,
676 * but not adjust the hierarchy's internal structure. Yet.
678 * If the item is already present in the map, we will return EEXIST.
679 * If the location key/value pairs are nonsensical
680 * (rack=nameofdevice), or location specifiers that do not attach us
681 * to any existing part of the hierarchy, we will return EINVAL.
685 * @param weight item weight
686 * @param name item name
687 * @param loc location (map of type to bucket names)
688 * @return 0 for success, negative on error
690 int insert_item(CephContext
*cct
, int id
, float weight
, string name
, const map
<string
,string
>& loc
);
693 * move a bucket in the hierarchy to the given location
695 * This has the same location and ancestor creation behavior as
696 * insert_item(), but will relocate the specified existing bucket.
699 * @param id bucket id
700 * @param loc location (map of type to bucket names)
701 * @return 0 for success, negative on error
703 int move_bucket(CephContext
*cct
, int id
, const map
<string
,string
>& loc
);
706 * swap bucket contents of two buckets without touching bucket ids
709 * @param src bucket a
710 * @param dst bucket b
711 * @return 0 for success, negative on error
713 int swap_bucket(CephContext
*cct
, int src
, int dst
);
716 * add a link to an existing bucket in the hierarchy to the new location
718 * This has the same location and ancestor creation behavior as
719 * insert_item(), but will add a new link to the specified existing
723 * @param id bucket id
724 * @param loc location (map of type to bucket names)
725 * @return 0 for success, negative on error
727 int link_bucket(CephContext
*cct
, int id
, const map
<string
,string
>& loc
);
730 * add or update an item's position in the map
732 * This is analogous to insert_item, except we will move an item if
733 * it is already present.
737 * @param weight item weight
738 * @param name item name
739 * @param loc location (map of type to bucket names)
740 * @return 0 for no change, 1 for successful change, negative on error
742 int update_item(CephContext
*cct
, int id
, float weight
, string name
, const map
<string
,string
>& loc
);
745 * create or move an item, but do not adjust its weight if it already exists
748 * @param item item id
749 * @param weight initial item weight (if we need to create it)
750 * @param name item name
751 * @param loc location (map of type to bucket names)
752 * @return 0 for no change, 1 for successful change, negative on error
754 int create_or_move_item(CephContext
*cct
, int item
, float weight
, string name
,
755 const map
<string
,string
>& loc
);
758 * remove all instances of an item from the map
761 * @param id item id to remove
762 * @param unlink_only unlink but do not remove bucket (useful if multiple links or not empty)
763 * @return 0 on success, negative on error
765 int remove_item(CephContext
*cct
, int id
, bool unlink_only
);
768 * recursively remove buckets starting at item and stop removing
769 * when a bucket is in use.
771 * @param item id to remove
772 * @param unused true if only unused items should be removed
773 * @return 0 on success, negative on error
775 int remove_root(int item
, bool unused
);
778 * remove all instances of an item nested beneath a certain point from the map
781 * @param id item id to remove
782 * @param ancestor ancestor item id under which to search for id
783 * @param unlink_only unlink but do not remove bucket (useful if bucket has multiple links or is not empty)
784 * @return 0 on success, negative on error
787 bool _maybe_remove_last_instance(CephContext
*cct
, int id
, bool unlink_only
);
788 int _remove_item_under(CephContext
*cct
, int id
, int ancestor
, bool unlink_only
);
789 bool _bucket_is_in_use(int id
);
791 int remove_item_under(CephContext
*cct
, int id
, int ancestor
, bool unlink_only
);
794 * calculate the locality/distance from a given id to a crush location map
796 * Specifically, we look for the lowest-valued type for which the
797 * location of id matches that described in loc.
800 * @param id the existing id in the map
801 * @param loc a set of key=value pairs describing a location in the hierarchy
803 int get_common_ancestor_distance(CephContext
*cct
, int id
,
804 const std::multimap
<string
,string
>& loc
);
807 * parse a set of key/value pairs out of a string vector
809 * These are used to describe a location in the CRUSH hierarchy.
811 * @param args list of strings (each key= or key=value)
812 * @param ploc pointer to a resulting location map or multimap
814 static int parse_loc_map(const std::vector
<string
>& args
,
815 std::map
<string
,string
> *ploc
);
816 static int parse_loc_multimap(const std::vector
<string
>& args
,
817 std::multimap
<string
,string
> *ploc
);
820 * get an item's weight
822 * Will return the weight for the first instance it finds.
824 * @param id item id to check
825 * @return weight of item
827 int get_item_weight(int id
) const;
828 float get_item_weightf(int id
) const {
829 return (float)get_item_weight(id
) / (float)0x10000;
831 int get_item_weight_in_loc(int id
, const map
<string
,string
> &loc
);
832 float get_item_weightf_in_loc(int id
, const map
<string
,string
> &loc
) {
833 return (float)get_item_weight_in_loc(id
, loc
) / (float)0x10000;
836 int adjust_item_weight(CephContext
*cct
, int id
, int weight
);
837 int adjust_item_weightf(CephContext
*cct
, int id
, float weight
) {
838 return adjust_item_weight(cct
, id
, (int)(weight
* (float)0x10000));
840 int adjust_item_weight_in_loc(CephContext
*cct
, int id
, int weight
, const map
<string
,string
>& loc
);
841 int adjust_item_weightf_in_loc(CephContext
*cct
, int id
, float weight
, const map
<string
,string
>& loc
) {
842 return adjust_item_weight_in_loc(cct
, id
, (int)(weight
* (float)0x10000), loc
);
844 void reweight(CephContext
*cct
);
846 int adjust_subtree_weight(CephContext
*cct
, int id
, int weight
);
847 int adjust_subtree_weightf(CephContext
*cct
, int id
, float weight
) {
848 return adjust_subtree_weight(cct
, id
, (int)(weight
* (float)0x10000));
851 /// check if item id is present in the map hierarchy
852 bool check_item_present(int id
) const;
856 int get_max_devices() const {
857 if (!crush
) return 0;
858 return crush
->max_devices
;
864 crush_rule
*get_rule(unsigned ruleno
) const {
865 if (!crush
) return (crush_rule
*)(-ENOENT
);
866 if (ruleno
>= crush
->max_rules
)
868 return crush
->rules
[ruleno
];
870 crush_rule_step
*get_rule_step(unsigned ruleno
, unsigned step
) const {
871 crush_rule
*n
= get_rule(ruleno
);
872 if (IS_ERR(n
)) return (crush_rule_step
*)(-EINVAL
);
873 if (step
>= n
->len
) return (crush_rule_step
*)(-EINVAL
);
874 return &n
->steps
[step
];
879 int get_max_rules() const {
880 if (!crush
) return 0;
881 return crush
->max_rules
;
883 bool rule_exists(unsigned ruleno
) const {
884 if (!crush
) return false;
885 if (ruleno
< crush
->max_rules
&&
886 crush
->rules
[ruleno
] != NULL
)
890 int get_rule_len(unsigned ruleno
) const {
891 crush_rule
*r
= get_rule(ruleno
);
892 if (IS_ERR(r
)) return PTR_ERR(r
);
895 int get_rule_mask_ruleset(unsigned ruleno
) const {
896 crush_rule
*r
= get_rule(ruleno
);
897 if (IS_ERR(r
)) return -1;
898 return r
->mask
.ruleset
;
900 int get_rule_mask_type(unsigned ruleno
) const {
901 crush_rule
*r
= get_rule(ruleno
);
902 if (IS_ERR(r
)) return -1;
905 int get_rule_mask_min_size(unsigned ruleno
) const {
906 crush_rule
*r
= get_rule(ruleno
);
907 if (IS_ERR(r
)) return -1;
908 return r
->mask
.min_size
;
910 int get_rule_mask_max_size(unsigned ruleno
) const {
911 crush_rule
*r
= get_rule(ruleno
);
912 if (IS_ERR(r
)) return -1;
913 return r
->mask
.max_size
;
915 int get_rule_op(unsigned ruleno
, unsigned step
) const {
916 crush_rule_step
*s
= get_rule_step(ruleno
, step
);
917 if (IS_ERR(s
)) return PTR_ERR(s
);
920 int get_rule_arg1(unsigned ruleno
, unsigned step
) const {
921 crush_rule_step
*s
= get_rule_step(ruleno
, step
);
922 if (IS_ERR(s
)) return PTR_ERR(s
);
925 int get_rule_arg2(unsigned ruleno
, unsigned step
) const {
926 crush_rule_step
*s
= get_rule_step(ruleno
, step
);
927 if (IS_ERR(s
)) return PTR_ERR(s
);
932 * calculate a map of osds to weights for a given rule
934 * Generate a map of which OSDs get how much relative weight for a
937 * @param ruleno [in] rule id
938 * @param pmap [out] map of osd to weight
939 * @return 0 for success, or negative error code
941 int get_rule_weight_osd_map(unsigned ruleno
, map
<int,float> *pmap
);
944 int add_rule(int len
, int ruleset
, int type
, int minsize
, int maxsize
, int ruleno
) {
945 if (!crush
) return -ENOENT
;
946 crush_rule
*n
= crush_make_rule(len
, ruleset
, type
, minsize
, maxsize
);
948 ruleno
= crush_add_rule(crush
, n
, ruleno
);
951 int set_rule_mask_max_size(unsigned ruleno
, int max_size
) {
952 crush_rule
*r
= get_rule(ruleno
);
953 if (IS_ERR(r
)) return -1;
954 return r
->mask
.max_size
= max_size
;
956 int set_rule_step(unsigned ruleno
, unsigned step
, int op
, int arg1
, int arg2
) {
957 if (!crush
) return -ENOENT
;
958 crush_rule
*n
= get_rule(ruleno
);
960 crush_rule_set_step(n
, step
, op
, arg1
, arg2
);
963 int set_rule_step_take(unsigned ruleno
, unsigned step
, int val
) {
964 return set_rule_step(ruleno
, step
, CRUSH_RULE_TAKE
, val
, 0);
966 int set_rule_step_set_choose_tries(unsigned ruleno
, unsigned step
, int val
) {
967 return set_rule_step(ruleno
, step
, CRUSH_RULE_SET_CHOOSE_TRIES
, val
, 0);
969 int set_rule_step_set_choose_local_tries(unsigned ruleno
, unsigned step
, int val
) {
970 return set_rule_step(ruleno
, step
, CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES
, val
, 0);
972 int set_rule_step_set_choose_local_fallback_tries(unsigned ruleno
, unsigned step
, int val
) {
973 return set_rule_step(ruleno
, step
, CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES
, val
, 0);
975 int set_rule_step_set_chooseleaf_tries(unsigned ruleno
, unsigned step
, int val
) {
976 return set_rule_step(ruleno
, step
, CRUSH_RULE_SET_CHOOSELEAF_TRIES
, val
, 0);
978 int set_rule_step_set_chooseleaf_vary_r(unsigned ruleno
, unsigned step
, int val
) {
979 return set_rule_step(ruleno
, step
, CRUSH_RULE_SET_CHOOSELEAF_VARY_R
, val
, 0);
981 int set_rule_step_set_chooseleaf_stable(unsigned ruleno
, unsigned step
, int val
) {
982 return set_rule_step(ruleno
, step
, CRUSH_RULE_SET_CHOOSELEAF_STABLE
, val
, 0);
984 int set_rule_step_choose_firstn(unsigned ruleno
, unsigned step
, int val
, int type
) {
985 return set_rule_step(ruleno
, step
, CRUSH_RULE_CHOOSE_FIRSTN
, val
, type
);
987 int set_rule_step_choose_indep(unsigned ruleno
, unsigned step
, int val
, int type
) {
988 return set_rule_step(ruleno
, step
, CRUSH_RULE_CHOOSE_INDEP
, val
, type
);
990 int set_rule_step_choose_leaf_firstn(unsigned ruleno
, unsigned step
, int val
, int type
) {
991 return set_rule_step(ruleno
, step
, CRUSH_RULE_CHOOSELEAF_FIRSTN
, val
, type
);
993 int set_rule_step_choose_leaf_indep(unsigned ruleno
, unsigned step
, int val
, int type
) {
994 return set_rule_step(ruleno
, step
, CRUSH_RULE_CHOOSELEAF_INDEP
, val
, type
);
996 int set_rule_step_emit(unsigned ruleno
, unsigned step
) {
997 return set_rule_step(ruleno
, step
, CRUSH_RULE_EMIT
, 0, 0);
1000 int add_simple_rule(
1001 string name
, string root_name
, string failure_domain_type
,
1002 string mode
, int rule_type
, ostream
*err
= 0);
1005 * @param rno rule[set] id to use, -1 to pick the lowest available
1007 int add_simple_rule_at(
1008 string name
, string root_name
,
1009 string failure_domain_type
, string mode
,
1010 int rule_type
, int rno
, ostream
*err
= 0);
1012 int remove_rule(int ruleno
);
1017 const crush_bucket
*get_bucket(int id
) const {
1019 return (crush_bucket
*)(-EINVAL
);
1020 unsigned int pos
= (unsigned int)(-1 - id
);
1021 unsigned int max_buckets
= crush
->max_buckets
;
1022 if (pos
>= max_buckets
)
1023 return (crush_bucket
*)(-ENOENT
);
1024 crush_bucket
*ret
= crush
->buckets
[pos
];
1026 return (crush_bucket
*)(-ENOENT
);
1029 crush_bucket
*get_bucket(int id
) {
1031 return (crush_bucket
*)(-EINVAL
);
1032 unsigned int pos
= (unsigned int)(-1 - id
);
1033 unsigned int max_buckets
= crush
->max_buckets
;
1034 if (pos
>= max_buckets
)
1035 return (crush_bucket
*)(-ENOENT
);
1036 crush_bucket
*ret
= crush
->buckets
[pos
];
1038 return (crush_bucket
*)(-ENOENT
);
1042 * detach a bucket from its parent and adjust the parent weight
1044 * returns the weight of the detached bucket
1046 int detach_bucket(CephContext
*cct
, int item
){
1053 // check that the bucket that we want to detach exists
1054 assert(bucket_exists(item
));
1056 // get the bucket's weight
1057 crush_bucket
*b
= get_bucket(item
);
1058 unsigned bucket_weight
= b
->weight
;
1060 // get where the bucket is located
1061 pair
<string
, string
> bucket_location
= get_immediate_parent(item
);
1063 // get the id of the parent bucket
1064 int parent_id
= get_item_id(bucket_location
.second
);
1066 // get the parent bucket
1067 crush_bucket
*parent_bucket
= get_bucket(parent_id
);
1069 if (!IS_ERR(parent_bucket
)) {
1070 // zero out the bucket weight
1071 bucket_adjust_item_weight(cct
, parent_bucket
, item
, 0);
1072 adjust_item_weight(cct
, parent_bucket
->id
, parent_bucket
->weight
);
1074 // remove the bucket from the parent
1075 bucket_remove_item(parent_bucket
, item
);
1076 } else if (PTR_ERR(parent_bucket
) != -ENOENT
) {
1077 return PTR_ERR(parent_bucket
);
1080 // check that we're happy
1081 int test_weight
= 0;
1082 map
<string
,string
> test_location
;
1083 test_location
[ bucket_location
.first
] = (bucket_location
.second
);
1085 bool successful_detach
= !(check_item_loc(cct
, item
, test_location
, &test_weight
));
1086 assert(successful_detach
);
1087 assert(test_weight
== 0);
1089 return bucket_weight
;
1093 int get_max_buckets() const {
1094 if (!crush
) return -EINVAL
;
1095 return crush
->max_buckets
;
1097 int get_next_bucket_id() const {
1098 if (!crush
) return -EINVAL
;
1099 return crush_get_next_bucket_id(crush
);
1101 bool bucket_exists(int id
) const {
1102 const crush_bucket
*b
= get_bucket(id
);
1107 int get_bucket_weight(int id
) const {
1108 const crush_bucket
*b
= get_bucket(id
);
1109 if (IS_ERR(b
)) return PTR_ERR(b
);
1112 float get_bucket_weightf(int id
) const {
1113 const crush_bucket
*b
= get_bucket(id
);
1114 if (IS_ERR(b
)) return 0;
1115 return b
->weight
/ (float)0x10000;
1117 int get_bucket_type(int id
) const {
1118 const crush_bucket
*b
= get_bucket(id
);
1119 if (IS_ERR(b
)) return PTR_ERR(b
);
/// Look up bucket `id`; if get_bucket() hands back an error pointer, return
/// the error code carried in it (PTR_ERR).
/// NOTE(review): the success-path return is not visible in this extract;
/// presumably the bucket's alg field -- confirm.
1122 int get_bucket_alg(int id
) const {
1123 const crush_bucket
*b
= get_bucket(id
);
1124 if (IS_ERR(b
)) return PTR_ERR(b
);
/// Look up bucket `id`; if get_bucket() hands back an error pointer, return
/// the error code carried in it (PTR_ERR).
/// NOTE(review): the success-path return is not visible in this extract;
/// presumably the bucket's hash field -- confirm.
1127 int get_bucket_hash(int id
) const {
1128 const crush_bucket
*b
= get_bucket(id
);
1129 if (IS_ERR(b
)) return PTR_ERR(b
);
/// Look up bucket `id`; if get_bucket() hands back an error pointer, return
/// the error code carried in it (PTR_ERR).
/// NOTE(review): the success-path return is not visible in this extract;
/// presumably the bucket's size field -- confirm.
1132 int get_bucket_size(int id
) const {
1133 const crush_bucket
*b
= get_bucket(id
);
1134 if (IS_ERR(b
)) return PTR_ERR(b
);
/// Return the item stored at index `pos` of bucket `id` (b->items[pos]).
/// A get_bucket() error pointer is reported as its PTR_ERR code.
/// NOTE(review): the statement executed when `pos` is out of range is not
/// visible in this extract.
1137 int get_bucket_item(int id
, int pos
) const {
1138 const crush_bucket
*b
= get_bucket(id
);
1139 if (IS_ERR(b
)) return PTR_ERR(b
);
// The cast to __u32 makes a negative `pos` compare as a huge value, so one
// comparison rejects both negative and too-large indices.
1140 if ((__u32
)pos
>= b
->size
)
1142 return b
->items
[pos
];
1144 int get_bucket_item_weight(int id
, int pos
) const {
1145 const crush_bucket
*b
= get_bucket(id
);
1146 if (IS_ERR(b
)) return PTR_ERR(b
);
1147 return crush_get_bucket_item_weight(b
, pos
);
1149 float get_bucket_item_weightf(int id
, int pos
) const {
1150 const crush_bucket
*b
= get_bucket(id
);
1151 if (IS_ERR(b
)) return 0;
1152 return (float)crush_get_bucket_item_weight(b
, pos
) / (float)0x10000;
/// Build a bucket with crush_make_bucket() from the given algorithm, hash,
/// type, size, items and weights, then register it under `bucketno` with
/// crush_add_bucket() and return that call's result. `idout` is forwarded
/// to crush_add_bucket().
/// NOTE(review): the condition guarding the get_default_bucket_alg()
/// fallback and any validation of `alg` / `b` are not visible in this
/// extract.
1156 int add_bucket(int bucketno
, int alg
, int hash
, int type
, int size
,
1157 int *items
, int *weights
, int *idout
) {
1159 alg
= get_default_bucket_alg();
1163 crush_bucket
*b
= crush_make_bucket(crush
, alg
, hash
, type
, size
, items
, weights
);
1165 return crush_add_bucket(crush
, bucketno
, b
, idout
);
// Bucket item mutators. Declarations only: the definitions are not part of
// this extract, so return conventions must be confirmed there.
/// Presumably adds `item` with `weight` to `bucket` -- confirm.
1168 int bucket_add_item(crush_bucket
*bucket
, int item
, int weight
);
/// Presumably removes `item` from `bucket` -- confirm.
1169 int bucket_remove_item(struct crush_bucket
*bucket
, int item
);
/// Presumably sets the weight of `item` inside `bucket` to `weight` -- confirm.
1170 int bucket_adjust_item_weight(CephContext
*cct
, struct crush_bucket
*bucket
, int item
, int weight
);
1174 crush_finalize(crush
);
1175 have_uniform_rules
= !has_legacy_rulesets();
// Device-class maintenance entry points. Declarations only: the definitions
// are not part of this extract, so semantics below are hedged.
/// NOTE(review): the parameter `class_name` has the same name as the
/// class_name member map and will shadow it inside the definition.
1178 int update_device_class(CephContext
*cct
, int id
, const string
& class_name
, const string
& name
);
/// Presumably clones bucket `original` for `device_class` and reports the
/// new id through `clone` -- confirm against the definition.
1179 int device_class_clone(int original
, int device_class
, int *clone
);
1180 bool class_is_in_use(int class_id
);
1181 int populate_classes();
1182 int rebuild_roots_with_classes();
1183 /* remove unused roots generated for class devices */
1184 int trim_roots_with_class(bool unused
);
1185 int cleanup_classes();
1187 void start_choose_profile() {
1188 free(crush
->choose_tries
);
1190 * the original choose_total_tries value was off by one (it
1191 * counted "retries" and not "tries"). add one to alloc.
1193 crush
->choose_tries
= (__u32
*)malloc(sizeof(*crush
->choose_tries
) * (crush
->choose_total_tries
+ 1));
1194 memset(crush
->choose_tries
, 0,
1195 sizeof(*crush
->choose_tries
) * (crush
->choose_total_tries
+ 1));
1197 void stop_choose_profile() {
1198 free(crush
->choose_tries
);
1199 crush
->choose_tries
= 0;
/// When crush->choose_tries is set, expose it through `*vec` and return
/// crush->choose_total_tries. The pointer handed out is the map's own
/// array, not a copy.
/// NOTE(review): the return for the case where choose_tries is null is not
/// visible in this extract.
1202 int get_choose_profile(__u32
**vec
) {
1203 if (crush
->choose_tries
) {
1204 *vec
= crush
->choose_tries
;
1205 return crush
->choose_total_tries
;
1211 void set_max_devices(int m
) {
1212 crush
->max_devices
= m
;
/// Resolve a rule for (`ruleset`, `type`, `size`).
/// Returns -1 when no crush map is attached. When have_uniform_rules is
/// false the lookup is delegated to crush_find_rule(); otherwise slot
/// `ruleset` of crush->rules is inspected directly.
/// NOTE(review): the returns of the direct-lookup path are not visible in
/// this extract. In the visible condition a negative `ruleset` passes the
/// `< max_rules` test and would index rules[] below zero -- confirm whether
/// a guard exists.
1215 int find_rule(int ruleset
, int type
, int size
) const {
1216 if (!crush
) return -1;
1217 if (!have_uniform_rules
) {
1218 return crush_find_rule(crush
, ruleset
, type
, size
);
1220 if (ruleset
< (int)crush
->max_rules
&&
1221 crush
->rules
[ruleset
])
/// Scan every rule slot (0 .. crush->max_rules - 1) for an existing rule
/// whose mask.ruleset equals `ruleset`.
/// NOTE(review): the return statements are not visible in this extract.
1227 bool ruleset_exists(int const ruleset
) const {
1228 for (size_t i
= 0; i
< crush
->max_rules
; ++i
) {
1229 if (rule_exists(i
) && crush
->rules
[i
]->mask
.ruleset
== ruleset
) {
// The loop visits every rule slot and keeps the smallest mask.ruleset among
// rules whose mask.type equals `type`; -1 in `result` is treated as
// "nothing found yet".
// NOTE(review): the declaration of `result`, the first half of the match
// condition and the return statement are not visible in this extract.
1238 * Return the lowest numbered ruleset of type `type`
1240 * @returns a ruleset ID, or -1 if no matching rulesets found.
1242 int find_first_ruleset(int type
) const {
1245 for (size_t i
= 0; i
< crush
->max_rules
; ++i
) {
1247 && crush
->rules
[i
]->mask
.type
== type
1248 && (crush
->rules
[i
]->mask
.ruleset
< result
|| result
== -1)) {
1249 result
= crush
->rules
[i
]->mask
.ruleset
;
/// Look up `choose_args_index` in the choose_args map and return a
/// crush_choose_arg_map by value. When the index is absent, a local map
/// with args == NULL is prepared.
/// NOTE(review): the rest of the not-found branch and the found-path
/// return are not visible in this extract.
1256 crush_choose_arg_map
choose_args_get(uint64_t choose_args_index
) const {
1257 auto i
= choose_args
.find(choose_args_index
);
1258 if (i
== choose_args
.end()) {
1259 crush_choose_arg_map arg_map
;
1260 arg_map
.args
= NULL
;
/// Free the heap storage referenced by `arg_map`: for each of its
/// arg_map.size entries, every weight_set[j].weights array and then the
/// weight_set array itself.
/// `arg_map` is passed by value, so the caller's copy still holds the same
/// (now freed) pointers afterwards.
/// NOTE(review): the tail of this function is not visible in this extract,
/// so further frees may follow.
1268 void destroy_choose_args(crush_choose_arg_map arg_map
) {
1269 for (__u32 i
= 0; i
< arg_map
.size
; i
++) {
1270 crush_choose_arg
*arg
= &arg_map
.args
[i
];
1271 for (__u32 j
= 0; j
< arg
->weight_set_size
; j
++) {
1272 crush_weight_set
*weight_set
= &arg
->weight_set
[j
];
1273 free(weight_set
->weights
);
// free(NULL) is a no-op, so this null test is redundant but harmless.
1275 if (arg
->weight_set
)
1276 free(arg
->weight_set
);
1283 void choose_args_clear() {
1284 for (auto w
: choose_args
)
1285 destroy_choose_args(w
.second
);
1286 choose_args
.clear();
/// Run crush_do_rule() for rule `rule` and input `x`, asking for up to
/// `maxout` results, with `weight` as the weight vector and the choose_args
/// selected by `choose_args_index` (via choose_args_get()).
/// WeightVector must provide operator[] and size(), as used below.
/// NOTE(review): the declaration of `rawout`, the handling of `numrep` and
/// the body of the copy loop into `out` are not visible in this extract.
1289 template<typename WeightVector
>
1290 void do_rule(int rule
, int x
, vector
<int>& out
, int maxout
,
1291 const WeightVector
& weight
,
1292 uint64_t choose_args_index
) const {
// `work` is sized at run time by crush_work_size(), i.e. a variable-length
// array, which is a compiler extension in C++.
1294 char work
[crush_work_size(crush
, maxout
)];
1295 crush_init_workspace(crush
, work
);
1296 crush_choose_arg_map arg_map
= choose_args_get(choose_args_index
);
// NOTE(review): &weight[0] assumes `weight` is non-empty -- confirm callers.
1297 int numrep
= crush_do_rule(crush
, rule
, x
, rawout
, maxout
, &weight
[0],
1298 weight
.size(), work
, arg_map
.args
);
1302 for (int i
=0; i
<numrep
; i
++)
/// Declaration only; the definition is not part of this extract.
/// NOTE(review): some parameters of this declaration are not visible in
/// this extract, so the list below is incomplete. `i` is taken by
/// non-const reference, so the definition may advance the caller's
/// iterator -- confirm.
1306 int _choose_type_stack(
1308 const vector
<pair
<int,int>>& stack
,
1309 const set
<int>& overfull
,
1310 const vector
<int>& underfull
,
1311 const vector
<int>& orig
,
1312 vector
<int>::const_iterator
& i
,
1314 vector
<int> *pw
) const;
1320 const set
<int>& overfull
,
1321 const vector
<int>& underfull
,
1322 const vector
<int>& orig
,
1323 vector
<int> *out
) const;
/// Validate a pool `size` against the rule matching (`ruleset`, `type`).
/// Every rule slot is scanned; for a matching rule, `size` is compared with
/// the rule mask's [min_size, max_size] range and a human-readable reason
/// is streamed into `ss` when it falls outside that range.
/// NOTE(review): the declaration of `i`, all return statements and the
/// no-matching-rule path are not visible in this extract.
1325 bool check_crush_rule(int ruleset
, int type
, int size
, ostream
& ss
) {
1329 for (i
= 0; i
< crush
->max_rules
; i
++) {
1330 if (crush
->rules
[i
] &&
1331 crush
->rules
[i
]->mask
.ruleset
== ruleset
&&
1332 crush
->rules
[i
]->mask
.type
== type
) {
1334 if (crush
->rules
[i
]->mask
.min_size
<= size
&&
1335 crush
->rules
[i
]->mask
.max_size
>= size
) {
1337 } else if (size
< crush
->rules
[i
]->mask
.min_size
) {
1338 ss
<< "pool size is smaller than the crush rule min size";
1341 ss
<< "pool size is bigger than the crush rule max size";
// Serialization entry points. Declarations only: the definitions are not
// part of this extract.
/// Presumably appends the encoded map to `bl`, with `features` selecting
/// the encoding variant -- confirm against the definition.
1350 void encode(bufferlist
&bl
, uint64_t features
) const;
/// Presumably rebuilds this object from the data at `blp` -- confirm.
1351 void decode(bufferlist::iterator
&blp
);
/// Presumably decodes one bucket from `blp` and stores it through `bptr`
/// -- confirm.
1352 void decode_crush_bucket(crush_bucket
** bptr
, bufferlist::iterator
&blp
);
// Formatter-based dump helpers. Declarations only: the definitions are not
// part of this extract. All of them are const except the static test hook.
1353 void dump(Formatter
*f
) const;
1354 void dump_rules(Formatter
*f
) const;
1355 void dump_rule(int ruleset
, Formatter
*f
) const;
1356 void dump_tunables(Formatter
*f
) const;
1357 void dump_choose_args(Formatter
*f
) const;
1358 void list_rules(Formatter
*f
) const;
// dump_tree is overloaded: one form takes an ostream plus a Formatter,
// the other a Formatter only.
1359 void dump_tree(ostream
*out
, Formatter
*f
) const;
1360 void dump_tree(Formatter
*f
) const;
/// Presumably appends sample instances to `o` for encoding tests -- confirm.
1361 static void generate_test_instances(list
<CrushWrapper
*>& o
);
// Declarations only: the definitions are not part of this extract.
/// Presumably resolves the default replicated ruleset using configuration
/// reachable from `cct` -- confirm against the definition.
1363 int get_osd_pool_default_crush_replicated_ruleset(CephContext
*cct
);
// The two validators are static, so they need no CrushWrapper instance.
1365 static bool is_valid_crush_name(const string
& s
);
1366 static bool is_valid_crush_loc(CephContext
*cct
,
1367 const map
<string
,string
>& loc
);
// Macro invocation; the macro is defined outside this file. Presumably it
// generates the free encode()/decode() wrappers for CrushWrapper -- confirm
// in include/encoding.h.
1369 WRITE_CLASS_ENCODER_FEATURES(CrushWrapper
)