#include "include/assert.h"
#include "include/err.h"
#include "include/encoding.h"
-
+#include "include/mempool.h"
#include "common/Mutex.h"
class Formatter;
}
+namespace CrushTreeDumper {
+ typedef mempool::osdmap::map<int64_t,string> name_map_t;
+}
+
WRITE_RAW_ENCODER(crush_rule_mask) // it's all u8's
inline static void encode(const crush_rule_step &s, bufferlist &bl)
using namespace std;
class CrushWrapper {
public:
+ // magic value used by OSDMap for a "default" fallback choose_args, used if
+ // the choose_arg_map passed to do_rule does not exist. if this also
+ // doesn't exist, fall back to canonical weights.
+ enum {
+ DEFAULT_CHOOSE_ARGS = -1
+ };
+
std::map<int32_t, string> type_map; /* bucket/device type names */
std::map<int32_t, string> name_map; /* bucket/device names */
std::map<int32_t, string> rule_name_map;
+
std::map<int32_t, int32_t> class_map; /* item id -> class id */
std::map<int32_t, string> class_name; /* class id -> class name */
std::map<string, int32_t> class_rname; /* class name -> class id */
std::map<int32_t, map<int32_t, int32_t> > class_bucket; /* bucket[id][class] == id */
- std::map<uint64_t, crush_choose_arg_map> choose_args;
+ std::map<int64_t, crush_choose_arg_map> choose_args;
private:
- struct crush_map *crush;
+ struct crush_map *crush = nullptr;
bool have_uniform_rules = false;
/* reverse maps */
- mutable bool have_rmaps;
+ mutable bool have_rmaps = false;
mutable std::map<string, int> type_rmap, name_rmap, rule_name_rmap;
void build_rmaps() const {
if (have_rmaps) return;
CrushWrapper(const CrushWrapper& other);
const CrushWrapper& operator=(const CrushWrapper& other);
- CrushWrapper() : crush(0), have_rmaps(false) {
+ CrushWrapper() {
create();
}
~CrushWrapper() {
set_tunables_default();
}
- /// true if any rule has a ruleset != the rule id
- bool has_legacy_rulesets() const;
+ /**
+ * true if any rule has a rule id != its position in the array
+ *
+ * These indicate "ruleset" IDs that were created by older versions
+ * of Ceph. They are cleaned up in renumber_rules so that eventually
+ * we can remove the code for handling them.
+ */
+ bool has_legacy_rule_ids() const;
- /// fix rules whose ruleid != ruleset
- int renumber_rules_by_ruleset();
+ /**
+ * fix rules whose ruleid != ruleset
+ *
+ * These rules were created in older versions of Ceph. The concept
+ * of a ruleset no longer exists.
+ *
+ * Return a map of old ID -> new ID. Caller must update OSDMap
+ * to use new IDs.
+ */
+ std::map<int, int> renumber_rules();
- /// true if any ruleset has more than 1 rule
- bool has_multirule_rulesets() const;
+ /// true if any buckets that aren't straw2
+ bool has_non_straw2_buckets() const;
// tunables
void set_tunables_argonaut() {
name_rmap[bn] = a;
}
}
- bool id_has_class(int i) {
- int idout;
- int classout;
- if (split_id_class(i, &idout, &classout) != 0)
- return false;
- return classout != -1;
- }
int split_id_class(int i, int *idout, int *classout) const;
bool class_exists(const string& name) const {
return 0;
}
- int rename_class(const string& srcname, const string& dstname) {
- auto p = class_rname.find(srcname);
- if (p == class_rname.end())
- return -ENOENT;
- int class_id = p->second;
- auto q = class_name.find(class_id);
- if (q == class_name.end())
- return -ENOENT;
- class_rname.erase(srcname);
- class_name.erase(class_id);
- class_rname[dstname] = class_id;
- class_name[class_id] = dstname;
- return 0;
- }
-
int32_t _alloc_class_id() const;
int get_or_create_class_id(const string& name) {
ostream *ss);
// rule names
+ int rename_rule(const string& srcname,
+ const string& dstname,
+ ostream *ss);
bool rule_exists(string name) const {
build_rmaps();
return rule_name_rmap.count(name);
if (have_rmaps)
rule_name_rmap[name] = i;
}
+ bool is_shadow_item(int id) const {
+ const char *name = get_item_name(id);
+ return name && !is_valid_crush_name(name);
+ }
/**
*
* Note that these may not be parentless roots.
*/
- void find_takes(set<int>& roots) const;
+ void find_takes(set<int> *roots) const;
+ void find_takes_by_rule(int rule, set<int> *roots) const;
/**
* find tree roots
*
* These are parentless nodes in the map.
*/
- void find_roots(set<int>& roots) const;
+ void find_roots(set<int> *roots) const;
+
+
+ /**
+ * find tree roots that contain shadow (device class) items only
+ */
+ void find_shadow_roots(set<int> *roots) const {
+ set<int> all;
+ find_roots(&all);
+ for (auto& p: all) {
+ if (is_shadow_item(p)) {
+ roots->insert(p);
+ }
+ }
+ }
/**
* find tree roots that are not shadow (device class) items
* These are parentless nodes in the map that are not shadow
* items for device classes.
*/
- void find_nonshadow_roots(set<int>& roots) const;
+ void find_nonshadow_roots(set<int> *roots) const {
+ set<int> all;
+ find_roots(&all);
+ for (auto& p: all) {
+ if (!is_shadow_item(p)) {
+ roots->insert(p);
+ }
+ }
+ }
/**
* see if an item is contained within a subtree
* FIXME: ambiguous for items that occur multiple times in the map
*/
pair<string,string> get_immediate_parent(int id, int *ret = NULL);
+
int get_immediate_parent_id(int id, int *parent) const;
/**
* return ancestor of the given type, or 0 if none
+ * can pass in a specific crush **rule** to return ancestor from that rule only
* (parent is always a bucket and thus <0)
*/
- int get_parent_of_type(int id, int type) const;
+ int get_parent_of_type(int id, int type, int rule = -1) const;
/**
* get the fully qualified location of a device by successively finding
* @return number of items, or error
*/
int get_children(int id, list<int> *children);
+ void get_children_of_type(int id,
+ int type,
+ set<int> *children,
+ bool exclude_shadow = true) const;
+
+ /**
+ * get failure-domain type of a specific crush rule
+ * @param rule_id crush rule id
+ * @return type of failure-domain or a negative errno on error.
+ */
+ int get_rule_failure_domain(int rule_id);
/**
* enumerate leaves(devices) of given node
* when a bucket is in use.
*
* @param item id to remove
- * @param unused true if only unused items should be removed
* @return 0 on success, negative on error
*/
- int remove_root(int item, bool unused);
+ int remove_root(int item);
/**
* remove all instances of an item nested beneath a certain point from the map
return true;
return false;
}
+ bool rule_has_take(unsigned ruleno, int take) const {
+ if (!crush) return false;
+ crush_rule *rule = get_rule(ruleno);
+ for (unsigned i = 0; i < rule->len; ++i) {
+ if (rule->steps[i].op == CRUSH_RULE_TAKE &&
+ rule->steps[i].arg1 == take) {
+ return true;
+ }
+ }
+ return false;
+ }
int get_rule_len(unsigned ruleno) const {
crush_rule *r = get_rule(ruleno);
if (IS_ERR(r)) return PTR_ERR(r);
return s->arg2;
}
+private:
+ float _get_take_weight_osd_map(int root, map<int,float> *pmap) const;
+ void _normalize_weight_map(float sum, const map<int,float>& m,
+ map<int,float> *pmap) const;
+
+public:
/**
* calculate a map of osds to weights for a given rule
*
* @param pmap [out] map of osd to weight
* @return 0 for success, or negative error code
*/
- int get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap);
+ int get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap) const;
+
+ /**
+ * calculate a map of osds to weights for a given starting root
+ *
+ * Generate a map of which OSDs get how much relative weight for a
+ * given starting root
+ *
+ * @param root node
+ * @param pmap [out] map of osd to weight
+ * @return 0 for success, or negative error code
+ */
+ int get_take_weight_osd_map(int root, map<int,float> *pmap) const;
/* modifiers */
- int add_rule(int len, int ruleset, int type, int minsize, int maxsize, int ruleno) {
+
+ int add_rule(int ruleno, int len, int type, int minsize, int maxsize) {
if (!crush) return -ENOENT;
- crush_rule *n = crush_make_rule(len, ruleset, type, minsize, maxsize);
+ crush_rule *n = crush_make_rule(len, ruleno, type, minsize, maxsize);
assert(n);
ruleno = crush_add_rule(crush, n, ruleno);
return ruleno;
/** buckets **/
-private:
const crush_bucket *get_bucket(int id) const {
if (!crush)
return (crush_bucket *)(-EINVAL);
return (crush_bucket *)(-ENOENT);
return ret;
}
+private:
crush_bucket *get_bucket(int id) {
if (!crush)
return (crush_bucket *)(-EINVAL);
*
* returns the weight of the detached bucket
**/
- int detach_bucket(CephContext *cct, int item){
- if (!crush)
- return (-EINVAL);
-
- if (item >= 0)
- return (-EINVAL);
-
- // check that the bucket that we want to detach exists
- assert(bucket_exists(item));
-
- // get the bucket's weight
- crush_bucket *b = get_bucket(item);
- unsigned bucket_weight = b->weight;
-
- // get where the bucket is located
- pair<string, string> bucket_location = get_immediate_parent(item);
-
- // get the id of the parent bucket
- int parent_id = get_item_id(bucket_location.second);
-
- // get the parent bucket
- crush_bucket *parent_bucket = get_bucket(parent_id);
-
- if (!IS_ERR(parent_bucket)) {
- // zero out the bucket weight
- bucket_adjust_item_weight(cct, parent_bucket, item, 0);
- adjust_item_weight(cct, parent_bucket->id, parent_bucket->weight);
-
- // remove the bucket from the parent
- bucket_remove_item(parent_bucket, item);
- } else if (PTR_ERR(parent_bucket) != -ENOENT) {
- return PTR_ERR(parent_bucket);
- }
-
- // check that we're happy
- int test_weight = 0;
- map<string,string> test_location;
- test_location[ bucket_location.first ] = (bucket_location.second);
-
- bool successful_detach = !(check_item_loc(cct, item, test_location, &test_weight));
- assert(successful_detach);
- assert(test_weight == 0);
-
- return bucket_weight;
- }
+ int detach_bucket(CephContext *cct, int item);
public:
int get_max_buckets() const {
/* modifiers */
int add_bucket(int bucketno, int alg, int hash, int type, int size,
- int *items, int *weights, int *idout) {
- if (alg == 0) {
- alg = get_default_bucket_alg();
- if (alg == 0)
- return -EINVAL;
- }
- crush_bucket *b = crush_make_bucket(crush, alg, hash, type, size, items, weights);
- assert(b);
- return crush_add_bucket(crush, bucketno, b, idout);
- }
-
+ int *items, int *weights, int *idout);
int bucket_add_item(crush_bucket *bucket, int item, int weight);
int bucket_remove_item(struct crush_bucket *bucket, int item);
int bucket_adjust_item_weight(CephContext *cct, struct crush_bucket *bucket, int item, int weight);
void finalize() {
assert(crush);
crush_finalize(crush);
- have_uniform_rules = !has_legacy_rulesets();
+ if (!name_map.empty() &&
+ name_map.rbegin()->first >= crush->max_devices) {
+ crush->max_devices = name_map.rbegin()->first + 1;
+ }
+ have_uniform_rules = !has_legacy_rule_ids();
}
+ int bucket_set_alg(int id, int alg);
int update_device_class(int id, const string& class_name, const string& name, ostream *ss);
- int device_class_clone(int original, int device_class, int *clone);
- bool class_is_in_use(int class_id);
- int populate_classes();
+ int remove_device_class(CephContext *cct, int id, ostream *ss);
+ int device_class_clone(
+ int original, int device_class,
+ const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket,
+ const std::set<int32_t>& used_ids,
+ int *clone,
+ map<int,map<int,vector<int>>> *cmap_item_weight);
+ int rename_class(const string& srcname, const string& dstname);
+ int populate_classes(
+ const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket);
+ int get_rules_by_class(const string &class_name, set<int> *rules);
+ int get_rules_by_osd(int osd, set<int> *rules);
+ bool _class_is_dead(int class_id);
+ void cleanup_dead_classes();
int rebuild_roots_with_classes();
/* remove unused roots generated for class devices */
- int trim_roots_with_class(bool unused);
- int cleanup_classes();
+ int trim_roots_with_class();
void start_choose_profile() {
free(crush->choose_tries);
* the original choose_total_tries value was off by one (it
* counted "retries" and not "tries"). add one to alloc.
*/
- crush->choose_tries = (__u32 *)malloc(sizeof(*crush->choose_tries) * (crush->choose_total_tries + 1));
+ crush->choose_tries = (__u32 *)calloc(sizeof(*crush->choose_tries),
+ (crush->choose_total_tries + 1));
memset(crush->choose_tries, 0,
sizeof(*crush->choose_tries) * (crush->choose_total_tries + 1));
}
int find_rule(int ruleset, int type, int size) const {
if (!crush) return -1;
- if (!have_uniform_rules) {
- return crush_find_rule(crush, ruleset, type, size);
- } else {
- if (ruleset < (int)crush->max_rules &&
- crush->rules[ruleset])
- return ruleset;
- return -1;
+ if (have_uniform_rules &&
+ ruleset < (int)crush->max_rules &&
+ crush->rules[ruleset] &&
+ crush->rules[ruleset]->mask.type == type &&
+ crush->rules[ruleset]->mask.min_size <= size &&
+ crush->rules[ruleset]->mask.max_size >= size) {
+ return ruleset;
}
+ return crush_find_rule(crush, ruleset, type, size);
}
- bool ruleset_exists(int const ruleset) const {
+ bool ruleset_exists(const int ruleset) const {
for (size_t i = 0; i < crush->max_rules; ++i) {
if (rule_exists(i) && crush->rules[i]->mask.ruleset == ruleset) {
return true;
/**
* Return the lowest numbered ruleset of type `type`
*
- * @returns a ruleset ID, or -1 if no matching rulesets found.
+ * @returns a ruleset ID, or -1 if no matching rules found.
*/
int find_first_ruleset(int type) const {
int result = -1;
return result;
}
- crush_choose_arg_map choose_args_get(uint64_t choose_args_index) const {
+ bool have_choose_args(int64_t choose_args_index) const {
+ return choose_args.count(choose_args_index);
+ }
+
+ crush_choose_arg_map choose_args_get_with_fallback(
+ int64_t choose_args_index) const {
+ auto i = choose_args.find(choose_args_index);
+ if (i == choose_args.end()) {
+ i = choose_args.find(DEFAULT_CHOOSE_ARGS);
+ }
+ if (i == choose_args.end()) {
+ crush_choose_arg_map arg_map;
+ arg_map.args = NULL;
+ arg_map.size = 0;
+ return arg_map;
+ } else {
+ return i->second;
+ }
+ }
+ crush_choose_arg_map choose_args_get(int64_t choose_args_index) const {
auto i = choose_args.find(choose_args_index);
if (i == choose_args.end()) {
crush_choose_arg_map arg_map;
void destroy_choose_args(crush_choose_arg_map arg_map) {
for (__u32 i = 0; i < arg_map.size; i++) {
crush_choose_arg *arg = &arg_map.args[i];
- for (__u32 j = 0; j < arg->weight_set_size; j++) {
+ for (__u32 j = 0; j < arg->weight_set_positions; j++) {
crush_weight_set *weight_set = &arg->weight_set[j];
free(weight_set->weights);
}
}
free(arg_map.args);
}
-
+
+ void create_choose_args(int64_t id, int positions) {
+ if (choose_args.count(id))
+ return;
+ assert(positions);
+ auto &cmap = choose_args[id];
+ cmap.args = (crush_choose_arg*)calloc(sizeof(crush_choose_arg),
+ crush->max_buckets);
+ cmap.size = crush->max_buckets;
+ for (int bidx=0; bidx < crush->max_buckets; ++bidx) {
+ crush_bucket *b = crush->buckets[bidx];
+ auto &carg = cmap.args[bidx];
+ carg.ids = NULL;
+ carg.ids_size = 0;
+ if (b && b->alg == CRUSH_BUCKET_STRAW2) {
+ crush_bucket_straw2 *sb = (crush_bucket_straw2*)b;
+ carg.weight_set_positions = positions;
+ carg.weight_set = (crush_weight_set*)calloc(sizeof(crush_weight_set),
+ carg.weight_set_positions);
+ // initialize with canonical weights
+ for (int pos = 0; pos < positions; ++pos) {
+ carg.weight_set[pos].size = b->size;
+ carg.weight_set[pos].weights = (__u32*)calloc(4, b->size);
+ for (unsigned i = 0; i < b->size; ++i) {
+ carg.weight_set[pos].weights[i] = sb->item_weights[i];
+ }
+ }
+ } else {
+ carg.weight_set = NULL;
+ carg.weight_set_positions = 0;
+ }
+ }
+ }
+
+ void rm_choose_args(int64_t id) {
+ auto p = choose_args.find(id);
+ if (p != choose_args.end()) {
+ destroy_choose_args(p->second);
+ choose_args.erase(p);
+ }
+ }
+
void choose_args_clear() {
for (auto w : choose_args)
destroy_choose_args(w.second);
choose_args.clear();
}
+ // remove choose_args for buckets that no longer exist, create them for new buckets
+ void update_choose_args(CephContext *cct);
+
+ // adjust choose_args_map weight, preserving the hierarchical summation
+ // property. used by callers optimizing layouts by tweaking weights.
+ int _choose_args_adjust_item_weight_in_bucket(
+ CephContext *cct,
+ crush_choose_arg_map cmap,
+ int bucketid,
+ int id,
+ const vector<int>& weight,
+ ostream *ss);
+ int choose_args_adjust_item_weight(
+ CephContext *cct,
+ crush_choose_arg_map cmap,
+ int id, const vector<int>& weight,
+ ostream *ss);
+ int choose_args_adjust_item_weightf(
+ CephContext *cct,
+ crush_choose_arg_map cmap,
+ int id, const vector<double>& weightf,
+ ostream *ss) {
+ vector<int> weight(weightf.size());
+ for (unsigned i = 0; i < weightf.size(); ++i) {
+ weight[i] = (int)(weightf[i] * (float)0x10000);
+ }
+ return choose_args_adjust_item_weight(cct, cmap, id, weight, ss);
+ }
+
+ int get_choose_args_positions(crush_choose_arg_map cmap) {
+ // infer positions from other buckets
+ for (unsigned j = 0; j < cmap.size; ++j) {
+ if (cmap.args[j].weight_set_positions) {
+ return cmap.args[j].weight_set_positions;
+ }
+ }
+ return 1;
+ }
+
template<typename WeightVector>
void do_rule(int rule, int x, vector<int>& out, int maxout,
const WeightVector& weight,
int rawout[maxout];
char work[crush_work_size(crush, maxout)];
crush_init_workspace(crush, work);
- crush_choose_arg_map arg_map = choose_args_get(choose_args_index);
+ crush_choose_arg_map arg_map = choose_args_get_with_fallback(
+ choose_args_index);
int numrep = crush_do_rule(crush, rule, x, rawout, maxout, &weight[0],
weight.size(), work, arg_map.args);
if (numrep < 0)
void dump_tunables(Formatter *f) const;
void dump_choose_args(Formatter *f) const;
void list_rules(Formatter *f) const;
- void dump_tree(ostream *out, Formatter *f) const;
- void dump_tree(Formatter *f) const;
+ void list_rules(ostream *ss) const;
+ void dump_tree(ostream *out,
+ Formatter *f,
+ const CrushTreeDumper::name_map_t& ws,
+ bool show_shadow = false) const;
+ void dump_tree(ostream *out, Formatter *f) {
+ dump_tree(out, f, CrushTreeDumper::name_map_t());
+ }
+ void dump_tree(Formatter *f,
+ const CrushTreeDumper::name_map_t& ws) const;
static void generate_test_instances(list<CrushWrapper*>& o);
int get_osd_pool_default_crush_replicated_ruleset(CephContext *cct);