#define dout_subsys ceph_subsys_crush
+bool CrushWrapper::has_legacy_rulesets() const
+{
+ for (unsigned i=0; i<crush->max_rules; i++) {
+ crush_rule *r = crush->rules[i];
+ if (r &&
+ r->mask.ruleset != i) {
+ return true;
+ }
+ }
+ return false;
+}
+
+int CrushWrapper::renumber_rules_by_ruleset()
+{
+ int max_ruleset = 0;
+ for (unsigned i=0; i<crush->max_rules; i++) {
+ crush_rule *r = crush->rules[i];
+ if (r && r->mask.ruleset >= max_ruleset) {
+ max_ruleset = r->mask.ruleset + 1;
+ }
+ }
+ struct crush_rule **newrules =
+ (crush_rule**)calloc(1, max_ruleset * sizeof(crush_rule*));
+ for (unsigned i=0; i<crush->max_rules; i++) {
+ crush_rule *r = crush->rules[i];
+ if (!r)
+ continue;
+ if (newrules[r->mask.ruleset]) {
+ // collision, we can't do it.
+ free(newrules);
+ return -EINVAL;
+ }
+ newrules[r->mask.ruleset] = r;
+ }
+
+ // success, swap!
+ free(crush->rules);
+ crush->rules = newrules;
+ crush->max_rules = max_ruleset;
+ return 0;
+}
+
+bool CrushWrapper::has_multirule_rulesets() const
+{
+ for (unsigned i=0; i<crush->max_rules; i++) {
+ crush_rule *r = crush->rules[i];
+ if (!r)
+ continue;
+ for (unsigned j=i+1; j<crush->max_rules; j++) {
+ crush_rule *s = crush->rules[j];
+ if (!s)
+ continue;
+ if (r->mask.ruleset == s->mask.ruleset)
+ return true;
+ }
+ }
+ return false;
+}
+
bool CrushWrapper::has_v2_rules() const
{
for (unsigned i=0; i<crush->max_rules; i++) {
return false;
}
-bool CrushWrapper::has_chooseargs() const
+bool CrushWrapper::has_choose_args() const
{
return !choose_args.empty();
}
-bool CrushWrapper::has_incompat_chooseargs() const
+bool CrushWrapper::has_incompat_choose_args() const
{
- // FIXME: if the chooseargs all have 1 position *and* do not remap IDs then
- // we can fabricate a compatible crush map for legacy clients by swapping the
- // choose_args weights in for the real weights. until then,
- return has_chooseargs();
+ if (choose_args.empty())
+ return false;
+ if (choose_args.size() > 1)
+ return true;
+ crush_choose_arg_map arg_map = choose_args.begin()->second;
+ for (__u32 i = 0; i < arg_map.size; i++) {
+ crush_choose_arg *arg = &arg_map.args[i];
+ if (arg->weight_set_size == 0 &&
+ arg->ids_size == 0)
+ continue;
+ if (arg->weight_set_size != 1)
+ return true;
+ if (arg->ids_size != 0)
+ return true;
+ }
+ return false;
}
int CrushWrapper::split_id_class(int i, int *idout, int *classout) const
int CrushWrapper::remove_item(CephContext *cct, int item, bool unlink_only)
{
- if (choose_args.size() > 0) {
- ldout(cct, 1) << "remove_item not implemented when choose_args is not empty" << dendl;
- return -EDOM;
- }
-
ldout(cct, 5) << "remove_item " << item << (unlink_only ? " unlink_only":"") << dendl;
int ret = -ENOENT;
if (id == item) {
ldout(cct, 5) << "remove_item removing item " << item
<< " from bucket " << b->id << dendl;
- crush_bucket_remove_item(crush, b, item);
+ bucket_remove_item(b, item);
adjust_item_weight(cct, b->id, b->weight);
ret = 0;
}
int id = b->items[i];
if (id == item) {
ldout(cct, 5) << "_remove_item_under removing item " << item << " from bucket " << b->id << dendl;
- crush_bucket_remove_item(crush, b, item);
+ bucket_remove_item(b, item);
adjust_item_weight(cct, b->id, b->weight);
ret = 0;
} else if (id < 0) {
return 0;
}
+string CrushWrapper::get_full_location_ordered_string(int id)
+{
+ vector<pair<string, string> > full_location_ordered;
+ string full_location;
+ get_full_location_ordered(id, full_location_ordered);
+ reverse(begin(full_location_ordered), end(full_location_ordered));
+ for(auto i = full_location_ordered.begin(); i != full_location_ordered.end(); i++) {
+ full_location = full_location + i->first + "=" + i->second;
+ if (i != full_location_ordered.end() - 1) {
+ full_location = full_location + ",";
+ }
+ }
+ return full_location;
+}
map<int, string> CrushWrapper::get_parent_hierarchy(int id)
{
return b->size;
}
+int CrushWrapper::_get_leaves(int id, list<int> *leaves)
+{
+ assert(leaves);
-int CrushWrapper::insert_item(CephContext *cct, int item, float weight, string name,
- const map<string,string>& loc) // typename -> bucketname
+ // Already leaf?
+ if (id >= 0) {
+ leaves->push_back(id);
+ return 0;
+ }
+
+ crush_bucket *b = get_bucket(id);
+ if (IS_ERR(b)) {
+ return -ENOENT;
+ }
+
+ for (unsigned n = 0; n < b->size; n++) {
+ if (b->items[n] >= 0) {
+ leaves->push_back(b->items[n]);
+ } else {
+ // is a bucket, do recursive call
+ int r = _get_leaves(b->items[n], leaves);
+ if (r < 0) {
+ return r;
+ }
+ }
+ }
+
+ return 0; // all is well
+}
+
+int CrushWrapper::get_leaves(const string &name, set<int> *leaves)
{
- if (choose_args.size() > 0) {
- ldout(cct, 1) << "insert_item not implemented when choose_args is not empty" << dendl;
- return -EDOM;
+ assert(leaves);
+ leaves->clear();
+
+ if (!name_exists(name)) {
+ return -ENOENT;
+ }
+
+ int id = get_item_id(name);
+ if (id >= 0) {
+ // already leaf
+ leaves->insert(id);
+ return 0;
}
+ list<int> unordered;
+ int r = _get_leaves(id, &unordered);
+ if (r < 0) {
+ return r;
+ }
+
+ for (auto &p : unordered) {
+ leaves->insert(p);
+ }
+
+ return 0;
+}
+
+int CrushWrapper::insert_item(CephContext *cct, int item, float weight, string name,
+ const map<string,string>& loc) // typename -> bucketname
+{
ldout(cct, 5) << "insert_item item " << item << " weight " << weight
<< " name " << name << " loc " << loc << dendl;
ldout(cct, 5) << "insert_item adding " << cur << " weight " << weight
<< " to bucket " << id << dendl;
- int r = crush_bucket_add_item(crush, b, cur, 0);
+ int r = bucket_add_item(b, cur, 0);
assert (!r);
break;
}
int CrushWrapper::move_bucket(CephContext *cct, int id, const map<string,string>& loc)
{
- if (choose_args.size() > 0) {
- ldout(cct, 1) << "move_bucket not implemented when choose_args is not empty" << dendl;
- return -EDOM;
- }
-
// sorry this only works for buckets
if (id >= 0)
return -EINVAL;
return insert_item(cct, id, bucket_weight / (float)0x10000, id_name, loc);
}
-int CrushWrapper::link_bucket(CephContext *cct, int id, const map<string,string>& loc)
+int CrushWrapper::swap_bucket(CephContext *cct, int src, int dst)
{
- if (choose_args.size() > 0) {
- ldout(cct, 1) << "link_bucket not implemented when choose_args is not empty" << dendl;
- return -EDOM;
- }
+ if (src >= 0 || dst >= 0)
+ return -EINVAL;
+ if (!item_exists(src) || !item_exists(dst))
+ return -EINVAL;
+ crush_bucket *a = get_bucket(src);
+ crush_bucket *b = get_bucket(dst);
+ unsigned aw = a->weight;
+ unsigned bw = b->weight;
+
+ // swap weights
+ adjust_item_weight(cct, a->id, bw);
+ adjust_item_weight(cct, b->id, aw);
+
+ // swap items
+ map<int,unsigned> tmp;
+ unsigned as = a->size;
+ unsigned bs = b->size;
+ for (unsigned i = 0; i < as; ++i) {
+ int item = a->items[0];
+ int itemw = crush_get_bucket_item_weight(a, 0);
+ tmp[item] = itemw;
+ bucket_remove_item(a, item);
+ }
+ assert(a->size == 0);
+ assert(b->size == bs);
+ for (unsigned i = 0; i < bs; ++i) {
+ int item = b->items[0];
+ int itemw = crush_get_bucket_item_weight(b, 0);
+ bucket_remove_item(b, item);
+ bucket_add_item(a, item, itemw);
+ }
+ assert(a->size == bs);
+ assert(b->size == 0);
+ for (auto t : tmp) {
+ bucket_add_item(b, t.first, t.second);
+ }
+ assert(a->size == bs);
+ assert(b->size == as);
+
+ // swap names
+ swap_names(src, dst);
+ return 0;
+}
+int CrushWrapper::link_bucket(CephContext *cct, int id, const map<string,string>& loc)
+{
// sorry this only works for buckets
if (id >= 0)
return -EINVAL;
int CrushWrapper::create_or_move_item(CephContext *cct, int item, float weight, string name,
const map<string,string>& loc) // typename -> bucketname
{
- if (choose_args.size() > 0) {
- ldout(cct, 1) << "create_or_move_item not implemented when choose_args is not empty" << dendl;
- return -EDOM;
- }
-
int ret = 0;
int old_iweight;
int CrushWrapper::update_item(CephContext *cct, int item, float weight, string name,
const map<string,string>& loc) // typename -> bucketname
{
- if (choose_args.size() > 0) {
- ldout(cct, 1) << "update_item not implemented when choose_args is not empty" << dendl;
- return -EDOM;
- }
-
ldout(cct, 5) << "update_item item " << item << " weight " << weight
<< " name " << name << " loc " << loc << dendl;
int ret = 0;
continue;
for (unsigned i = 0; i < b->size; i++) {
if (b->items[i] == id) {
- int diff = crush_bucket_adjust_item_weight(crush, b, id, weight);
+ int diff = bucket_adjust_item_weight(cct, b, id, weight);
ldout(cct, 5) << "adjust_item_weight " << id << " diff " << diff << " in bucket " << bidx << dendl;
adjust_item_weight(cct, -1 - bidx, b->weight);
changed++;
crush_bucket *b = get_bucket(bid);
for (unsigned int i = 0; i < b->size; i++) {
if (b->items[i] == id) {
- int diff = crush_bucket_adjust_item_weight(crush, b, id, weight);
+ int diff = bucket_adjust_item_weight(cct, b, id, weight);
ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " diff " << diff << " in bucket " << bid << dendl;
adjust_item_weight(cct, bid, b->weight);
changed++;
for (unsigned i=0; i<b->size; ++i) {
int n = b->items[i];
if (n >= 0) {
- crush_bucket_adjust_item_weight(crush, b, n, weight);
+ bucket_adjust_item_weight(cct, b, n, weight);
++changed;
++local_changed;
} else {
return -ENOENT;
}
+int CrushWrapper::get_parent_of_type(int item, int type) const
+{
+ do {
+ int r = get_immediate_parent_id(item, &item);
+ if (r < 0) {
+ return 0;
+ }
+ } while (get_bucket_type(item) != type);
+ return item;
+}
+
bool CrushWrapper::class_is_in_use(int class_id)
{
for (auto &i : class_bucket)
}
}
-int CrushWrapper::add_simple_ruleset_at(string name, string root_name,
- string failure_domain_name,
- string mode, int rule_type,
- int rno, ostream *err)
+int CrushWrapper::add_simple_rule_at(
+ string name, string root_name,
+ string failure_domain_name,
+ string mode, int rule_type,
+ int rno, ostream *err)
{
if (rule_exists(name)) {
if (err)
return rno;
}
-int CrushWrapper::add_simple_ruleset(string name, string root_name,
- string failure_domain_name,
- string mode, int rule_type,
- ostream *err)
+int CrushWrapper::add_simple_rule(
+ string name, string root_name,
+ string failure_domain_name,
+ string mode, int rule_type,
+ ostream *err)
{
- return add_simple_ruleset_at(name, root_name, failure_domain_name, mode,
- rule_type, -1, err);
+ return add_simple_rule_at(name, root_name, failure_domain_name, mode,
+ rule_type, -1, err);
}
int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap)
crush_rule *rule = crush->rules[ruleno];
// build a weight map for each TAKE in the rule, and then merge them
+
+ // FIXME: if there are multiple takes that place a different number of
+ // objects we do not take that into account. (Also, note that doing this
+ // right is also a function of the pool, since the crush rule
+ // might choose 2 + choose 2 but pool size may only be 3.)
for (unsigned i=0; i<rule->len; ++i) {
map<int,float> m;
float sum = 0;
return 0;
}
+int CrushWrapper::bucket_adjust_item_weight(CephContext *cct, crush_bucket *bucket, int item, int weight)
+{
+ if (cct->_conf->osd_crush_update_weight_set) {
+ unsigned position;
+ for (position = 0; position < bucket->size; position++)
+ if (bucket->items[position] == item)
+ break;
+ assert(position != bucket->size);
+ for (auto w : choose_args) {
+ crush_choose_arg_map arg_map = w.second;
+ crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
+ for (__u32 j = 0; j < arg->weight_set_size; j++) {
+ crush_weight_set *weight_set = &arg->weight_set[j];
+ weight_set->weights[position] = weight;
+ }
+ }
+ }
+ return crush_bucket_adjust_item_weight(crush, bucket, item, weight);
+}
+
+int CrushWrapper::bucket_add_item(crush_bucket *bucket, int item, int weight)
+{
+ __u32 new_size = bucket->size + 1;
+ for (auto w : choose_args) {
+ crush_choose_arg_map arg_map = w.second;
+ crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
+ for (__u32 j = 0; j < arg->weight_set_size; j++) {
+ crush_weight_set *weight_set = &arg->weight_set[j];
+ weight_set->weights = (__u32*)realloc(weight_set->weights, new_size * sizeof(__u32));
+ assert(weight_set->size + 1 == new_size);
+ weight_set->weights[weight_set->size] = weight;
+ weight_set->size = new_size;
+ }
+ if (arg->ids_size) {
+ arg->ids = (int*)realloc(arg->ids, new_size * sizeof(int));
+ assert(arg->ids_size + 1 == new_size);
+ arg->ids[arg->ids_size] = item;
+ arg->ids_size = new_size;
+ }
+ }
+ return crush_bucket_add_item(crush, bucket, item, weight);
+}
+
+int CrushWrapper::bucket_remove_item(crush_bucket *bucket, int item)
+{
+ __u32 new_size = bucket->size - 1;
+ unsigned position;
+ for (position = 0; position < bucket->size; position++)
+ if (bucket->items[position] == item)
+ break;
+ assert(position != bucket->size);
+ for (auto w : choose_args) {
+ crush_choose_arg_map arg_map = w.second;
+ crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
+ for (__u32 j = 0; j < arg->weight_set_size; j++) {
+ crush_weight_set *weight_set = &arg->weight_set[j];
+ assert(weight_set->size - 1 == new_size);
+ for (__u32 k = position; k < new_size; k++)
+ weight_set->weights[k] = weight_set->weights[k+1];
+ weight_set->weights = (__u32*)realloc(weight_set->weights, new_size * sizeof(__u32));
+ weight_set->size = new_size;
+ }
+ if (arg->ids_size) {
+ assert(arg->ids_size - 1 == new_size);
+ for (__u32 k = position; k < new_size; k++)
+ arg->ids[k] = arg->ids[k+1];
+ arg->ids = (int*)realloc(arg->ids, new_size * sizeof(int));
+ arg->ids_size = new_size;
+ }
+ }
+ return crush_bucket_remove_item(crush, bucket, item);
+}
+
int CrushWrapper::update_device_class(CephContext *cct, int id, const string& class_name, const string& name)
{
int class_id = get_class_id(class_name);
int weight = crush_get_bucket_item_weight(original, i);
if (item >= 0) {
if (class_map.count(item) != 0 && class_map[item] == device_class) {
- int res = crush_bucket_add_item(crush, copy, item, weight);
+ int res = bucket_add_item(copy, item, weight);
if (res)
return res;
}
crush_bucket *child_copy = get_bucket(child_copy_id);
if (IS_ERR(child_copy))
return -ENOENT;
- res = crush_bucket_add_item(crush, copy, child_copy_id, child_copy->weight);
+ res = bucket_add_item(copy, child_copy_id, child_copy->weight);
if (res)
return res;
}
::encode(crush->max_rules, bl);
::encode(crush->max_devices, bl);
+ bool encode_compat_choose_args = false;
+ crush_choose_arg_map arg_map;
+ memset(&arg_map, '\0', sizeof(arg_map));
+ if (has_choose_args() &&
+ !HAVE_FEATURE(features, CRUSH_CHOOSE_ARGS)) {
+ assert(!has_incompat_choose_args());
+ encode_compat_choose_args = true;
+ arg_map = choose_args.begin()->second;
+ }
+
// buckets
for (int i=0; i<crush->max_buckets; i++) {
__u32 alg = 0;
break;
case CRUSH_BUCKET_STRAW2:
- for (unsigned j=0; j<crush->buckets[i]->size; j++) {
- ::encode((reinterpret_cast<crush_bucket_straw2*>(crush->buckets[i]))->item_weights[j], bl);
+ {
+ __u32 *weights;
+ if (encode_compat_choose_args &&
+ arg_map.args[i].weight_set_size > 0) {
+ weights = arg_map.args[i].weight_set[0].weights;
+ } else {
+ weights = (reinterpret_cast<crush_bucket_straw2*>(crush->buckets[i]))->item_weights;
+ }
+ for (unsigned j=0; j<crush->buckets[i]->size; j++) {
+ ::encode(weights[j], bl);
+ }
}
break;
// fixme
}
-int CrushWrapper::_get_osd_pool_default_crush_replicated_ruleset(CephContext *cct,
- bool quiet)
-{
- int crush_ruleset = cct->_conf->osd_pool_default_crush_rule;
- if (crush_ruleset == -1) {
- crush_ruleset = cct->_conf->osd_pool_default_crush_replicated_ruleset;
- } else if (!quiet) {
- ldout(cct, 0) << "osd_pool_default_crush_rule is deprecated "
- << "use osd_pool_default_crush_replicated_ruleset instead"
- << dendl;
- ldout(cct, 0) << "osd_pool_default_crush_rule = "
- << cct->_conf-> osd_pool_default_crush_rule << " overrides "
- << "osd_pool_default_crush_replicated_ruleset = "
- << cct->_conf->osd_pool_default_crush_replicated_ruleset
- << dendl;
- }
-
- return crush_ruleset;
-}
-
/**
* Determine the default CRUSH ruleset ID to be used with
* newly created replicated pools.
*/
int CrushWrapper::get_osd_pool_default_crush_replicated_ruleset(CephContext *cct)
{
- int crush_ruleset = _get_osd_pool_default_crush_replicated_ruleset(cct,
- false);
- if (crush_ruleset == CEPH_DEFAULT_CRUSH_REPLICATED_RULESET) {
+ int crush_ruleset = cct->_conf->osd_pool_default_crush_rule;
+ if (crush_ruleset < 0) {
crush_ruleset = find_first_ruleset(pg_pool_t::TYPE_REPLICATED);
} else if (!ruleset_exists(crush_ruleset)) {
crush_ruleset = -1; // match find_first_ruleset() retval
}
-
return crush_ruleset;
}
ldout(cct, 10) << __func__ << " cumulative_fanout " << cumulative_fanout
<< dendl;
+ // identify underful targets for each intermediate level.
+ // this serves two purposes:
+ // 1. we can tell when we are selecting a bucket that does not have any underfull
+ // devices beneath it. that means that if the current input includes an overfull
+ // device, we won't be able to find an underfull device with this parent to
+ // swap for it.
+ // 2. when we decide we should reject a bucket due to the above, this list gives us
+ // a list of peers to consider that *do* have underfull devices available.. (we
+ // are careful to pick one that has the same parent.)
+ vector<set<int>> underfull_buckets; // level -> set of buckets with >0 underfull item(s)
+ underfull_buckets.resize(stack.size() - 1);
+ for (auto osd : underfull) {
+ int item = osd;
+ for (int j = (int)stack.size() - 2; j >= 0; --j) {
+ int type = stack[j].first;
+ item = get_parent_of_type(item, type);
+ ldout(cct, 10) << __func__ << " underfull " << osd << " type " << type
+ << " is " << item << dendl;
+ underfull_buckets[j].insert(item);
+ }
+ }
+ ldout(cct, 20) << __func__ << " underfull_buckets " << underfull_buckets << dendl;
+
for (unsigned j = 0; j < stack.size(); ++j) {
int type = stack[j].first;
int fanout = stack[j].second;
auto tmpi = i;
for (auto from : w) {
ldout(cct, 10) << " from " << from << dendl;
-
+ // identify leaves under each choice. we use this to check whether any of these
+ // leaves are overfull. (if so, we need to make sure there are underfull candidates
+ // to swap for them.)
+ vector<set<int>> leaves;
+ leaves.resize(fanout);
for (int pos = 0; pos < fanout; ++pos) {
if (type > 0) {
// non-leaf
- int item = *tmpi;
- do {
- int r = get_immediate_parent_id(item, &item);
- if (r < 0) {
- ldout(cct, 10) << __func__ << " parent of " << item << " got "
- << cpp_strerror(r) << dendl;
- return -EINVAL;
- }
- } while (get_bucket_type(item) != type);
+ int item = get_parent_of_type(*tmpi, type);
o.push_back(item);
- ldout(cct, 10) << __func__ << " from " << *tmpi << " got " << item
- << " of type " << type << dendl;
int n = cum_fanout;
- while (n-- && tmpi != orig.end())
- ++tmpi;
+ while (n-- && tmpi != orig.end()) {
+ leaves[pos].insert(*tmpi++);
+ }
+ ldout(cct, 10) << __func__ << " from " << *tmpi << " got " << item
+ << " of type " << type << " over leaves " << leaves[pos] << dendl;
} else {
// leaf
bool replaced = false;
}
}
}
+ if (j + 1 < stack.size()) {
+ // check if any buckets have overfull leaves but no underfull candidates
+ for (int pos = 0; pos < fanout; ++pos) {
+ if (underfull_buckets[j].count(o[pos]) == 0) {
+ // are any leaves overfull?
+ bool any_overfull = false;
+ for (auto osd : leaves[pos]) {
+ if (overfull.count(osd)) {
+ any_overfull = true;
+ }
+ }
+ if (any_overfull) {
+ ldout(cct, 10) << " bucket " << o[pos] << " has no underfull targets and "
+ << ">0 leaves " << leaves[pos] << " is overfull; alts "
+ << underfull_buckets[j]
+ << dendl;
+ for (auto alt : underfull_buckets[j]) {
+ if (std::find(o.begin(), o.end(), alt) == o.end()) {
+ // see if alt has the same parent
+ if (j == 0 ||
+ get_parent_of_type(o[pos], stack[j-1].first) ==
+ get_parent_of_type(alt, stack[j-1].first)) {
+ if (j)
+ ldout(cct, 10) << " replacing " << o[pos]
+ << " (which has no underfull leaves) with " << alt
+ << " (same parent "
+ << get_parent_of_type(alt, stack[j-1].first) << " type "
+ << type << ")" << dendl;
+ else
+ ldout(cct, 10) << " replacing " << o[pos]
+ << " (which has no underfull leaves) with " << alt
+ << " (first level)" << dendl;
+ o[pos] = alt;
+ break;
+ } else {
+ ldout(cct, 30) << " alt " << alt << " for " << o[pos]
+ << " has different parent, skipping" << dendl;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
if (i == orig.end()) {
ldout(cct, 10) << __func__ << " end of orig, break 2" << dendl;
break;