#include "common/errno.h"
#include "common/Formatter.h"
#include "common/TextTable.h"
-#include "global/global_context.h"
#include "include/ceph_features.h"
#include "include/str_map.h"
for (auto osd : up) {
auto it = weight_map.find(osd);
if (it == weight_map.end()) {
- // osd is gone or has been moved out of the specific crush-tree
+ ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd << " is gone or has "
+ << "been moved out of the specific crush-tree"
+ << dendl;
to_cancel->push_back(pg);
break;
}
auto adjusted_weight = get_weightf(it->first) * it->second;
if (adjusted_weight == 0) {
- // osd is out/crush-out
+ ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd
+ << " is out/crush-out"
+ << dendl;
to_cancel->push_back(pg);
break;
}
pg_t pg, ///< pg to potentially remap
const set<int>& overfull, ///< osds we'd want to evacuate
const vector<int>& underfull, ///< osds to move to, in order of preference
+ const vector<int>& more_underfull, ///< more osds only slightly underfull
vector<int> *orig,
vector<int> *out) ///< resulting alternative mapping
{
rule,
pool->get_size(),
overfull, underfull,
+ more_underfull,
*orig,
out);
if (r < 0)
int OSDMap::calc_pg_upmaps(
CephContext *cct,
- float max_deviation_ratio,
+ uint32_t max_deviation,
int max,
const set<int64_t>& only_pools,
OSDMap::Incremental *pending_inc)
{
ldout(cct, 10) << __func__ << " pools " << only_pools << dendl;
OSDMap tmp;
+ // Can't be less than 1 pg
+ if (max_deviation < 1)
+ max_deviation = 1;
tmp.deepish_copy_from(*this);
int num_changed = 0;
map<int,set<pg_t>> pgs_by_osd;
lderr(cct) << __func__ << " abort due to max <= 0" << dendl;
return 0;
}
- float decay_factor = 1.0 / float(max);
float stddev = 0;
map<int,float> osd_deviation; // osd, deviation(pgs)
multimap<float,int> deviation_osd; // deviation(pgs), osd
+ float cur_max_deviation = 0;
for (auto& i : pgs_by_osd) {
// make sure osd is still there (belongs to this crush-tree)
ceph_assert(osd_weight.count(i.first));
osd_deviation[i.first] = deviation;
deviation_osd.insert(make_pair(deviation, i.first));
stddev += deviation * deviation;
+ if (fabsf(deviation) > cur_max_deviation)
+ cur_max_deviation = fabsf(deviation);
}
- if (stddev <= cct->_conf.get_val<double>("osd_calc_pg_upmaps_max_stddev")) {
+ ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl;
+ if (cur_max_deviation <= max_deviation) {
ldout(cct, 10) << __func__ << " distribution is almost perfect"
<< dendl;
return 0;
auto local_fallback_retries =
cct->_conf.get_val<uint64_t>("osd_calc_pg_upmaps_local_fallback_retries");
while (max--) {
+ ldout(cct, 30) << "Top of loop #" << max+1 << dendl;
// build overfull and underfull
set<int> overfull;
+ set<int> more_overfull;
+ bool using_more_overfull = false;
vector<int> underfull;
- float decay = 0;
- int decay_count = 0;
- while (overfull.empty()) {
- for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) {
- if (i->first >= (1.0 - decay))
+ vector<int> more_underfull;
+ for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) {
+ ldout(cct, 30) << " check " << i->first << " <= " << max_deviation << dendl;
+ if (i->first <= 0)
+ break;
+ if (i->first > max_deviation) {
+ ldout(cct, 30) << " add overfull osd." << i->second << dendl;
overfull.insert(i->second);
+ } else {
+ more_overfull.insert(i->second);
+ }
}
- if (!overfull.empty())
- break;
- decay_count++;
- decay = decay_factor * decay_count;
- if (decay >= 1.0)
- break;
- ldout(cct, 30) << " decay_factor = " << decay_factor
- << " decay_count = " << decay_count
- << " decay (overfull) = " << decay
- << dendl;
- }
- if (overfull.empty()) {
- lderr(cct) << __func__ << " failed to build overfull" << dendl;
- break;
- }
- decay = 0;
- decay_count = 0;
- while (underfull.empty()) {
- for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) {
- if (i->first >= (-.999 + decay))
+ for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) {
+ ldout(cct, 30) << " check " << i->first << " >= " << -(int)max_deviation << dendl;
+ if (i->first >= 0)
break;
- underfull.push_back(i->second);
- }
- if (!underfull.empty())
- break;
- decay_count++;
- decay = decay_factor * decay_count;
- if (decay >= .999)
- break;
- ldout(cct, 30) << " decay_factor = " << decay_factor
- << " decay_count = " << decay_count
- << " decay (underfull) = " << decay
- << dendl;
+ if (i->first < -(int)max_deviation) {
+ ldout(cct, 30) << " add underfull osd." << i->second << dendl;
+ underfull.push_back(i->second);
+ } else {
+ more_underfull.push_back(i->second);
+ }
}
- if (underfull.empty()) {
- lderr(cct) << __func__ << " failed to build underfull" << dendl;
+ if (underfull.empty() && overfull.empty()) {
+ ldout(cct, 20) << __func__ << " failed to build overfull and underfull" << dendl;
break;
}
+ if (overfull.empty() && !underfull.empty()) {
+ ldout(cct, 20) << __func__ << " Using more_overfull since we still have underfull" << dendl;
+ overfull = more_overfull;
+ using_more_overfull = true;
+ }
ldout(cct, 10) << " overfull " << overfull
<< " underfull " << underfull
auto temp_pgs_by_osd = pgs_by_osd;
// always start with fullest, break if we find any changes to make
for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
- if (skip_overfull) {
+ if (skip_overfull && !underfull.empty()) {
ldout(cct, 10) << " skipping overfull " << dendl;
break; // fall through to check underfull
}
int osd = p->second;
float deviation = p->first;
float target = osd_weight[osd] * pgs_per_weight;
+ ldout(cct, 10) << " Overfull search osd." << osd
+ << " target " << target
+ << " deviation " << deviation
+ << dendl;
ceph_assert(target > 0);
- float deviation_ratio = deviation / target;
- if (deviation_ratio < max_deviation_ratio) {
+ if (!using_more_overfull && deviation <= max_deviation) {
ldout(cct, 10) << " osd." << osd
<< " target " << target
<< " deviation " << deviation
- << " -> ratio " << deviation_ratio
- << " < max ratio " << max_deviation_ratio
+ << " < max deviation " << max_deviation
<< dendl;
break;
}
ldout(cct, 10) << " trying " << pg << dendl;
vector<int> raw, orig, out;
tmp.pg_to_raw_upmap(pg, &raw, &orig); // including existing upmaps too
- if (!try_pg_upmap(cct, pg, overfull, underfull, &orig, &out)) {
+ if (!try_pg_upmap(cct, pg, overfull, underfull, more_underfull, &orig, &out)) {
continue;
}
ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl;
continue;
}
ceph_assert(orig != out);
+ int pos = -1;
+ float max_dev = 0;
for (unsigned i = 0; i < out.size(); ++i) {
if (orig[i] == out[i])
continue; // skip invalid remappings
if (existing.count(orig[i]) || existing.count(out[i]))
continue; // we want new remappings only!
+ if (osd_deviation[orig[i]] > max_dev) {
+ max_dev = osd_deviation[orig[i]];
+ pos = i;
+ ldout(cct, 30) << "Max osd." << orig[i] << " pos " << i << " dev " << osd_deviation[orig[i]] << dendl;
+ }
+ }
+ if (pos != -1) {
+ int i = pos;
ldout(cct, 10) << " will try adding new remapping pair "
<< orig[i] << " -> " << out[i] << " for " << pg
+ << (orig[i] != osd ? " NOT selected osd" : "")
<< dendl;
existing.insert(orig[i]);
existing.insert(out[i]);
float deviation = p.first;
float target = osd_weight[osd] * pgs_per_weight;
ceph_assert(target > 0);
- float deviation_ratio = abs(deviation / target);
- if (deviation_ratio < max_deviation_ratio) {
- // respect max_deviation_ratio too
+ if (fabsf(deviation) < max_deviation) {
+ // respect max_deviation too
ldout(cct, 10) << " osd." << osd
<< " target " << target
<< " deviation " << deviation
- << " -> absolute ratio " << deviation_ratio
- << " < max ratio " << max_deviation_ratio
+ << " -> absolute " << fabsf(deviation)
+ << " < max " << max_deviation
<< dendl;
break;
}
float new_stddev = 0;
map<int,float> temp_osd_deviation;
multimap<float,int> temp_deviation_osd;
+ float cur_max_deviation = 0;
for (auto& i : temp_pgs_by_osd) {
// make sure osd is still there (belongs to this crush-tree)
ceph_assert(osd_weight.count(i.first));
<< dendl;
temp_osd_deviation[i.first] = deviation;
temp_deviation_osd.insert(make_pair(deviation, i.first));
- new_stddev += deviation * deviation;
+ new_stddev += deviation * deviation;
+ if (fabsf(deviation) > cur_max_deviation)
+ cur_max_deviation = fabsf(deviation);
}
ldout(cct, 10) << " stddev " << stddev << " -> " << new_stddev << dendl;
if (new_stddev >= stddev) {
pending_inc->new_pg_upmap_items[i.first] = i.second;
++num_changed;
}
+ ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl;
+ if (cur_max_deviation <= max_deviation) {
+ ldout(cct, 10) << __func__ << " Optimization plan is almost perfect"
+ << dendl;
+ break;
+ }
}
ldout(cct, 10) << " num_changed = " << num_changed << dendl;
return num_changed;
}
}
-void OSDMap::check_health(health_check_map_t *checks) const
+void OSDMap::check_health(CephContext *cct,
+ health_check_map_t *checks) const
{
int num_osds = get_num_osds();
break;
type = crush->get_bucket_type(parent_id);
if (!subtree_type_is_down(
- g_ceph_context, parent_id, type,
+ cct, parent_id, type,
&down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
break;
current = parent_id;
{
// An osd could configure failsafe ratio, to something different
// but for now assume it is the same here.
- float fsr = g_conf()->osd_failsafe_full_ratio;
+ float fsr = cct->_conf->osd_failsafe_full_ratio;
if (fsr > 1.0) fsr /= 100;
float fr = get_full_ratio();
float br = get_backfillfull_ratio();
}
// OLD_CRUSH_TUNABLES
- if (g_conf()->mon_warn_on_legacy_crush_tunables) {
+ if (cct->_conf->mon_warn_on_legacy_crush_tunables) {
string min = crush->get_min_required_version();
- if (min < g_conf()->mon_crush_min_required_version) {
+ if (min < cct->_conf->mon_crush_min_required_version) {
ostringstream ss;
ss << "crush map has legacy tunables (require " << min
- << ", min is " << g_conf()->mon_crush_min_required_version << ")";
+ << ", min is " << cct->_conf->mon_crush_min_required_version << ")";
auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str());
d.detail.push_back("see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
}
}
// OLD_CRUSH_STRAW_CALC_VERSION
- if (g_conf()->mon_warn_on_crush_straw_calc_version_zero) {
+ if (cct->_conf->mon_warn_on_crush_straw_calc_version_zero) {
if (crush->get_straw_calc_version() == 0) {
ostringstream ss;
ss << "crush map has straw_calc_version=0";
}
// CACHE_POOL_NO_HIT_SET
- if (g_conf()->mon_warn_on_cache_pools_without_hit_sets) {
+ if (cct->_conf->mon_warn_on_cache_pools_without_hit_sets) {
list<string> detail;
for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
p != pools.end();
d.detail.swap(nearfull_detail);
}
}
+
+  // POOL_PG_NUM_NOT_POWER_OF_TWO
+  // Raise a HEALTH_WARN naming every pool whose target pg_num is not a
+  // power of two. The whole check is gated by the
+  // mon_warn_on_pool_pg_num_not_power_of_two config option.
+  if (cct->_conf.get_val<bool>("mon_warn_on_pool_pg_num_not_power_of_two")) {
+    list<string> detail;
+    // Bind by const reference: iterating by value would copy each
+    // (pool id, pool) entry on every pass of the loop.
+    for (const auto& it : get_pools()) {
+      const auto pg_num_target = it.second.get_pg_num_target();
+      if (!isp2(pg_num_target)) {
+	ostringstream ss;
+	ss << "pool '" << get_pool_name(it.first)
+	   << "' pg_num " << pg_num_target
+	   << " is not a power of two";
+	detail.push_back(ss.str());
+      }
+    }
+    // Only add the health check when at least one pool is affected.
+    if (!detail.empty()) {
+      ostringstream ss;
+      ss << detail.size() << " pool(s) have non-power-of-two pg_num";
+      auto& d = checks->add("POOL_PG_NUM_NOT_POWER_OF_TWO", HEALTH_WARN,
+			    ss.str());
+      d.detail.swap(detail);
+    }
+  }
}
int OSDMap::parse_osd_id_list(const vector<string>& ls, set<int> *out,