X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=ceph%2Fsrc%2Fosd%2FOSDMap.cc;h=fc8db797ce71e239c4dce092db2a682144fa469b;hb=92f5a8d42d07f9929ae4fa7e01342fe8d96808a8;hp=be5d6fcf0deaa5f4683dded065f24e5790b00671;hpb=a0324939f9d0e1905d5df8f57442f09dc70af83d;p=ceph.git diff --git a/ceph/src/osd/OSDMap.cc b/ceph/src/osd/OSDMap.cc index be5d6fcf0..fc8db797c 100644 --- a/ceph/src/osd/OSDMap.cc +++ b/ceph/src/osd/OSDMap.cc @@ -26,7 +26,6 @@ #include "common/errno.h" #include "common/Formatter.h" #include "common/TextTable.h" -#include "global/global_context.h" #include "include/ceph_features.h" #include "include/str_map.h" @@ -1842,13 +1841,17 @@ bool OSDMap::check_pg_upmaps( for (auto osd : up) { auto it = weight_map.find(osd); if (it == weight_map.end()) { - // osd is gone or has been moved out of the specific crush-tree + ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd << " is gone or has " + << "been moved out of the specific crush-tree" + << dendl; to_cancel->push_back(pg); break; } auto adjusted_weight = get_weightf(it->first) * it->second; if (adjusted_weight == 0) { - // osd is out/crush-out + ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd + << " is out/crush-out" + << dendl; to_cancel->push_back(pg); break; } @@ -4398,6 +4401,7 @@ bool OSDMap::try_pg_upmap( pg_t pg, ///< pg to potentially remap const set& overfull, ///< osds we'd want to evacuate const vector& underfull, ///< osds to move to, in order of preference + const vector& more_underfull, ///< more osds only slightly underfull vector *orig, vector *out) ///< resulting alternative mapping { @@ -4426,6 +4430,7 @@ bool OSDMap::try_pg_upmap( rule, pool->get_size(), overfull, underfull, + more_underfull, *orig, out); if (r < 0) @@ -4437,13 +4442,16 @@ bool OSDMap::try_pg_upmap( int OSDMap::calc_pg_upmaps( CephContext *cct, - float max_deviation_ratio, + uint32_t max_deviation, int max, const set& only_pools, OSDMap::Incremental *pending_inc) { ldout(cct, 10) << __func__ << " pools " << only_pools << dendl; OSDMap tmp; + // Can't be less than 1 pg + if (max_deviation < 1) + max_deviation = 1; tmp.deepish_copy_from(*this); int num_changed = 0; map> pgs_by_osd; @@ -4505,10 +4513,10 @@ int OSDMap::calc_pg_upmaps( lderr(cct) << __func__ << " abort due to max <= 0" << dendl; return 0; } - float decay_factor = 1.0 / float(max); float stddev = 0; map osd_deviation; // osd, deviation(pgs) multimap deviation_osd; // deviation(pgs), osd + float cur_max_deviation = 0; for (auto& i : pgs_by_osd) { // make sure osd is still there (belongs to this crush-tree) ceph_assert(osd_weight.count(i.first)); @@ -4522,8 +4530,11 @@ int OSDMap::calc_pg_upmaps( osd_deviation[i.first] = deviation; deviation_osd.insert(make_pair(deviation, i.first)); stddev += deviation * deviation; + if (fabsf(deviation) > cur_max_deviation) + cur_max_deviation = fabsf(deviation); } - if (stddev <= cct->_conf.get_val("osd_calc_pg_upmaps_max_stddev")) { + ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl; + if (cur_max_deviation <= max_deviation) { ldout(cct, 10) << __func__ << " distribution is almost perfect" << dendl; return 0; @@ -4534,55 +4545,45 @@ int OSDMap::calc_pg_upmaps( auto local_fallback_retries = cct->_conf.get_val("osd_calc_pg_upmaps_local_fallback_retries"); while (max--) { + ldout(cct, 30) << "Top of loop #" << max+1 << dendl; // build overfull and underfull set overfull; + set more_overfull; + bool using_more_overfull = false; vector underfull; - float decay = 0; - int decay_count = 0; - while (overfull.empty()) { - for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) { - if (i->first >= (1.0 - decay)) + vector more_underfull; + for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) { + ldout(cct, 30) << " check " << i->first << " <= " << max_deviation << dendl; + if (i->first <= 0) + break; + if (i->first > max_deviation) { + ldout(cct, 30) << " add overfull osd." << i->second << dendl; overfull.insert(i->second); + } else { + more_overfull.insert(i->second); + } } - if (!overfull.empty()) - break; - decay_count++; - decay = decay_factor * decay_count; - if (decay >= 1.0) - break; - ldout(cct, 30) << " decay_factor = " << decay_factor - << " decay_count = " << decay_count - << " decay (overfull) = " << decay - << dendl; - } - if (overfull.empty()) { - lderr(cct) << __func__ << " failed to build overfull" << dendl; - break; - } - decay = 0; - decay_count = 0; - while (underfull.empty()) { - for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) { - if (i->first >= (-.999 + decay)) + for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) { + ldout(cct, 30) << " check " << i->first << " >= " << -(int)max_deviation << dendl; + if (i->first >= 0) break; - underfull.push_back(i->second); - } - if (!underfull.empty()) - break; - decay_count++; - decay = decay_factor * decay_count; - if (decay >= .999) - break; - ldout(cct, 30) << " decay_factor = " << decay_factor - << " decay_count = " << decay_count - << " decay (underfull) = " << decay - << dendl; + if (i->first < -(int)max_deviation) { + ldout(cct, 30) << " add underfull osd." << i->second << dendl; + underfull.push_back(i->second); + } else { + more_underfull.push_back(i->second); + } } - if (underfull.empty()) { - lderr(cct) << __func__ << " failed to build underfull" << dendl; + if (underfull.empty() && overfull.empty()) { + ldout(cct, 20) << __func__ << " failed to build overfull and underfull" << dendl; break; } + if (overfull.empty() && !underfull.empty()) { + ldout(cct, 20) << __func__ << " Using more_overfull since we still have underfull" << dendl; + overfull = more_overfull; + using_more_overfull = true; + } ldout(cct, 10) << " overfull " << overfull << " underfull " << underfull @@ -4597,21 +4598,23 @@ int OSDMap::calc_pg_upmaps( auto temp_pgs_by_osd = pgs_by_osd; // always start with fullest, break if we find any changes to make for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) { - if (skip_overfull) { + if (skip_overfull && !underfull.empty()) { ldout(cct, 10) << " skipping overfull " << dendl; break; // fall through to check underfull } int osd = p->second; float deviation = p->first; float target = osd_weight[osd] * pgs_per_weight; + ldout(cct, 10) << " Overfull search osd." << osd + << " target " << target + << " deviation " << deviation + << dendl; ceph_assert(target > 0); - float deviation_ratio = deviation / target; - if (deviation_ratio < max_deviation_ratio) { + if (!using_more_overfull && deviation <= max_deviation) { ldout(cct, 10) << " osd." << osd << " target " << target << " deviation " << deviation - << " -> ratio " << deviation_ratio - << " < max ratio " << max_deviation_ratio + << " < max deviation " << max_deviation << dendl; break; } @@ -4707,7 +4710,7 @@ int OSDMap::calc_pg_upmaps( ldout(cct, 10) << " trying " << pg << dendl; vector raw, orig, out; tmp.pg_to_raw_upmap(pg, &raw, &orig); // including existing upmaps too - if (!try_pg_upmap(cct, pg, overfull, underfull, &orig, &out)) { + if (!try_pg_upmap(cct, pg, overfull, underfull, more_underfull, &orig, &out)) { continue; } ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl; @@ -4715,13 +4718,24 @@ int OSDMap::calc_pg_upmaps( continue; } ceph_assert(orig != out); + int pos = -1; + float max_dev = 0; for (unsigned i = 0; i < out.size(); ++i) { if (orig[i] == out[i]) continue; // skip invalid remappings if (existing.count(orig[i]) || existing.count(out[i])) continue; // we want new remappings only! + if (osd_deviation[orig[i]] > max_dev) { + max_dev = osd_deviation[orig[i]]; + pos = i; + ldout(cct, 30) << "Max osd." << orig[i] << " pos " << i << " dev " << osd_deviation[orig[i]] << dendl; + } + } + if (pos != -1) { + int i = pos; ldout(cct, 10) << " will try adding new remapping pair " << orig[i] << " -> " << out[i] << " for " << pg + << (orig[i] != osd ? " NOT selected osd" : "") << dendl; existing.insert(orig[i]); existing.insert(out[i]); @@ -4750,14 +4764,13 @@ int OSDMap::calc_pg_upmaps( float deviation = p.first; float target = osd_weight[osd] * pgs_per_weight; ceph_assert(target > 0); - float deviation_ratio = abs(deviation / target); - if (deviation_ratio < max_deviation_ratio) { - // respect max_deviation_ratio too + if (fabsf(deviation) < max_deviation) { + // respect max_deviation too ldout(cct, 10) << " osd." << osd << " target " << target << " deviation " << deviation - << " -> absolute ratio " << deviation_ratio - << " < max ratio " << max_deviation_ratio + << " -> absolute " << fabsf(deviation) + << " < max " << max_deviation << dendl; break; } @@ -4843,6 +4856,7 @@ int OSDMap::calc_pg_upmaps( float new_stddev = 0; map temp_osd_deviation; multimap temp_deviation_osd; + float cur_max_deviation = 0; for (auto& i : temp_pgs_by_osd) { // make sure osd is still there (belongs to this crush-tree) ceph_assert(osd_weight.count(i.first)); @@ -4855,7 +4869,9 @@ int OSDMap::calc_pg_upmaps( << dendl; temp_osd_deviation[i.first] = deviation; temp_deviation_osd.insert(make_pair(deviation, i.first)); - new_stddev += deviation * deviation; + new_stddev += deviation * deviation; + if (fabsf(deviation) > cur_max_deviation) + cur_max_deviation = fabsf(deviation); } ldout(cct, 10) << " stddev " << stddev << " -> " << new_stddev << dendl; if (new_stddev >= stddev) { @@ -4907,6 +4923,12 @@ int OSDMap::calc_pg_upmaps( pending_inc->new_pg_upmap_items[i.first] = i.second; ++num_changed; } + ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl; + if (cur_max_deviation <= max_deviation) { + ldout(cct, 10) << __func__ << " Optimization plan is almost perfect" + << dendl; + break; + } } ldout(cct, 10) << " num_changed = " << num_changed << dendl; return num_changed; @@ -5394,7 +5416,8 @@ void print_osd_utilization(const OSDMap& osdmap, } } -void OSDMap::check_health(health_check_map_t *checks) const +void OSDMap::check_health(CephContext *cct, + health_check_map_t *checks) const { int num_osds = get_num_osds(); @@ -5439,7 +5462,7 @@ void OSDMap::check_health(health_check_map_t *checks) const break; type = crush->get_bucket_type(parent_id); if (!subtree_type_is_down( - g_ceph_context, parent_id, type, + cct, parent_id, type, &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down)) break; current = parent_id; @@ -5568,7 +5591,7 @@ void OSDMap::check_health(health_check_map_t *checks) const { // An osd could configure failsafe ratio, to something different // but for now assume it is the same here. - float fsr = g_conf()->osd_failsafe_full_ratio; + float fsr = cct->_conf->osd_failsafe_full_ratio; if (fsr > 1.0) fsr /= 100; float fr = get_full_ratio(); float br = get_backfillfull_ratio(); @@ -5719,19 +5742,19 @@ void OSDMap::check_health(health_check_map_t *checks) const } // OLD_CRUSH_TUNABLES - if (g_conf()->mon_warn_on_legacy_crush_tunables) { + if (cct->_conf->mon_warn_on_legacy_crush_tunables) { string min = crush->get_min_required_version(); - if (min < g_conf()->mon_crush_min_required_version) { + if (min < cct->_conf->mon_crush_min_required_version) { ostringstream ss; ss << "crush map has legacy tunables (require " << min - << ", min is " << g_conf()->mon_crush_min_required_version << ")"; + << ", min is " << cct->_conf->mon_crush_min_required_version << ")"; auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str()); d.detail.push_back("see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables"); } } // OLD_CRUSH_STRAW_CALC_VERSION - if (g_conf()->mon_warn_on_crush_straw_calc_version_zero) { + if (cct->_conf->mon_warn_on_crush_straw_calc_version_zero) { if (crush->get_straw_calc_version() == 0) { ostringstream ss; ss << "crush map has straw_calc_version=0"; @@ -5742,7 +5765,7 @@ void OSDMap::check_health(health_check_map_t *checks) const } // CACHE_POOL_NO_HIT_SET - if (g_conf()->mon_warn_on_cache_pools_without_hit_sets) { + if (cct->_conf->mon_warn_on_cache_pools_without_hit_sets) { list detail; for (map::const_iterator p = pools.begin(); p != pools.end(); @@ -5820,6 +5843,27 @@ void OSDMap::check_health(health_check_map_t *checks) const d.detail.swap(nearfull_detail); } } + + // POOL_PG_NUM_NOT_POWER_OF_TWO + if (cct->_conf.get_val("mon_warn_on_pool_pg_num_not_power_of_two")) { + list detail; + for (auto it : get_pools()) { + if (!isp2(it.second.get_pg_num_target())) { + ostringstream ss; + ss << "pool '" << get_pool_name(it.first) + << "' pg_num " << it.second.get_pg_num_target() + << " is not a power of two"; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << detail.size() << " pool(s) have non-power-of-two pg_num"; + auto& d = checks->add("POOL_PG_NUM_NOT_POWER_OF_TWO", HEALTH_WARN, + ss.str()); + d.detail.swap(detail); + } + } } int OSDMap::parse_osd_id_list(const vector& ls, set *out,