#include "OSDMap.h"
#include <algorithm>
#include "common/config.h"
+#include "common/errno.h"
#include "common/Formatter.h"
#include "common/TextTable.h"
#include "include/ceph_features.h"
return 0;
}
+// ----------------------------------
+// OSDMap
bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
{
return num_osd;
}
-void OSDMap::count_full_nearfull_osds(int *full, int *backfill, int *nearfull) const
+// Collect the ids of pools touched by full, backfillfull and nearfull OSDs.
+//
+// Scans osd ids [0, max_osd) for OSDs that exist, are up and are in, sorts
+// each of them into one of three buckets according to its state flags, and
+// then hands every bucketed OSD to get_pool_ids_by_osd() together with the
+// matching output set.
+//
+// @param cct           context forwarded to get_pool_ids_by_osd()
+// @param full          out: pool ids collected for OSDs flagged CEPH_OSD_FULL
+// @param backfillfull  out: pool ids collected for OSDs flagged
+//                      CEPH_OSD_BACKFILLFULL
+// @param nearfull      out: pool ids collected for OSDs flagged
+//                      CEPH_OSD_NEARFULL
+//
+// All three output pointers must be non-null (asserted) and are cleared
+// before being filled.  The sets are filled independently of each other, so
+// nothing here prevents one pool id from appearing in more than one of them.
+void OSDMap::get_full_pools(CephContext *cct,
+ set<int64_t> *full,
+ set<int64_t> *backfillfull,
+ set<int64_t> *nearfull) const
{
- *full = 0;
- *backfill = 0;
- *nearfull = 0;
+ assert(full);
+ assert(backfillfull);
+ assert(nearfull);
+ full->clear();
+ backfillfull->clear();
+ nearfull->clear();
+
+ vector<int> full_osds;
+ vector<int> backfillfull_osds;
+ vector<int> nearfull_osds;
for (int i = 0; i < max_osd; ++i) {
if (exists(i) && is_up(i) && is_in(i)) {
+ // the else-if chain puts an OSD into at most one bucket, tested in the
+ // order FULL, BACKFILLFULL, NEARFULL
if (osd_state[i] & CEPH_OSD_FULL)
- ++(*full);
+ full_osds.push_back(i);
else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
- ++(*backfill);
+ backfillfull_osds.push_back(i);
else if (osd_state[i] & CEPH_OSD_NEARFULL)
- ++(*nearfull);
+ nearfull_osds.push_back(i);
}
}
+
+ // translate each bucket of OSDs into pool ids
+ for (auto i: full_osds) {
+ get_pool_ids_by_osd(cct, i, full);
+ }
+ for (auto i: backfillfull_osds) {
+ get_pool_ids_by_osd(cct, i, backfillfull);
+ }
+ for (auto i: nearfull_osds) {
+ get_pool_ids_by_osd(cct, i, nearfull);
+ }
}
static bool get_osd_utilization(
if (!is_up(osd))
continue;
const osd_xinfo_t &xi = get_xinfo(osd);
+ if (xi.features == 0)
+ continue; // bogus xinfo, maybe #20751 or similar, skipping
if (first) {
cached_up_osd_features = xi.features;
first = false;
vector<int> raw_up;
int primary;
tmpmap.pg_to_raw_up(pg.first, &raw_up, &primary);
+ bool remove = false;
if (vectors_equal(raw_up, pg.second)) {
ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
<< pg.second << " that matches raw_up mapping" << dendl;
+ remove = true;
+ }
+ // oversized pg_temp?
+ if (pg.second.size() > tmpmap.get_pg_pool(pg.first.pool())->get_size()) {
+ ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
+ << pg.second << " exceeds pool size" << dendl;
+ remove = true;
+ }
+ if (remove) {
if (osdmap.pg_temp->count(pg.first))
pending_inc->new_pg_temp[pg.first].clear();
else
}
}
+// Cancel pg_upmap / pg_upmap_items entries that are no longer valid.
+//
+// Builds a scratch map (a deepish copy of osdmap with pending_inc applied)
+// and examines every PG that has an upmap entry either in that scratch map
+// or in the pending incremental.  Walking the PG's raw up set, the PG is
+// marked for cancellation when
+//   - two OSDs resolve to the same parent of the rule's failure-domain type,
+//   - an OSD is missing from the rule's crush weight map, or
+//   - an OSD's get_weightf() multiplied by its crush weight is zero.
+// For each cancelled PG the matching entries are erased from
+// pending_inc->new_pg_upmap / new_pg_upmap_items, and if osdmap itself
+// carries an entry the PG is added to pending_inc->old_pg_upmap /
+// old_pg_upmap_items.
+//
+// PGs whose crush rule, weight map or failure-domain type cannot be loaded
+// are logged and left untouched.
+//
+// @param cct          context used for logging
+// @param osdmap       the map that pending_inc is relative to
+// @param pending_inc  incremental to validate; modified in place
+void OSDMap::maybe_remove_pg_upmaps(CephContext *cct,
+                                    const OSDMap& osdmap,
+                                    Incremental *pending_inc)
+{
+  ldout(cct, 10) << __func__ << dendl;
+  OSDMap tmpmap;
+  tmpmap.deepish_copy_from(osdmap);
+  tmpmap.apply_incremental(*pending_inc);
+  set<pg_t> to_check;
+  set<pg_t> to_cancel;
+  // crush rule id -> (osd -> crush weight); filled lazily, once per rule
+  map<int, map<int, float>> rule_weight_map;
+
+  for (auto& p : tmpmap.pg_upmap) {
+    to_check.insert(p.first);
+  }
+  for (auto& p : tmpmap.pg_upmap_items) {
+    to_check.insert(p.first);
+  }
+  for (auto& p : pending_inc->new_pg_upmap) {
+    to_check.insert(p.first);
+  }
+  for (auto& p : pending_inc->new_pg_upmap_items) {
+    to_check.insert(p.first);
+  }
+  for (auto& pg : to_check) {
+    auto crush_rule = tmpmap.get_pg_pool_crush_rule(pg);
+    if (crush_rule < 0) {
+      lderr(cct) << __func__ << " unable to load crush-rule of pg "
+                 << pg << dendl;
+      continue;
+    }
+    // Borrow the cached weight map rather than copying it for every pg.
+    // std::map does not relocate its elements on insertion, so the pointer
+    // stays valid while rule_weight_map grows.
+    const map<int, float> *weight_map = nullptr;
+    auto cached = rule_weight_map.find(crush_rule);
+    if (cached != rule_weight_map.end()) {
+      weight_map = &cached->second;
+    } else {
+      map<int, float> fresh;
+      auto r = tmpmap.crush->get_rule_weight_osd_map(crush_rule, &fresh);
+      if (r < 0) {
+        lderr(cct) << __func__ << " unable to get crush weight_map for "
+                   << "crush_rule " << crush_rule << dendl;
+        continue;  // failures are not cached
+      }
+      auto& slot = rule_weight_map[crush_rule];
+      slot.swap(fresh);
+      weight_map = &slot;
+    }
+    auto type = tmpmap.crush->get_rule_failure_domain(crush_rule);
+    if (type < 0) {
+      lderr(cct) << __func__ << " unable to load failure-domain-type of pg "
+                 << pg << dendl;
+      continue;
+    }
+    ldout(cct, 10) << __func__ << " pg " << pg
+                   << " crush-rule-id " << crush_rule
+                   << " weight_map " << *weight_map
+                   << " failure-domain-type " << type
+                   << dendl;
+    vector<int> raw;
+    int primary;
+    tmpmap.pg_to_raw_up(pg, &raw, &primary);
+    set<int> parents;
+    for (auto osd : raw) {
+      if (type > 0) {
+        auto parent = tmpmap.crush->get_parent_of_type(osd, type, crush_rule);
+        // NOTE(review): a non-negative result is treated as "no parent
+        // found"; presumably valid parents are crush buckets, which carry
+        // negative ids -- confirm against CrushWrapper.
+        if (parent >= 0) {
+          lderr(cct) << __func__ << " unable to get parent of raw osd."
+                     << osd << " of pg " << pg
+                     << dendl;
+          break;
+        }
+        auto inserted = parents.insert(parent);
+        if (!inserted.second) {
+          // two up-set osds come from same parent
+          to_cancel.insert(pg);
+          break;
+        }
+      }
+      // the above check validates collision only
+      // below we continue to check against crush-topology changing..
+      auto w = weight_map->find(osd);
+      if (w == weight_map->end()) {
+        // osd is gone or has been moved out of the specific crush-tree
+        to_cancel.insert(pg);
+        break;
+      }
+      auto adjusted_weight = tmpmap.get_weightf(w->first) * w->second;
+      if (adjusted_weight == 0) {
+        // osd is out/crush-out
+        to_cancel.insert(pg);
+        break;
+      }
+    }
+  }
+  for (auto &pg: to_cancel) {
+    { // pg_upmap
+      auto it = pending_inc->new_pg_upmap.find(pg);
+      if (it != pending_inc->new_pg_upmap.end()) {
+        ldout(cct, 10) << __func__ << " cancel invalid pending "
+                       << "pg_upmap entry "
+                       << it->first << "->" << it->second
+                       << dendl;
+        pending_inc->new_pg_upmap.erase(it);
+      }
+      // single lookup; the entry is only read for logging
+      auto existing = osdmap.pg_upmap.find(pg);
+      if (existing != osdmap.pg_upmap.end()) {
+        ldout(cct, 10) << __func__ << " cancel invalid pg_upmap entry "
+                       << existing->first << "->"
+                       << existing->second
+                       << dendl;
+        pending_inc->old_pg_upmap.insert(pg);
+      }
+    }
+    { // pg_upmap_items
+      auto it = pending_inc->new_pg_upmap_items.find(pg);
+      if (it != pending_inc->new_pg_upmap_items.end()) {
+        ldout(cct, 10) << __func__ << " cancel invalid pending "
+                       << "pg_upmap_items entry "
+                       << it->first << "->" << it->second
+                       << dendl;
+        pending_inc->new_pg_upmap_items.erase(it);
+      }
+      // single lookup; the entry is only read for logging
+      auto existing = osdmap.pg_upmap_items.find(pg);
+      if (existing != osdmap.pg_upmap_items.end()) {
+        ldout(cct, 10) << __func__ << " cancel invalid "
+                       << "pg_upmap_items entry "
+                       << existing->first << "->"
+                       << existing->second
+                       << dendl;
+        pending_inc->old_pg_upmap_items.insert(pg);
+      }
+    }
+  }
+}
+
int OSDMap::apply_incremental(const Incremental &inc)
{
new_blacklist_entries = false;
if (p != pg_upmap.end()) {
// make sure targets aren't marked out
for (auto osd : p->second) {
- if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd_weight[osd] == 0) {
+ if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd >= 0 &&
+ osd_weight[osd] == 0) {
// reject/ignore the explicit mapping
return;
}
auto q = pg_upmap_items.find(pg);
if (q != pg_upmap_items.end()) {
- for (auto& i : *raw) {
- for (auto& r : q->second) {
- if (r.first != i) {
- continue;
- }
- if (!(r.second != CRUSH_ITEM_NONE &&
- r.second < max_osd &&
- osd_weight[r.second] == 0)) {
- i = r.second;
- }
- break;
+ // NOTE: this approach does not allow a bidirectional swap,
+ // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
+ for (auto& r : q->second) {
+ // make sure the replacement value doesn't already appear
+ bool exists = false;
+ ssize_t pos = -1;
+ for (unsigned i = 0; i < raw->size(); ++i) {
+ int osd = (*raw)[i];
+ if (osd == r.second) {
+ exists = true;
+ break;
+ }
+ // ignore mapping if target is marked out (or invalid osd id)
+ if (osd == r.first &&
+ pos < 0 &&
+ !(r.second != CRUSH_ITEM_NONE && r.second < max_osd &&
+ r.second >= 0 && osd_weight[r.second] == 0)) {
+ pos = i;
+ }
+ }
+ if (!exists && pos >= 0) {
+ (*raw)[pos] = r.second;
}
}
}
return false; // same primary (tho replicas may have changed)
}
+// Return SIGNIFICANT_FEATURES with the feature bits of every release newer
+// than require_osd_release masked out.
+uint64_t OSDMap::get_encoding_features() const
+{
+  // gather all bits to strip first, then apply the mask in one step
+  uint64_t drop = 0;
+  if (require_osd_release < CEPH_RELEASE_JEWEL) {
+    drop |= CEPH_FEATURE_SERVER_JEWEL |
+            CEPH_FEATURE_NEW_OSDOP_ENCODING |
+            CEPH_FEATURE_CRUSH_TUNABLES5;
+  }
+  if (require_osd_release < CEPH_RELEASE_KRAKEN) {
+    drop |= CEPH_FEATURE_SERVER_KRAKEN |
+            CEPH_FEATURE_MSG_ADDR2;
+  }
+  if (require_osd_release < CEPH_RELEASE_LUMINOUS) {
+    drop |= CEPH_FEATURE_SERVER_LUMINOUS |
+            CEPH_FEATURE_CRUSH_CHOOSE_ARGS;
+  }
+  const uint64_t significant = SIGNIFICANT_FEATURES;
+  return significant & ~drop;
+}
// serialize, unserialize
void OSDMap::encode_client_old(bufferlist& bl) const
ENCODE_START(8, 7, bl);
{
+ // NOTE: any new encoding dependencies must be reflected by
+ // SIGNIFICANT_FEATURES
uint8_t v = 6;
if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
v = 3;
}
{
+ // NOTE: any new encoding dependencies must be reflected by
+ // SIGNIFICANT_FEATURES
uint8_t target_v = 5;
if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
target_v = 1;
s += ",require_luminous_osds";
if (f & CEPH_OSDMAP_RECOVERY_DELETES)
s += ",recovery_deletes";
+ if (f & CEPH_OSDMAP_PURGED_SNAPDIRS)
+ s += ",purged_snapdirs";
if (s.length())
s.erase(0, 1);
return s;
out << " nearfull";
}
-bool OSDMap::crush_ruleset_in_use(int ruleset) const
+// Return true if at least one entry in `pools` has its crush_rule equal to
+// rule_id, false otherwise.
+bool OSDMap::crush_rule_in_use(int rule_id) const
{
for (const auto &pool : pools) {
- if (pool.second.crush_rule == ruleset)
+ if (pool.second.crush_rule == rule_id)
return true;
}
return false;
}
+// Check every pool of this map against the rules of a candidate crush map.
+//
+// For each pool, in order, the pool's crush rule must
+//   1. exist in newcrush,
+//   2. have a mask ruleset equal to its own rule id,
+//   3. have a mask type equal to the pool type, and
+//   4. have a [min_size, max_size] mask range that contains the pool size.
+//
+// @param newcrush  crush map to validate against
+// @param ss        receives a description of the first violation found
+// @return 0 if all pools pass, -EINVAL on the first violation
+int OSDMap::validate_crush_rules(CrushWrapper *newcrush,
+                                 ostream *ss) const
+{
+  for (auto& entry : pools) {
+    auto& pg_pool = entry.second;
+    const int rule = pg_pool.get_crush_rule();
+
+    if (!newcrush->rule_exists(rule)) {
+      *ss << "pool " << entry.first << " references crush_rule " << rule
+          << " but it is not present";
+      return -EINVAL;
+    }
+
+    if (newcrush->get_rule_mask_ruleset(rule) != rule) {
+      *ss << "rule " << rule << " mask ruleset does not match rule id";
+      return -EINVAL;
+    }
+
+    if (newcrush->get_rule_mask_type(rule) != static_cast<int>(pg_pool.get_type())) {
+      *ss << "pool " << entry.first << " type does not match rule " << rule;
+      return -EINVAL;
+    }
+
+    // the pool size has to sit inside the rule's [min_size, max_size] mask
+    const bool size_in_range =
+      pg_pool.get_size() >= static_cast<int>(newcrush->get_rule_mask_min_size(rule)) &&
+      pg_pool.get_size() <= static_cast<int>(newcrush->get_rule_mask_max_size(rule));
+    if (!size_in_range) {
+      *ss << "pool " << entry.first << " size " << pg_pool.get_size() << " does not"
+          << " fall within rule " << rule
+          << " min_size " << newcrush->get_rule_mask_min_size(rule)
+          << " and max_size " << newcrush->get_rule_mask_max_size(rule);
+      return -EINVAL;
+    }
+  }
+  return 0;
+}
+
int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
int nosd, int pg_bits, int pgp_bits,
bool default_pool)
tmp.crush->get_rule_weight_osd_map(ruleno, &pmap);
ldout(cct,30) << __func__ << " pool " << i.first << " ruleno " << ruleno << dendl;
for (auto p : pmap) {
- osd_weight[p.first] += p.second;
- osd_weight_total += p.second;
+ auto adjusted_weight = tmp.get_weightf(p.first) * p.second;
+ if (adjusted_weight == 0) {
+ continue;
+ }
+ osd_weight[p.first] += adjusted_weight;
+ osd_weight_total += adjusted_weight;
}
}
for (auto& i : osd_weight) {
for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
int osd = p->second;
float deviation = p->first;
+ // make sure osd is still there (belongs to this crush-tree)
+ assert(osd_weight.count(osd));
float target = osd_weight[osd] * pgs_per_weight;
assert(target > 0);
if (deviation/target < max_deviation_ratio) {
return crush->get_leaves(name, osds);
}
+// Add to *pool_ids the pools whose crush rules might reference the given osd.
+//
+// Asks the crush map for the rules associated with the osd, ignores rules
+// that no pool currently uses, and forwards each remaining rule to
+// get_pool_ids_by_rule().  pool_ids must be non-null (asserted) and is not
+// cleared here.  A failing get_rules_by_osd() is logged and then asserted on.
+void OSDMap::get_pool_ids_by_osd(CephContext *cct,
+                                 int osd,
+                                 set<int64_t> *pool_ids) const
+{
+  assert(pool_ids);
+  set<int> candidate_rules;
+  int r = crush->get_rules_by_osd(osd, &candidate_rules);
+  if (r < 0) {
+    lderr(cct) << __func__ << " get_rules_by_osd failed: " << cpp_strerror(r)
+               << dendl;
+    assert(r >= 0);
+  }
+  for (const auto& rule : candidate_rules) {
+    // a rule that no pool uses is dead; skip it
+    if (!crush_rule_in_use(rule)) {
+      continue;
+    }
+    get_pool_ids_by_rule(rule, pool_ids);
+  }
+}
+
template <typename F>
class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
public:
*tbl << ""
<< ""
<< "" << "TOTAL"
- << si_t(pgs->get_osd_sum().kb << 10)
- << si_t(pgs->get_osd_sum().kb_used << 10)
- << si_t(pgs->get_osd_sum().kb_avail << 10)
+ << byte_u_t(pgs->get_osd_sum().kb << 10)
+ << byte_u_t(pgs->get_osd_sum().kb_used << 10)
+ << byte_u_t(pgs->get_osd_sum().kb_avail << 10)
<< lowprecision_t(average_util)
<< ""
<< TextTable::endrow;
<< c
<< weightf_t(qi.weight)
<< weightf_t(reweight)
- << si_t(kb << 10)
- << si_t(kb_used << 10)
- << si_t(kb_avail << 10)
+ << byte_u_t(kb << 10)
+ << byte_u_t(kb_used << 10)
+ << byte_u_t(kb_avail << 10)
<< lowprecision_t(util)
<< lowprecision_t(var);
{
// warn about flags
uint64_t warn_flags =
+ CEPH_OSDMAP_NEARFULL |
CEPH_OSDMAP_FULL |
CEPH_OSDMAP_PAUSERD |
CEPH_OSDMAP_PAUSEWR |
// OSD_UPGRADE_FINISHED
// none of these (yet) since we don't run until luminous upgrade is done.
- // POOL_FULL
+ // POOL_NEARFULL/BACKFILLFULL/FULL
{
- list<string> detail;
+ list<string> full_detail, backfillfull_detail, nearfull_detail;
for (auto it : get_pools()) {
const pg_pool_t &pool = it.second;
+ const string& pool_name = get_pool_name(it.first);
if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
- const string& pool_name = get_pool_name(it.first);
stringstream ss;
- ss << "pool '" << pool_name << "' is full";
- detail.push_back(ss.str());
+ if (pool.has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
+ // may run out of space too,
+ // but we want EQUOTA taking precedence
+ ss << "pool '" << pool_name << "' is full (no quota)";
+ } else {
+ ss << "pool '" << pool_name << "' is full (no space)";
+ }
+ full_detail.push_back(ss.str());
+ } else if (pool.has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
+ stringstream ss;
+ ss << "pool '" << pool_name << "' is backfillfull";
+ backfillfull_detail.push_back(ss.str());
+ } else if (pool.has_flag(pg_pool_t::FLAG_NEARFULL)) {
+ stringstream ss;
+ ss << "pool '" << pool_name << "' is nearfull";
+ nearfull_detail.push_back(ss.str());
}
}
- if (!detail.empty()) {
+ if (!full_detail.empty()) {
ostringstream ss;
- ss << detail.size() << " pool(s) full";
+ ss << full_detail.size() << " pool(s) full";
auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str());
- d.detail.swap(detail);
+ d.detail.swap(full_detail);
}
+ if (!backfillfull_detail.empty()) {
+ ostringstream ss;
+ ss << backfillfull_detail.size() << " pool(s) backfillfull";
+ auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str());
+ d.detail.swap(backfillfull_detail);
+ }
+ if (!nearfull_detail.empty()) {
+ ostringstream ss;
+ ss << nearfull_detail.size() << " pool(s) nearfull";
+ auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str());
+ d.detail.swap(nearfull_detail);
+ }
+ }
+}
+
+// Parse a list of osd id strings into a set of osd ids.
+//
+// *out is cleared first.  If the FIRST element is "any", "all" or "*", *out
+// is filled via get_all_osds() and the rest of the list is ignored; the
+// wildcard is not recognised at any later position.  Every other element is
+// handed to parse_osd_id().
+//
+// @param ls   strings to parse
+// @param out  receives the parsed ids; on failure it keeps whatever was
+//             inserted before the offending element
+// @param ss   passed to parse_osd_id() and receives "invalid osd id '...'"
+//             for an element that parses to a negative value
+// @return 0 on success, -EINVAL on the first element that fails to parse
+int OSDMap::parse_osd_id_list(const vector<string>& ls, set<int> *out,
+ ostream *ss) const
+{
+ out->clear();
+ for (auto i = ls.begin(); i != ls.end(); ++i) {
+ // wildcard is honoured only as the very first element
+ if (i == ls.begin() &&
+ (*i == "any" || *i == "all" || *i == "*")) {
+ get_all_osds(*out);
+ break;
+ }
+ long osd = parse_osd_id(i->c_str(), ss);
+ if (osd < 0) {
+ *ss << "invalid osd id '" << *i << "'";
+ return -EINVAL;
+ }
+ out->insert(osd);
}
+ return 0;
}