#include <sstream>
#include <boost/utility.hpp>
+#include <boost/regex.hpp>
#include "MDSMonitor.h"
#include "FSCommands.h"
#define dout_subsys ceph_subsys_mon
#undef dout_prefix
-#define dout_prefix _prefix(_dout, mon, fsmap)
-static ostream& _prefix(std::ostream *_dout, Monitor *mon, FSMap const& fsmap) {
+#define dout_prefix _prefix(_dout, mon, get_fsmap())
+static ostream& _prefix(std::ostream *_dout, Monitor *mon, const FSMap& fsmap) { // writes the log prefix "mon.<name>@<rank>(<state>).mds e<epoch> " to *_dout and returns that stream
return *_dout << "mon." << mon->name << "@" << mon->rank
<< "(" << mon->get_state_name()
<< ").mds e" << fsmap.get_epoch() << " ";
}
+static const string MDS_METADATA_PREFIX("mds_metadata");
+static const string MDS_HEALTH_PREFIX("mds_health");
+
+
/*
* Specialized implementation of cmd_getval to allow us to parse
* out strongly-typedef'd types
return cmd_getval(cct, cmdmap, k, (int64_t&)val);
}
-static const string MDS_METADATA_PREFIX("mds_metadata");
-
-
// my methods
-void MDSMonitor::print_map(FSMap &m, int dbl)
+void MDSMonitor::print_map(const FSMap &m, int dbl)
{
dout(dbl) << "print_map\n";
m.print(*_dout);
dout(10) << "create_initial" << dendl;
}
+void MDSMonitor::get_store_prefixes(std::set<string>& s) // inserts three key prefixes into the caller-supplied set s
+{
+ s.insert(service_name); // service_name is declared outside this view
+ s.insert(MDS_METADATA_PREFIX); // "mds_metadata"
+ s.insert(MDS_HEALTH_PREFIX); // "mds_health"
+}
void MDSMonitor::update_from_paxos(bool *need_bootstrap)
{
version_t version = get_last_committed();
- if (version == fsmap.epoch)
+ if (version == get_fsmap().epoch)
return;
dout(10) << __func__ << " version " << version
- << ", my e " << fsmap.epoch << dendl;
- assert(version > fsmap.epoch);
+ << ", my e " << get_fsmap().epoch << dendl;
+ assert(version > get_fsmap().epoch);
+
+ load_health();
// read and decode
bufferlist fsmap_bl;
assert(fsmap_bl.length() > 0);
dout(10) << __func__ << " got " << version << dendl;
- fsmap.decode(fsmap_bl);
+ PaxosFSMap::decode(fsmap_bl);
// new map
- dout(4) << "new map" << dendl;
- print_map(fsmap, 0);
+ dout(0) << "new map" << dendl;
+ print_map(get_fsmap(), 0);
if (!g_conf->mon_mds_skip_sanity) {
- fsmap.sanity();
+ get_fsmap().sanity();
}
check_subs();
void MDSMonitor::create_pending()
{
- pending_fsmap = fsmap;
- pending_fsmap.epoch++;
+ auto &fsmap = PaxosFSMap::create_pending(); // from here on, fsmap names the map returned by PaxosFSMap::create_pending(), replacing the old explicit copy + epoch++
+
+ if (mon->osdmon()->is_readable()) { // sanitize only when the OSD monitor reports itself readable
+ const auto &osdmap = mon->osdmon()->osdmap;
+ fsmap.sanitize([&osdmap](int64_t pool){return osdmap.have_pg_pool(pool);}); // predicate: does the OSDMap have this pool id? NOTE(review): presumably sanitize() uses it to drop references to missing pools - confirm in FSMap::sanitize
+ }
- dout(10) << "create_pending e" << pending_fsmap.epoch << dendl;
+ dout(10) << "create_pending e" << fsmap.epoch << dendl; // logs the epoch of the map obtained above
}
void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
{
- dout(10) << "encode_pending e" << pending_fsmap.epoch << dendl;
+ auto &pending = get_pending_fsmap_writeable(); // encode_pending: encodes the pending FSMap, per-daemon MDS health and the derived health checks into transaction t
+ auto &epoch = pending.epoch; // reference, so it always reads pending.epoch
+ dout(10) << "encode_pending e" << epoch << dendl;
// print map iff 'debug mon = 30' or higher
- print_map(pending_fsmap, 30);
+ print_map(get_pending_fsmap(), 30);
if (!g_conf->mon_mds_skip_sanity) {
- pending_fsmap.sanity();
+ pending.sanity();
}
// Set 'modified' on maps modified this epoch
- for (auto &i : fsmap.filesystems) {
- if (i.second->mds_map.epoch == fsmap.epoch) {
- i.second->mds_map.modified = ceph_clock_now();
+ for (auto &p : pending.filesystems) { // now walks the pending map's filesystems rather than fsmap's
+ if (p.second->mds_map.epoch == epoch) { // only filesystems whose MDSMap epoch equals the pending epoch
+ p.second->mds_map.modified = ceph_clock_now();
}
}
// apply to paxos
- assert(get_last_committed() + 1 == pending_fsmap.epoch);
- bufferlist fsmap_bl;
- pending_fsmap.encode(fsmap_bl, mon->get_quorum_con_features());
+ assert(get_last_committed() + 1 == pending.epoch); // pending epoch must be exactly one past the last committed version
+ bufferlist pending_bl;
+ pending.encode(pending_bl, mon->get_quorum_con_features());
/* put everything in the transaction */
- put_version(t, pending_fsmap.epoch, fsmap_bl);
- put_last_committed(t, pending_fsmap.epoch);
+ put_version(t, pending.epoch, pending_bl);
+ put_last_committed(t, pending.epoch);
// Encode MDSHealth data
for (std::map<uint64_t, MDSHealth>::iterator i = pending_daemon_health.begin();
t->erase(MDS_HEALTH_PREFIX, stringify(*i));
}
pending_daemon_health_rm.clear();
- remove_from_metadata(t);
+ remove_from_metadata(pending, t);
+
+ // health
+ health_check_map_t new_checks; // filled below, then passed to encode_health() together with t
+ const auto &info_map = pending.get_mds_info();
+ for (const auto &i : info_map) { // one iteration per (gid, mds info) entry of the pending map
+ const auto &gid = i.first;
+ const auto &info = i.second;
+ if (pending_daemon_health_rm.count(gid)) { // NOTE(review): pending_daemon_health_rm.clear() runs above, before this loop, so this looks like it can never be true here - confirm
+ continue;
+ }
+ MDSHealth health;
+ auto p = pending_daemon_health.find(gid); // use the in-memory pending health for this gid when there is one
+ if (p != pending_daemon_health.end()) {
+ health = p->second;
+ } else { // otherwise read the stored blob from the mon store
+ bufferlist bl;
+ mon->store->get(MDS_HEALTH_PREFIX, stringify(gid), bl); // key is the stringified gid under MDS_HEALTH_PREFIX
+ if (!bl.length()) { // nothing stored for this gid: log an error and skip the daemon
+ derr << "Missing health data for MDS " << gid << dendl;
+ continue;
+ }
+ bufferlist::iterator bl_i = bl.begin();
+ health.decode(bl_i);
+ }
+ for (const auto &metric : health.metrics) { // each metric adds one detail line to the check looked up by its type
+ const int rank = info.rank;
+ health_check_t *check = &new_checks.get_or_add( // looked up (or added) by mds_metric_name(metric.type)
+ mds_metric_name(metric.type),
+ metric.sev,
+ mds_metric_summary(metric.type));
+ ostringstream ss;
+ ss << "mds" << info.name << "(mds." << rank << "): " << metric.message; // detail format: "mds<name>(mds.<rank>): <message>"
+ bool first = true;
+ for (auto &p : metric.metadata) { // append metadata as "key: value", space before the first, ", " before the rest
+ if (first) {
+ ss << " ";
+ } else {
+ ss << ", ";
+ }
+ ss << p.first << ": " << p.second;
+ first = false;
+ }
+ check->detail.push_back(ss.str());
+ }
+ }
+ pending.get_health_checks(&new_checks); // NOTE(review): presumably adds the FSMap's own checks to new_checks - confirm in FSMap::get_health_checks
+ for (auto& p : new_checks.checks) { // fill in the summary placeholders from the number of detail lines
+ p.second.summary = boost::regex_replace(
+ p.second.summary,
+ boost::regex("%num%"),
+ stringify(p.second.detail.size())); // %num% -> count of detail lines
+ p.second.summary = boost::regex_replace(
+ p.second.summary,
+ boost::regex("%plurals%"),
+ p.second.detail.size() > 1 ? "s" : ""); // %plurals% -> "s" only when more than one detail line
+ p.second.summary = boost::regex_replace(
+ p.second.summary,
+ boost::regex("%isorare%"),
+ p.second.detail.size() > 1 ? "are" : "is"); // %isorare% -> "are" / "is"
+ p.second.summary = boost::regex_replace(
+ p.second.summary,
+ boost::regex("%hasorhave%"),
+ p.second.detail.size() > 1 ? "have" : "has"); // %hasorhave% -> "have" / "has"
+ }
+ encode_health(new_checks, t); // passes the finished checks and the transaction to encode_health()
}
version_t MDSMonitor::get_trim_to()
{
dout(10) << "update_logger" << dendl;
+ const auto &fsmap = get_fsmap();
+
uint64_t up = 0;
uint64_t in = 0;
uint64_t failed = 0;
mds_gid_t gid = mds_gid_t(m->get_global_id());
version_t seq = m->get_seq();
- dout(15) << "_note_beacon " << *m << " noting time" << dendl;
- last_beacon[gid].stamp = ceph_clock_now();
- last_beacon[gid].seq = seq;
+ dout(5) << "_note_beacon " << *m << " noting time" << dendl;
+ auto &beacon = last_beacon[gid];
+ beacon.stamp = mono_clock::now();
+ beacon.seq = seq;
}
bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
MDSMap::mds_info_t info;
epoch_t effective_epoch = 0;
+ const auto &fsmap = get_fsmap();
+
// check privileges, ignore if fails
MonSession *session = m->get_session();
assert(session);
goto ignore;
}
- dout(12) << "preprocess_beacon " << *m
+ dout(5) << "preprocess_beacon " << *m
<< " from " << m->get_orig_source_inst()
<< " " << m->get_compat()
<< dendl;
}
// fw to leader?
- if (!mon->is_leader())
+ if (!is_leader())
return false;
// booted, but not in map?
- if (!pending_fsmap.gid_exists(gid)) {
+ if (!fsmap.gid_exists(gid)) {
if (state != MDSMap::STATE_BOOT) {
dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
<< ceph_mds_state_name(state) << ")" << dendl;
+ /* We can't send an MDSMap this MDS was a part of because we no longer
+ * know which FS it was part of. Nor does this matter. Sending an empty
+ * MDSMap is sufficient for getting the MDS to respawn.
+ */
MDSMap null_map;
null_map.epoch = fsmap.epoch;
null_map.compat = fsmap.compat;
}
}
dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
- info = pending_fsmap.get_info_gid(gid);
+ info = fsmap.get_info_gid(gid);
// old seq?
if (info.state_seq > seq) {
// Work out the latest epoch that this daemon should have seen
{
- fs_cluster_id_t fscid = pending_fsmap.mds_roles.at(gid);
+ fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
if (fscid == FS_CLUSTER_ID_NONE) {
- effective_epoch = pending_fsmap.standby_epochs.at(gid);
+ effective_epoch = fsmap.standby_epochs.at(gid);
} else {
- effective_epoch = pending_fsmap.get_filesystem(fscid)->mds_map.epoch;
+ effective_epoch = fsmap.get_filesystem(fscid)->mds_map.epoch;
}
if (effective_epoch != m->get_last_epoch_seen()) {
dout(10) << "mds_beacon " << *m
// and return false (i.e. require proposal) if they
// do not match, to update our stored
if (!(pending_daemon_health[gid] == m->get_health())) {
- dout(20) << __func__ << " health metrics for gid " << gid << " were updated" << dendl;
+ dout(10) << __func__ << " health metrics for gid " << gid << " were updated" << dendl;
_note_beacon(m);
return false;
}
op->mark_mdsmon_event(__func__);
MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
+
+ const auto &fsmap = get_fsmap();
// check privileges, ignore message if fails
MonSession *session = m->get_session();
if (!session)
- goto done;
+ goto ignore;
if (!session->is_capable("mds", MON_CAP_X)) {
dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
<< session->caps << dendl;
- goto done;
+ goto ignore;
}
if (fsmap.gid_exists(m->global_id) &&
m->targets == fsmap.get_info_gid(m->global_id).export_targets)
- goto done;
+ goto ignore;
return false;
- done:
+ ignore:
+ mon->no_reply(op);
return true;
}
MDSMap::DaemonState state = m->get_state();
version_t seq = m->get_seq();
- dout(20) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
+ auto &pending = get_pending_fsmap_writeable();
+
+ dout(15) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
// Calculate deltas of health metrics created and removed
// Do this by type rather than MDSHealthMetric equality, because messages can
for (const auto &new_metric: new_health) {
if (old_types.count(new_metric.type) == 0) {
- std::stringstream msg;
- msg << "MDS health message (" << m->get_orig_source_inst().name << "): "
- << new_metric.message;
- if (new_metric.sev == HEALTH_ERR) {
- mon->clog->error() << msg.str();
- } else if (new_metric.sev == HEALTH_WARN) {
- mon->clog->warn() << msg.str();
- } else {
- mon->clog->info() << msg.str();
- }
+ dout(10) << "MDS health message (" << m->get_orig_source_inst().name
+ << "): " << new_metric.sev << " " << new_metric.message << dendl;
}
}
// zap previous instance of this name?
if (g_conf->mds_enforce_unique_name) {
bool failed_mds = false;
- while (mds_gid_t existing = pending_fsmap.find_mds_gid_by_name(m->get_name())) {
+ while (mds_gid_t existing = pending.find_mds_gid_by_name(m->get_name())) {
if (!mon->osdmon()->is_writeable()) {
mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
return false;
}
- mon->clog->info() << "MDS daemon '" << m->get_name() << "' restarted";
- fail_mds_gid(existing);
+ const MDSMap::mds_info_t &existing_info =
+ pending.get_info_gid(existing);
+ mon->clog->info() << existing_info.human_name() << " restarted";
+ fail_mds_gid(pending, existing);
failed_mds = true;
}
if (failed_mds) {
}
// Add this daemon to the map
- if (pending_fsmap.mds_roles.count(gid) == 0) {
+ if (pending.mds_roles.count(gid) == 0) {
MDSMap::mds_info_t new_info;
new_info.global_id = gid;
new_info.name = m->get_name();
new_info.standby_for_name = m->get_standby_for_name();
new_info.standby_for_fscid = m->get_standby_for_fscid();
new_info.standby_replay = m->get_standby_replay();
- pending_fsmap.insert(new_info);
+ pending.insert(new_info);
}
// Resolve standby_for_name to a rank
- const MDSMap::mds_info_t &info = pending_fsmap.get_info_gid(gid);
+ const MDSMap::mds_info_t &info = pending.get_info_gid(gid);
if (!info.standby_for_name.empty()) {
- const MDSMap::mds_info_t *leaderinfo = fsmap.find_by_name(
+ const MDSMap::mds_info_t *leaderinfo = pending.find_by_name(
info.standby_for_name);
if (leaderinfo && (leaderinfo->rank >= 0)) {
- auto fscid = pending_fsmap.mds_roles.at(leaderinfo->global_id);
- auto fs = pending_fsmap.get_filesystem(fscid);
- bool followable = fs->mds_map.is_followable(leaderinfo->rank);
+ const auto &fscid = pending.mds_roles.at(leaderinfo->global_id);
- pending_fsmap.modify_daemon(gid, [fscid, leaderinfo, followable](
+ pending.modify_daemon(gid, [fscid, leaderinfo](
MDSMap::mds_info_t *info) {
info->standby_for_rank = leaderinfo->rank;
info->standby_for_fscid = fscid;
}
// initialize the beacon timer
- last_beacon[gid].stamp = ceph_clock_now();
- last_beacon[gid].seq = seq;
+ auto &beacon = last_beacon[gid];
+ beacon.stamp = mono_clock::now();
+ beacon.seq = seq;
// new incompat?
- if (!pending_fsmap.compat.writeable(m->get_compat())) {
- dout(10) << " fsmap " << pending_fsmap.compat
+ if (!pending.compat.writeable(m->get_compat())) {
+ dout(10) << " fsmap " << pending.compat
<< " can't write to new mds' " << m->get_compat()
<< ", updating fsmap and killing old mds's"
<< dendl;
- pending_fsmap.update_compat(m->get_compat());
+ pending.update_compat(m->get_compat());
}
update_metadata(m->get_global_id(), m->get_sys_info());
} else {
// state update
- const MDSMap::mds_info_t &info = pending_fsmap.get_info_gid(gid);
+
+ if (!pending.gid_exists(gid)) {
+ /* gid has been removed from pending, send null map */
+ dout(5) << "mds_beacon " << *m << " is not in fsmap (state "
+ << ceph_mds_state_name(state) << ")" << dendl;
+
+ /* We can't send an MDSMap this MDS was a part of because we no longer
+ * know which FS it was part of. Nor does this matter. Sending an empty
+ * MDSMap is sufficient for getting the MDS to respawn.
+ */
+ wait_for_finished_proposal(op, new FunctionContext([op, this](int r){
+ if (r >= 0) {
+ const auto& fsmap = get_fsmap();
+ MDSMap null_map;
+ null_map.epoch = fsmap.epoch;
+ null_map.compat = fsmap.compat;
+ mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
+ } else {
+ dispatch(op); // try again
+ }
+ }));
+ return true;
+ }
+
+ const MDSMap::mds_info_t &info = pending.get_info_gid(gid);
// Old MDS daemons don't mention that they're standby replay until
// after they've sent their boot beacon, so update this field.
if (info.standby_replay != m->get_standby_replay()) {
- pending_fsmap.modify_daemon(info.global_id, [&m](
+ pending.modify_daemon(info.global_id, [&m](
MDSMap::mds_info_t *i)
{
i->standby_replay = m->get_standby_replay();
}
if (info.laggy()) {
- dout(10) << "prepare_beacon clearing laggy flag on " << addr << dendl;
- pending_fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info)
+ dout(1) << "prepare_beacon clearing laggy flag on " << addr << dendl;
+ pending.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info)
{
info->clear_laggy();
}
);
}
- dout(10) << "prepare_beacon mds." << info.rank
+ dout(5) << "prepare_beacon mds." << info.rank
<< " " << ceph_mds_state_name(info.state)
<< " -> " << ceph_mds_state_name(state)
<< " standby_for_rank=" << m->get_standby_for_rank()
<< dendl;
if (state == MDSMap::STATE_STOPPED) {
- auto erased = pending_fsmap.stop(gid);
+ const auto fscid = pending.mds_roles.at(gid);
+ const auto &fs = pending.get_filesystem(fscid);
+
+ mon->clog->info() << info.human_name() << " finished "
+ << "deactivating rank " << info.rank << " in filesystem "
+ << fs->mds_map.fs_name << " (now has "
+ << fs->mds_map.get_num_in_mds() - 1 << " ranks)";
+
+ auto erased = pending.stop(gid);
erased.push_back(gid);
for (const auto &erased_gid : erased) {
pending_daemon_health_rm.insert(erased_gid);
}
}
+
+
} else if (state == MDSMap::STATE_DAMAGED) {
if (!mon->osdmon()->is_writeable()) {
- dout(4) << __func__ << ": DAMAGED from rank " << info.rank
+ dout(1) << __func__ << ": DAMAGED from rank " << info.rank
<< " waiting for osdmon writeable to blacklist it" << dendl;
mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
return false;
// Record this MDS rank as damaged, so that other daemons
// won't try to run it.
- dout(4) << __func__ << ": marking rank "
+ dout(0) << __func__ << ": marking rank "
<< info.rank << " damaged" << dendl;
utime_t until = ceph_clock_now();
- until += g_conf->mds_blacklist_interval;
+ until += g_conf->get_val<double>("mon_mds_blacklist_interval");
const auto blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
request_proposal(mon->osdmon());
- pending_fsmap.damaged(gid, blacklist_epoch);
+ pending.damaged(gid, blacklist_epoch);
last_beacon.erase(gid);
// Respond to MDS, so that it knows it can continue to shut down
mon->send_reply(op,
new MMDSBeacon(
mon->monmap->fsid, m->get_global_id(),
- m->get_name(), fsmap.get_epoch(), state, seq,
+ m->get_name(), pending.get_epoch(), state, seq,
CEPH_FEATURES_SUPPORTED_DEFAULT));
} else if (state == MDSMap::STATE_DNE) {
if (!mon->osdmon()->is_writeable()) {
- dout(4) << __func__ << ": DNE from rank " << info.rank
+ dout(1) << __func__ << ": DNE from rank " << info.rank
<< " waiting for osdmon writeable to blacklist it" << dendl;
mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
return false;
}
- fail_mds_gid(gid);
+ fail_mds_gid(pending, gid);
assert(mon->osdmon()->is_writeable());
request_proposal(mon->osdmon());
mon->send_reply(op,
new MMDSBeacon(
mon->monmap->fsid, m->get_global_id(),
- m->get_name(), fsmap.get_epoch(), state, seq,
+ m->get_name(), pending.get_epoch(), state, seq,
CEPH_FEATURES_SUPPORTED_DEFAULT));
} else if (info.state == MDSMap::STATE_STANDBY && state != info.state) {
// Standby daemons should never modify their own
<< ceph_mds_state_name(state) << dendl;
return true;
} else {
+ if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) {
+ const auto &fscid = pending.mds_roles.at(gid);
+ const auto &fs = pending.get_filesystem(fscid);
+ mon->clog->info() << info.human_name() << " is now active in "
+ << "filesystem " << fs->mds_map.fs_name << " as rank "
+ << info.rank;
+ }
+
// Made it through special cases and validations, record the
// daemon's reported state to the FSMap.
- pending_fsmap.modify_daemon(gid, [state, seq](MDSMap::mds_info_t *info) {
+ pending.modify_daemon(gid, [state, seq](MDSMap::mds_info_t *info) {
info->state = state;
info->state_seq = seq;
});
}
}
- dout(7) << "prepare_beacon pending map now:" << dendl;
- print_map(pending_fsmap);
+ dout(5) << "prepare_beacon pending map now:" << dendl;
+ print_map(pending);
wait_for_finished_proposal(op, new FunctionContext([op, this](int r){
if (r >= 0)
bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
{
+ auto &pending = get_pending_fsmap_writeable(); // prepare_offload_targets: applies the export targets from an MMDSLoadTargets message to this map; always returns true
+
op->mark_mdsmon_event(__func__);
MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
mds_gid_t gid = m->global_id;
- if (pending_fsmap.gid_has_rank(gid)) {
+ if (pending.gid_has_rank(gid)) { // update only when gid_has_rank() is true for the sender's gid; otherwise just log "not in map"
dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
- pending_fsmap.update_export_targets(gid, m->targets);
+ pending.update_export_targets(gid, m->targets); // store the targets carried by the message
} else {
dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
}
+ mon->no_reply(op); // called on both branches; NOTE(review): presumably marks the op as needing no reply - confirm in Monitor::no_reply
return true;
}
void MDSMonitor::_updated(MonOpRequestRef op)
{
+ const auto &fsmap = get_fsmap();
op->mark_mdsmon_event(__func__);
MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
- mon->clog->info() << m->get_orig_source_inst() << " "
+ mon->clog->debug() << m->get_orig_source_inst() << " "
<< ceph_mds_state_name(m->get_state());
if (m->get_state() == MDSMap::STATE_STOPPED) {
tick();
update_logger();
- if (mon->is_leader())
- mon->clog->info() << "fsmap " << fsmap;
+ if (is_leader()) {
+ mon->clog->debug() << "fsmap " << get_fsmap();
+ }
}
void MDSMonitor::get_health(list<pair<health_status_t, string> >& summary,
list<pair<health_status_t, string> > *detail,
CephContext* cct) const
{
+ const auto &fsmap = get_fsmap();
+
fsmap.get_health(summary, detail);
// For each MDS GID...
- const auto info_map = fsmap.get_mds_info();
+ const auto &info_map = fsmap.get_mds_info();
for (const auto &i : info_map) {
const auto &gid = i.first;
const auto &info = i.second;
health.decode(bl_i);
for (const auto &metric : health.metrics) {
- int const rank = info.rank;
+ const int rank = info.rank;
std::ostringstream message;
message << "mds" << rank << ": " << metric.message;
summary.push_back(std::make_pair(metric.sev, message.str()));
void MDSMonitor::dump_info(Formatter *f)
{
f->open_object_section("fsmap");
- fsmap.dump(f);
+ get_fsmap().dump(f);
f->close_section();
f->dump_unsigned("mdsmap_first_committed", get_first_committed());
stringstream ss, ds;
map<string, cmd_vartype> cmdmap;
+ const auto &fsmap = get_fsmap();
+
if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
// ss has reason for failure
string rs = ss.str();
cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
string format;
cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
- boost::scoped_ptr<Formatter> f(Formatter::create(format));
+ std::unique_ptr<Formatter> f(Formatter::create(format));
MonSession *session = m->get_session();
if (!session) {
int64_t epocharg;
epoch_t epoch;
- FSMap *p = &fsmap;
+ const FSMap *fsmapp = &get_fsmap();
+ FSMap dummy;
if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
epoch = epocharg;
bufferlist b;
int err = get_version(epoch, b);
if (err == -ENOENT) {
- p = 0;
r = -ENOENT;
+ goto out;
} else {
assert(err == 0);
assert(b.length());
- p = new FSMap;
- p->decode(b);
+ dummy.decode(b);
+ fsmapp = &dummy;
}
}
- if (p) {
- stringstream ds;
- const MDSMap *mdsmap = nullptr;
- MDSMap blank;
- blank.epoch = fsmap.epoch;
- if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
- mdsmap = &(fsmap.filesystems[fsmap.legacy_client_fscid]->mds_map);
- } else {
- mdsmap = ␣
- }
- if (f != NULL) {
- f->open_object_section("mdsmap");
- mdsmap->dump(f.get());
- f->close_section();
- f->flush(ds);
- r = 0;
- } else {
- mdsmap->print(ds);
- r = 0;
- }
- if (r == 0) {
- rdata.append(ds);
- ss << "dumped fsmap epoch " << p->get_epoch();
- }
- if (p != &fsmap) {
- delete p;
- }
+
+ stringstream ds;
+ const MDSMap *mdsmapp = nullptr;
+ MDSMap blank;
+ blank.epoch = fsmapp->epoch;
+ if (fsmapp->legacy_client_fscid != FS_CLUSTER_ID_NONE) {
+ mdsmapp = &fsmapp->filesystems.at(fsmapp->legacy_client_fscid)->mds_map;
+ } else {
+ mdsmapp = ␣
}
+ if (f != NULL) {
+ f->open_object_section("mdsmap");
+ mdsmapp->dump(f.get());
+ f->close_section();
+ f->flush(ds);
+ r = 0;
+ } else {
+ mdsmapp->print(ds);
+ r = 0;
+ }
+
+ rdata.append(ds);
+ ss << "dumped fsmap epoch " << fsmapp->get_epoch();
} else if (prefix == "fs dump") {
int64_t epocharg;
epoch_t epoch;
- FSMap *p = &fsmap;
+ const FSMap *fsmapp = &fsmap;
+ FSMap dummy;
if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
epoch = epocharg;
bufferlist b;
int err = get_version(epoch, b);
if (err == -ENOENT) {
- p = 0;
r = -ENOENT;
+ goto out;
} else {
assert(err == 0);
assert(b.length());
- p = new FSMap;
- p->decode(b);
+ dummy.decode(b);
+ fsmapp = &dummy;
}
}
- if (p) {
- stringstream ds;
- if (f != NULL) {
- f->open_object_section("fsmap");
- p->dump(f.get());
- f->close_section();
- f->flush(ds);
- r = 0;
- } else {
- p->print(ds);
- r = 0;
- }
- if (r == 0) {
- rdata.append(ds);
- ss << "dumped fsmap epoch " << p->get_epoch();
- }
- if (p != &fsmap)
- delete p;
+
+ stringstream ds;
+ if (f != NULL) {
+ f->open_object_section("fsmap");
+ fsmapp->dump(f.get());
+ f->close_section();
+ f->flush(ds);
+ r = 0;
+ } else {
+ fsmapp->print(ds);
+ r = 0;
}
+
+ rdata.append(ds);
+ ss << "dumped fsmap epoch " << fsmapp->get_epoch();
} else if (prefix == "mds metadata") {
if (!f)
f.reset(Formatter::create("json-pretty"));
f->open_object_section("mds");
f->dump_string("name", info.name);
std::ostringstream get_err;
- r = dump_metadata(info.name, f.get(), get_err);
+ r = dump_metadata(fsmap, info.name, f.get(), get_err);
if (r == -EINVAL || r == -ENOENT) {
// Drop error, list what metadata we do have
dout(1) << get_err.str() << dendl;
derr << "Unexpected error reading metadata: " << cpp_strerror(r)
<< dendl;
ss << get_err.str();
+ f->close_section();
break;
}
f->close_section();
} else {
// Dump a single daemon's metadata
f->open_object_section("mds_metadata");
- r = dump_metadata(who, f.get(), ss);
+ r = dump_metadata(fsmap, who, f.get(), ss);
f->close_section();
}
f->flush(ds);
} else if (prefix == "fs get") {
string fs_name;
cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
- auto fs = fsmap.get_filesystem(fs_name);
+ const auto &fs = fsmap.get_filesystem(fs_name);
if (fs == nullptr) {
ss << "filesystem '" << fs_name << "' not found";
r = -ENOENT;
} else if (prefix == "fs ls") {
if (f) {
f->open_array_section("filesystems");
- {
- for (const auto i : fsmap.filesystems) {
- const auto fs = i.second;
- f->open_object_section("filesystem");
- {
- const MDSMap &mds_map = fs->mds_map;
- f->dump_string("name", mds_map.fs_name);
- /* Output both the names and IDs of pools, for use by
- * humans and machines respectively */
- f->dump_string("metadata_pool", mon->osdmon()->osdmap.get_pool_name(
- mds_map.metadata_pool));
- f->dump_int("metadata_pool_id", mds_map.metadata_pool);
- f->open_array_section("data_pool_ids");
- {
- for (auto dpi = mds_map.data_pools.begin();
- dpi != mds_map.data_pools.end(); ++dpi) {
- f->dump_int("data_pool_id", *dpi);
- }
- }
- f->close_section();
-
- f->open_array_section("data_pools");
- {
- for (auto dpi = mds_map.data_pools.begin();
- dpi != mds_map.data_pools.end(); ++dpi) {
- const auto &name = mon->osdmon()->osdmap.get_pool_name(
- *dpi);
- f->dump_string("data_pool", name);
- }
- }
+ for (const auto &p : fsmap.filesystems) {
+ const auto &fs = p.second;
+ f->open_object_section("filesystem");
+ {
+ const MDSMap &mds_map = fs->mds_map;
+ f->dump_string("name", mds_map.fs_name);
+ /* Output both the names and IDs of pools, for use by
+ * humans and machines respectively */
+ f->dump_string("metadata_pool", mon->osdmon()->osdmap.get_pool_name(
+ mds_map.metadata_pool));
+ f->dump_int("metadata_pool_id", mds_map.metadata_pool);
+ f->open_array_section("data_pool_ids");
+ for (const auto &id : mds_map.data_pools) {
+ f->dump_int("data_pool_id", id);
+ }
+ f->close_section();
- f->close_section();
+ f->open_array_section("data_pools");
+ for (const auto &id : mds_map.data_pools) {
+ const auto &name = mon->osdmon()->osdmap.get_pool_name(id);
+ f->dump_string("data_pool", name);
}
f->close_section();
}
+ f->close_section();
}
f->close_section();
f->flush(ds);
} else {
- for (const auto i : fsmap.filesystems) {
- const auto fs = i.second;
+ for (const auto &p : fsmap.filesystems) {
+ const auto &fs = p.second;
const MDSMap &mds_map = fs->mds_map;
const string &md_pool_name = mon->osdmon()->osdmap.get_pool_name(
mds_map.metadata_pool);
ds << "name: " << mds_map.fs_name << ", metadata pool: "
<< md_pool_name << ", data pools: [";
- for (auto dpi : mds_map.data_pools) {
- const string &pool_name = mon->osdmon()->osdmap.get_pool_name(dpi);
+ for (const auto &id : mds_map.data_pools) {
+ const string &pool_name = mon->osdmon()->osdmap.get_pool_name(id);
ds << pool_name << " ";
}
ds << "]" << std::endl;
r = 0;
}
+out:
if (r != -1) {
rdata.append(ds);
string rs;
return false;
}
-bool MDSMonitor::fail_mds_gid(mds_gid_t gid)
+bool MDSMonitor::fail_mds_gid(FSMap &fsmap, mds_gid_t gid)
{
- const MDSMap::mds_info_t info = pending_fsmap.get_info_gid(gid);
- dout(10) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
+ const MDSMap::mds_info_t &info = fsmap.get_info_gid(gid);
+ dout(1) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
epoch_t blacklist_epoch = 0;
if (info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY) {
utime_t until = ceph_clock_now();
- until += g_conf->mds_blacklist_interval;
+ until += g_conf->get_val<double>("mon_mds_blacklist_interval");
blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
}
- pending_fsmap.erase(gid, blacklist_epoch);
+ fsmap.erase(gid, blacklist_epoch);
last_beacon.erase(gid);
if (pending_daemon_health.count(gid)) {
pending_daemon_health.erase(gid);
return blacklist_epoch != 0;
}
-mds_gid_t MDSMonitor::gid_from_arg(const std::string& arg, std::ostream &ss)
+mds_gid_t MDSMonitor::gid_from_arg(const FSMap &fsmap, const std::string &arg, std::ostream &ss)
{
- const FSMap *relevant_fsmap = mon->is_leader() ? &pending_fsmap : &fsmap;
-
// Try parsing as a role
mds_role_t role;
std::ostringstream ignore_err; // Don't spam 'ss' with parse_role errors
- int r = parse_role(arg, &role, ignore_err);
+ int r = fsmap.parse_role(arg, &role, ignore_err);
if (r == 0) {
// See if a GID is assigned to this role
- auto fs = relevant_fsmap->get_filesystem(role.fscid);
+ const auto &fs = fsmap.get_filesystem(role.fscid);
assert(fs != nullptr); // parse_role ensures it exists
if (fs->mds_map.is_up(role.rank)) {
dout(10) << __func__ << ": validated rank/GID " << role
unsigned long long maybe_gid = strict_strtoll(arg.c_str(), 10, &err);
if (!err.empty()) {
// Not a role or a GID, try as a daemon name
- const MDSMap::mds_info_t *mds_info = relevant_fsmap->find_by_name(arg);
+ const MDSMap::mds_info_t *mds_info = fsmap.find_by_name(arg);
if (!mds_info) {
ss << "MDS named '" << arg
<< "' does not exist, or is not up";
dout(10) << __func__ << ": treating MDS reference '" << arg
<< "' as an integer " << maybe_gid << dendl;
- if (relevant_fsmap->gid_exists(mds_gid_t(maybe_gid))) {
+ if (fsmap.gid_exists(mds_gid_t(maybe_gid))) {
return mds_gid_t(maybe_gid);
}
}
return MDS_GID_NONE;
}
-int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg)
+int MDSMonitor::fail_mds(FSMap &fsmap, std::ostream &ss,
+ const std::string &arg, MDSMap::mds_info_t *failed_info)
{
- mds_gid_t gid = gid_from_arg(arg, ss);
+ assert(failed_info != nullptr);
+
+ mds_gid_t gid = gid_from_arg(fsmap, arg, ss);
if (gid == MDS_GID_NONE) {
return 0;
}
if (!mon->osdmon()->is_writeable()) {
return -EAGAIN;
}
- fail_mds_gid(gid);
+
+ // Take a copy of the info before removing the MDS from the map,
+ // so that the caller knows which mds (if any) they ended up removing.
+ *failed_info = fsmap.get_info_gid(gid);
+
+ fail_mds_gid(fsmap, gid);
ss << "failed mds gid " << gid;
assert(mon->osdmon()->is_writeable());
request_proposal(mon->osdmon());
return true;
}
- for (auto h : handlers) {
+ auto &pending = get_pending_fsmap_writeable();
+
+ bool batched_propose = false;
+ for (const auto &h : handlers) {
if (h->can_handle(prefix)) {
- r = h->handle(mon, pending_fsmap, op, cmdmap, ss);
+ batched_propose = h->batched_propose();
+ if (batched_propose) {
+ paxos->plug();
+ }
+ r = h->handle(mon, pending, op, cmdmap, ss);
+ if (batched_propose) {
+ paxos->unplug();
+ }
+
if (r == -EAGAIN) {
// message has been enqueued for retry; return.
dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl;
} else {
if (r == 0) {
// On successful updates, print the updated map
- print_map(pending_fsmap);
+ print_map(pending);
}
// Successful or not, we're done: respond.
goto out;
}
}
- r = filesystem_command(op, prefix, cmdmap, ss);
+ r = filesystem_command(pending, op, prefix, cmdmap, ss);
if (r >= 0) {
goto out;
} else if (r == -EAGAIN) {
}
// Only handle legacy commands if there is a filesystem configured
- if (pending_fsmap.legacy_client_fscid == FS_CLUSTER_ID_NONE) {
- if (pending_fsmap.filesystems.size() == 0) {
+ if (pending.legacy_client_fscid == FS_CLUSTER_ID_NONE) {
+ if (pending.filesystems.size() == 0) {
ss << "No filesystem configured: use `ceph fs new` to create a filesystem";
} else {
ss << "No filesystem set for use with legacy commands";
goto out;
}
- r = legacy_filesystem_command(op, prefix, cmdmap, ss);
+ r = legacy_filesystem_command(pending, op, prefix, cmdmap, ss);
if (r == -ENOSYS && ss.str().empty()) {
ss << "unrecognized command";
// success.. delay reply
wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
get_last_committed() + 1));
+ if (batched_propose) {
+ force_immediate_propose();
+ }
return true;
} else {
// reply immediately
}
}
-
-/**
- * Given one of the following forms:
- * <fs name>:<rank>
- * <fs id>:<rank>
- * <rank>
- *
- * Parse into a mds_role_t. The rank-only form is only valid
- * if legacy_client_ns is set.
- */
-int MDSMonitor::parse_role(
- const std::string &role_str,
- mds_role_t *role,
- std::ostream &ss)
-{
- const FSMap *relevant_fsmap = &fsmap;
- if (mon->is_leader()) {
- relevant_fsmap = &pending_fsmap;
- }
- return relevant_fsmap->parse_role(role_str, role, ss);
-}
-
int MDSMonitor::filesystem_command(
+ FSMap &fsmap,
MonOpRequestRef op,
std::string const &prefix,
map<string, cmd_vartype> &cmdmap,
if (prefix == "mds stop" ||
prefix == "mds deactivate") {
-
mds_role_t role;
- r = parse_role(whostr, &role, ss);
+ r = fsmap.parse_role(whostr, &role, ss);
if (r < 0 ) {
return r;
}
- auto fs = pending_fsmap.get_filesystem(role.fscid);
+ const auto &fs = fsmap.get_filesystem(role.fscid);
if (!fs->mds_map.is_active(role.rank)) {
r = -EEXIST;
r = 0;
mds_gid_t gid = fs->mds_map.up.at(role.rank);
ss << "telling mds." << role << " "
- << pending_fsmap.get_info_gid(gid).addr << " to deactivate";
+ << fsmap.get_info_gid(gid).addr << " to deactivate";
- pending_fsmap.modify_daemon(gid, [](MDSMap::mds_info_t *info) {
+ fsmap.modify_daemon(gid, [](MDSMap::mds_info_t *info) {
info->state = MDSMap::STATE_STOPPING;
});
}
<< cmd_vartype_stringify(cmdmap["state"]) << "'";
return -EINVAL;
}
- if (pending_fsmap.gid_exists(gid)) {
- pending_fsmap.modify_daemon(gid, [state](MDSMap::mds_info_t *info) {
+ if (fsmap.gid_exists(gid)) {
+ fsmap.modify_daemon(gid, [state](MDSMap::mds_info_t *info) {
info->state = state;
});
ss << "set mds gid " << gid << " to state " << state << " "
} else if (prefix == "mds fail") {
string who;
cmd_getval(g_ceph_context, cmdmap, "who", who);
- r = fail_mds(ss, who);
+
+ MDSMap::mds_info_t failed_info;
+ r = fail_mds(fsmap, ss, who, &failed_info);
if (r < 0 && r == -EAGAIN) {
mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
return -EAGAIN; // don't propose yet; wait for message to be retried
+ } else if (r == 0) {
+ // Only log if we really did something (not when was already gone)
+ if (failed_info.global_id != MDS_GID_NONE) {
+ mon->clog->info() << failed_info.human_name() << " marked failed by "
+ << op->get_session()->entity_name;
+ }
}
} else if (prefix == "mds rm") {
mds_gid_t gid;
<< cmd_vartype_stringify(cmdmap["gid"]) << "'";
return -EINVAL;
}
- if (!pending_fsmap.gid_exists(gid)) {
+ if (!fsmap.gid_exists(gid)) {
ss << "mds gid " << gid << " dne";
r = 0;
} else {
- MDSMap::DaemonState state = pending_fsmap.get_info_gid(gid).state;
+ const auto &info = fsmap.get_info_gid(gid);
+ MDSMap::DaemonState state = info.state;
if (state > 0) {
- ss << "cannot remove active mds." << pending_fsmap.get_info_gid(gid).name
- << " rank " << pending_fsmap.get_info_gid(gid).rank;
+ ss << "cannot remove active mds." << info.name
+ << " rank " << info.rank;
return -EBUSY;
} else {
- pending_fsmap.erase(gid, {});
+ fsmap.erase(gid, {});
ss << "removed mds gid " << gid;
return 0;
}
std::string role_str;
cmd_getval(g_ceph_context, cmdmap, "who", role_str);
mds_role_t role;
- int r = parse_role(role_str, &role, ss);
+ int r = fsmap.parse_role(role_str, &role, ss);
if (r < 0) {
ss << "invalid role '" << role_str << "'";
return -EINVAL;
}
- pending_fsmap.modify_filesystem(
+ fsmap.modify_filesystem(
role.fscid,
[role](std::shared_ptr<Filesystem> fs)
{
<< cmd_vartype_stringify(cmdmap["feature"]) << "'";
return -EINVAL;
}
- if (pending_fsmap.compat.compat.contains(f)) {
+ if (fsmap.compat.compat.contains(f)) {
ss << "removing compat feature " << f;
- CompatSet modified = pending_fsmap.compat;
+ CompatSet modified = fsmap.compat;
modified.compat.remove(f);
- pending_fsmap.update_compat(modified);
+ fsmap.update_compat(modified);
} else {
- ss << "compat feature " << f << " not present in " << pending_fsmap.compat;
+ ss << "compat feature " << f << " not present in " << fsmap.compat;
}
r = 0;
} else if (prefix == "mds compat rm_incompat") {
<< cmd_vartype_stringify(cmdmap["feature"]) << "'";
return -EINVAL;
}
- if (pending_fsmap.compat.incompat.contains(f)) {
+ if (fsmap.compat.incompat.contains(f)) {
ss << "removing incompat feature " << f;
- CompatSet modified = pending_fsmap.compat;
+ CompatSet modified = fsmap.compat;
modified.incompat.remove(f);
- pending_fsmap.update_compat(modified);
+ fsmap.update_compat(modified);
} else {
- ss << "incompat feature " << f << " not present in " << pending_fsmap.compat;
+ ss << "incompat feature " << f << " not present in " << fsmap.compat;
}
r = 0;
} else if (prefix == "mds repaired") {
std::string role_str;
cmd_getval(g_ceph_context, cmdmap, "rank", role_str);
mds_role_t role;
- r = parse_role(role_str, &role, ss);
+ r = fsmap.parse_role(role_str, &role, ss);
if (r < 0) {
return r;
}
- bool modified = pending_fsmap.undamaged(role.fscid, role.rank);
+ bool modified = fsmap.undamaged(role.fscid, role.rank);
if (modified) {
- dout(4) << "repaired: restoring rank " << role << dendl;
+ dout(1) << "repaired: restoring rank " << role << dendl;
} else {
- dout(4) << "repaired: no-op on rank " << role << dendl;
+ dout(1) << "repaired: no-op on rank " << role << dendl;
}
r = 0;
/**
* Helper to legacy_filesystem_command
*/
-void MDSMonitor::modify_legacy_filesystem(
+void MDSMonitor::modify_legacy_filesystem(FSMap &fsmap, // the map to modify is supplied by the caller instead of using the pending_fsmap member
std::function<void(std::shared_ptr<Filesystem> )> fn)
{
- pending_fsmap.modify_filesystem(
- pending_fsmap.legacy_client_fscid,
+ fsmap.modify_filesystem(
+ fsmap.legacy_client_fscid, // fn is handed to FSMap::modify_filesystem for the filesystem that fsmap designates for legacy commands
fn
);
}
* @retval < 0 An error has occurred; **ss** may have been set.
*/
int MDSMonitor::legacy_filesystem_command(
+ FSMap &fsmap,
MonOpRequestRef op,
std::string const &prefix,
map<string, cmd_vartype> &cmdmap,
string whostr;
cmd_getval(g_ceph_context, cmdmap, "who", whostr);
- assert (pending_fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE);
+ assert (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE);
if (prefix == "mds set_max_mds") {
// NOTE: deprecated by "fs set max_mds"
}
const MDSMap& mdsmap =
- pending_fsmap.filesystems.at(pending_fsmap.legacy_client_fscid)->mds_map;
+ fsmap.filesystems.at(fsmap.legacy_client_fscid)->mds_map;
if (!mdsmap.allows_multimds() &&
maxmds > mdsmap.get_max_mds() &&
return -EINVAL;
}
- modify_legacy_filesystem(
+ modify_legacy_filesystem(fsmap,
[maxmds](std::shared_ptr<Filesystem> fs)
{
fs->mds_map.set_max_mds(maxmds);
ss << "max_mds = " << maxmds;
} else if (prefix == "mds cluster_down") {
// NOTE: deprecated by "fs set cluster_down"
- modify_legacy_filesystem(
+ modify_legacy_filesystem(fsmap,
[](std::shared_ptr<Filesystem> fs)
{
fs->mds_map.set_flag(CEPH_MDSMAP_DOWN);
r = 0;
} else if (prefix == "mds cluster_up") {
// NOTE: deprecated by "fs set cluster_up"
- modify_legacy_filesystem(
+ modify_legacy_filesystem(fsmap,
[](std::shared_ptr<Filesystem> fs)
{
fs->mds_map.clear_flag(CEPH_MDSMAP_DOWN);
types.push_back("fsmap");
types.push_back("fsmap.user");
types.push_back("mdsmap");
- for (const auto &i : fsmap.filesystems) {
- auto fscid = i.first;
+ for (const auto &p : get_fsmap().filesystems) {
+ const auto &fscid = p.first;
std::ostringstream oss;
oss << "mdsmap." << fscid;
types.push_back(oss.str());
{
dout(20) << __func__ << ": " << sub->type << dendl;
+ const auto &fsmap = get_fsmap();
+
if (sub->type == "fsmap") {
if (sub->next <= fsmap.get_epoch()) {
sub->session->con->send_message(new MFSMap(mon->monmap->fsid, fsmap));
FSMapUser fsmap_u;
fsmap_u.epoch = fsmap.get_epoch();
fsmap_u.legacy_client_fscid = fsmap.legacy_client_fscid;
- for (auto p = fsmap.filesystems.begin();
- p != fsmap.filesystems.end();
- ++p) {
- FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p->first];
- fs_info.cid = p->first;
- fs_info.name= p->second->mds_map.fs_name;
+ for (const auto &p : fsmap.filesystems) {
+ FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p.second->fscid];
+ fs_info.cid = p.second->fscid;
+ fs_info.name = p.second->mds_map.fs_name;
}
sub->session->con->send_message(new MFSMapUser(mon->monmap->fsid, fsmap_u));
if (sub->onetime) {
if (is_mds) {
// What (if any) namespace are you assigned to?
auto mds_info = fsmap.get_mds_info();
- for (const auto &i : mds_info) {
- if (i.second.addr == sub->session->inst.addr) {
- mds_gid = i.first;
+ for (const auto &p : mds_info) {
+ if (p.second.addr == sub->session->inst.addr) {
+ mds_gid = p.first;
fscid = fsmap.mds_roles.at(mds_gid);
}
}
dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid= " << fscid << dendl;
// Work out the effective latest epoch
- MDSMap *mds_map = nullptr;
+ const MDSMap *mds_map = nullptr;
MDSMap null_map;
null_map.compat = fsmap.compat;
if (fscid == FS_CLUSTER_ID_NONE) {
// For a client, we should have already dropped out
assert(is_mds);
- if (fsmap.standby_daemons.count(mds_gid)) {
+ auto it = fsmap.standby_daemons.find(mds_gid);
+ if (it != fsmap.standby_daemons.end()) {
// For an MDS, we need to feed it an MDSMap with its own state in
- null_map.mds_info[mds_gid] = fsmap.standby_daemons[mds_gid];
- null_map.epoch = fsmap.standby_epochs[mds_gid];
+ null_map.mds_info[mds_gid] = it->second;
+ null_map.epoch = fsmap.standby_epochs.at(mds_gid);
} else {
null_map.epoch = fsmap.epoch;
}
mds_map = &null_map;
} else {
// Check the effective epoch
- mds_map = &(fsmap.filesystems.at(fscid)->mds_map);
+ mds_map = &fsmap.get_filesystem(fscid)->mds_map;
}
assert(mds_map != nullptr);
paxos->trigger_propose();
}
-void MDSMonitor::remove_from_metadata(MonitorDBStore::TransactionRef t)
+void MDSMonitor::remove_from_metadata(const FSMap &fsmap, MonitorDBStore::TransactionRef t)
{
bool update = false;
- for (map<mds_gid_t, Metadata>::iterator i = pending_metadata.begin();
- i != pending_metadata.end(); ) {
- if (!pending_fsmap.gid_exists(i->first)) {
- pending_metadata.erase(i++);
+ for (auto it = pending_metadata.begin(); it != pending_metadata.end(); ) {
+ if (!fsmap.gid_exists(it->first)) {
+ it = pending_metadata.erase(it);
update = true;
} else {
- ++i;
+ ++it;
}
}
if (!update)
return 0;
}
-void MDSMonitor::count_metadata(const string& field, Formatter *f)
+void MDSMonitor::count_metadata(const std::string &field, map<string,int> *out) // tallies the loaded MDS metadata entries by their value for `field`; results are accumulated into *out
{
- map<string,int> by_val;
map<mds_gid_t,Metadata> meta;
load_metadata(meta);
for (auto& p : meta) {
auto q = p.second.find(field);
if (q == p.second.end()) {
- by_val["unknown"]++;
+ (*out)["unknown"]++; // this daemon's metadata has no such field
} else {
- by_val[q->second]++;
+ (*out)[q->second]++; // increments on top of whatever *out already holds; *out is never cleared here
}
}
+}
+
+void MDSMonitor::count_metadata(const std::string &field, Formatter *f)
+{
+ map<string,int> by_val;
+ count_metadata(field, &by_val);
f->open_object_section(field.c_str());
for (auto& p : by_val) {
f->dump_int(p.first.c_str(), p.second);
f->close_section();
}
-int MDSMonitor::dump_metadata(const std::string &who, Formatter *f, ostream& err)
+int MDSMonitor::dump_metadata(const FSMap& fsmap, const std::string &who,
+ Formatter *f, ostream& err)
{
assert(f);
- mds_gid_t gid = gid_from_arg(who, err);
+ mds_gid_t gid = gid_from_arg(fsmap, who, err);
if (gid == MDS_GID_NONE) {
return -EINVAL;
}
{
assert(f);
+ const auto &fsmap = get_fsmap();
+
map<mds_gid_t, Metadata> metadata;
if (int r = load_metadata(metadata)) {
return r;
}
map<string, list<int> > mdses; // hostname => rank
- for (map<mds_gid_t, Metadata>::iterator it = metadata.begin();
- it != metadata.end(); ++it) {
- const Metadata& m = it->second;
+ for (const auto &p : metadata) {
+ const mds_gid_t& gid = p.first;
+ const Metadata& m = p.second;
Metadata::const_iterator hostname = m.find("hostname");
if (hostname == m.end()) {
// not likely though
continue;
}
- const mds_gid_t gid = it->first;
if (!fsmap.gid_exists(gid)) {
dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
continue;
* If a cluster is undersized (with respect to max_mds), then
* attempt to find daemons to grow it.
*/
-bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> fs)
+bool MDSMonitor::maybe_expand_cluster(FSMap &fsmap, fs_cluster_id_t fscid) // returns true only when a standby was promoted into fsmap by this call
{
- bool do_propose = false;
+ auto fs = fsmap.get_filesystem(fscid);
+ auto &mds_map = fs->mds_map; // alias for the MDSMap of the filesystem being considered
if (fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
- return do_propose;
+ return false; // filesystem is flagged down: never grow it
}
- while (fs->mds_map.get_num_in_mds() < size_t(fs->mds_map.get_max_mds()) &&
- !fs->mds_map.is_degraded()) {
+ int in = mds_map.get_num_in_mds();
+ int max = mds_map.get_max_mds();
+
+ dout(20) << __func__ << " in " << in << " max " << max << dendl;
+
+ if (in < max) { // single pass: at most one rank is added per call (the old version looped)
mds_rank_t mds = mds_rank_t(0);
string name;
- while (fs->mds_map.is_in(mds)) {
+ while (mds_map.is_in(mds)) { // advance to the lowest rank that is not currently "in"
mds++;
}
- mds_gid_t newgid = pending_fsmap.find_replacement_for({fs->fscid, mds},
+ mds_gid_t newgid = fsmap.find_replacement_for({fscid, mds}, // `name` is passed through empty
name, g_conf->mon_force_standby_active);
if (newgid == MDS_GID_NONE) {
- break;
+ return false; // no daemon available to take the new rank
}
- dout(1) << "adding standby " << pending_fsmap.get_info_gid(newgid).addr
+ const auto &new_info = fsmap.get_info_gid(newgid);
+ dout(1) << "assigned standby " << new_info.addr
<< " as mds." << mds << dendl;
- pending_fsmap.promote(newgid, fs, mds);
- do_propose = true;
+
+ mon->clog->info() << new_info.human_name() << " assigned to "
+ "filesystem " << mds_map.fs_name << " as rank "
+ << mds << " (now has " << mds_map.get_num_in_mds() + 1
+ << " ranks)"; // the +1 is logged before promote() runs; presumably promote() adds exactly one in-rank -- confirm
+ fsmap.promote(newgid, fs, mds);
+ return true;
}
- return do_propose;
+ return false; // already at (or above) max_mds
}
* is available, fail this daemon (remove from map) and pass its
* role to another daemon.
*/
-void MDSMonitor::maybe_replace_gid(mds_gid_t gid,
- const beacon_info_t &beacon,
- bool *mds_propose, bool *osd_propose)
+void MDSMonitor::maybe_replace_gid(FSMap &fsmap, mds_gid_t gid,
+ const MDSMap::mds_info_t& info, bool *mds_propose, bool *osd_propose)
{
assert(mds_propose != nullptr);
assert(osd_propose != nullptr);
+ // Depending on the daemon's state this either replaces it with a standby,
+ // drops it from the map, or marks it laggy. *mds_propose is set when fsmap
+ // was changed; *osd_propose accumulates the result of fail_mds_gid().
+ //
+ // NOTE: `info` may refer to storage owned by `fsmap` (tick() passes the
+ // result of get_info_gid() straight through), so it must not be read once
+ // fail_mds_gid() has removed `gid` from the map.
- const MDSMap::mds_info_t info = pending_fsmap.get_info_gid(gid);
- const auto fscid = pending_fsmap.mds_roles.at(gid);
-
- dout(10) << "no beacon from " << gid << " " << info.addr << " mds."
- << info.rank << "." << info.inc
- << " " << ceph_mds_state_name(info.state)
- << " since " << beacon.stamp << dendl;
+ const auto fscid = fsmap.mds_roles.at(gid);
// We will only take decisive action (replacing/removing a daemon)
// if we have some indicating that some other daemon(s) are successfully
// getting beacons through recently.
- utime_t latest_beacon;
- for (const auto & i : last_beacon) {
- latest_beacon = MAX(i.second.stamp, latest_beacon);
+ mono_time latest_beacon = mono_clock::zero();
+ for (const auto &p : last_beacon) {
+ latest_beacon = std::max(p.second.stamp, latest_beacon);
}
- const bool may_replace = latest_beacon >
- (ceph_clock_now() -
- MAX(g_conf->mds_beacon_interval, g_conf->mds_beacon_grace * 0.5));
+ mono_time now = mono_clock::now();
+ chrono::duration<double> since = now-latest_beacon;
+ const bool may_replace = since.count() <
+ std::max(g_conf->mds_beacon_interval, g_conf->mds_beacon_grace * 0.5);
// are we in?
// and is there a non-laggy standby that can take over for us?
info.state != MDSMap::STATE_STANDBY &&
info.state != MDSMap::STATE_STANDBY_REPLAY &&
may_replace &&
- !pending_fsmap.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_DOWN) &&
- (sgid = pending_fsmap.find_replacement_for({fscid, info.rank}, info.name,
+ !fsmap.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_DOWN) &&
+ (sgid = fsmap.find_replacement_for({fscid, info.rank}, info.name,
g_conf->mon_force_standby_active)) != MDS_GID_NONE)
{
- MDSMap::mds_info_t si = pending_fsmap.get_info_gid(sgid);
- dout(10) << " replacing " << gid << " " << info.addr << " mds."
+ MDSMap::mds_info_t si = fsmap.get_info_gid(sgid);
+ dout(1) << " replacing " << gid << " " << info.addr << " mds."
<< info.rank << "." << info.inc
<< " " << ceph_mds_state_name(info.state)
<< " with " << sgid << "/" << si.name << " " << si.addr << dendl;
- mon->clog->warn() << "MDS daemon '" << info.name << "'"
+ mon->clog->warn() << info.human_name()
<< " is not responding, replacing it "
<< "as rank " << info.rank
- << " with standby '" << si.name << "'";
+ << " with standby " << si.human_name();
// Remember what NS the old one was in
- const fs_cluster_id_t fscid = pending_fsmap.mds_roles.at(gid);
+ const fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
+ // ...and which rank it held: fail_mds_gid() removes the daemon from the
+ // map, after which `info` can no longer be dereferenced safely.
+ const auto rank = info.rank;
// Remove the old one
- *osd_propose |= fail_mds_gid(gid);
+ *osd_propose |= fail_mds_gid(fsmap, gid);
// Promote the replacement
- auto fs = pending_fsmap.filesystems.at(fscid);
- pending_fsmap.promote(sgid, fs, info.rank);
+ auto fs = fsmap.filesystems.at(fscid);
+ fsmap.promote(sgid, fs, rank);
*mds_propose = true;
} else if ((info.state == MDSMap::STATE_STANDBY_REPLAY ||
info.state == MDSMap::STATE_STANDBY) && may_replace) {
- dout(10) << " failing and removing " << gid << " " << info.addr << " mds." << info.rank
+ dout(1) << " failing and removing " << gid << " " << info.addr << " mds." << info.rank
<< "." << info.inc << " " << ceph_mds_state_name(info.state)
<< dendl;
- mon->clog->info() << "MDS standby '" << info.name
- << "' is not responding, removing it from the set of "
- << "standbys";
- fail_mds_gid(gid);
+ mon->clog->info() << "Standby " << info.human_name() << " is not "
+ "responding, dropping it";
+ fail_mds_gid(fsmap, gid);
*mds_propose = true;
} else if (!info.laggy()) {
- dout(10) << " marking " << gid << " " << info.addr << " mds." << info.rank << "." << info.inc
+ dout(1) << " marking " << gid << " " << info.addr << " mds." << info.rank << "." << info.inc
<< " " << ceph_mds_state_name(info.state)
<< " laggy" << dendl;
- pending_fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info) {
+ fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info) {
info->laggy_since = ceph_clock_now();
});
*mds_propose = true;
}
}
-bool MDSMonitor::maybe_promote_standby(std::shared_ptr<Filesystem> fs)
+bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, std::shared_ptr<Filesystem> &fs)
{
assert(!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN));
set<mds_rank_t>::iterator p = failed.begin();
while (p != failed.end()) {
mds_rank_t f = *p++;
- mds_gid_t sgid = pending_fsmap.find_replacement_for({fs->fscid, f}, {},
+ mds_gid_t sgid = fsmap.find_replacement_for({fs->fscid, f}, {},
g_conf->mon_force_standby_active);
if (sgid) {
- const MDSMap::mds_info_t si = pending_fsmap.get_info_gid(sgid);
- dout(0) << " taking over failed mds." << f << " with " << sgid
+ const MDSMap::mds_info_t si = fsmap.get_info_gid(sgid);
+ dout(1) << " taking over failed mds." << f << " with " << sgid
<< "/" << si.name << " " << si.addr << dendl;
- pending_fsmap.promote(sgid, fs, f);
+ mon->clog->info() << "Standby " << si.human_name()
+ << " assigned to filesystem " << fs->mds_map.fs_name
+ << " as rank " << f;
+
+ fsmap.promote(sgid, fs, f);
do_propose = true;
}
}
// them while perhaps-modifying standby_daemons during the loop
// (if we promote anyone they are removed from standby_daemons)
std::vector<mds_gid_t> standby_gids;
- for (const auto &j : pending_fsmap.standby_daemons) {
+ for (const auto &j : fsmap.standby_daemons) {
standby_gids.push_back(j.first);
}
for (const auto &gid : standby_gids) {
- const auto &info = pending_fsmap.standby_daemons.at(gid);
+ const auto &info = fsmap.standby_daemons.at(gid);
assert(info.state == MDSMap::STATE_STANDBY);
if (!info.standby_replay) {
// the standby_for_rank refers to: lookup via legacy_client_fscid
mds_role_t target_role = {
info.standby_for_fscid == FS_CLUSTER_ID_NONE ?
- pending_fsmap.legacy_client_fscid : info.standby_for_fscid,
+ fsmap.legacy_client_fscid : info.standby_for_fscid,
info.standby_for_rank};
// It is possible that the map contains a standby_for_fscid
// that doesn't correspond to an existing filesystem, especially
// if we loaded from a version with a bug (#17466)
if (info.standby_for_fscid != FS_CLUSTER_ID_NONE
- && !pending_fsmap.filesystem_exists(info.standby_for_fscid)) {
+ && !fsmap.filesystem_exists(info.standby_for_fscid)) {
derr << "gid " << gid << " has invalid standby_for_fscid "
<< info.standby_for_fscid << dendl;
continue;
// If we managed to resolve a full target role
if (target_role.fscid != FS_CLUSTER_ID_NONE) {
- auto fs = pending_fsmap.get_filesystem(target_role.fscid);
+ const auto &fs = fsmap.get_filesystem(target_role.fscid);
if (fs->mds_map.is_followable(target_role.rank)) {
- do_propose |= try_standby_replay(
- info,
- *fs,
+ do_propose |= try_standby_replay(fsmap, info, *fs,
fs->mds_map.get_info(target_role.rank));
}
}
}
// check everyone
- for (auto fs_i : pending_fsmap.filesystems) {
- const MDSMap &mds_map = fs_i.second->mds_map;
- for (auto mds_i : mds_map.mds_info) {
- MDSMap::mds_info_t &cand_info = mds_i.second;
+ for (const auto &p : fsmap.filesystems) {
+ if (info.standby_for_fscid != FS_CLUSTER_ID_NONE &&
+ info.standby_for_fscid != p.first)
+ continue;
+
+ bool assigned = false;
+ const auto &fs = p.second;
+ const MDSMap &mds_map = fs->mds_map;
+ for (const auto &mds_i : mds_map.mds_info) {
+ const MDSMap::mds_info_t &cand_info = mds_i.second;
if (cand_info.rank >= 0 && mds_map.is_followable(cand_info.rank)) {
if ((info.standby_for_name.length() && info.standby_for_name != cand_info.name) ||
info.standby_for_rank != MDS_RANK_NONE) {
continue; // we're supposed to follow someone else
}
- if (try_standby_replay(info, *(fs_i.second), cand_info)) {
- do_propose = true;
+ if (try_standby_replay(fsmap, info, *fs, cand_info)) {
+ assigned = true;
break;
}
- continue;
}
}
+ if (assigned) {
+ do_propose = true;
+ break;
+ }
}
}
}
{
// make sure mds's are still alive
// ...if i am an active leader
- if (!is_active()) return;
- dout(10) << fsmap << dendl;
+ if (!is_active() || !is_leader()) return;
- bool do_propose = false;
+ auto &pending = get_pending_fsmap_writeable();
- if (!mon->is_leader()) return;
+ bool do_propose = false;
- do_propose |= pending_fsmap.check_health();
+ do_propose |= pending.check_health();
// expand mds cluster (add new nodes to @in)?
- for (auto i : pending_fsmap.filesystems) {
- do_propose |= maybe_expand_cluster(i.second);
+ for (auto &p : pending.filesystems) {
+ do_propose |= maybe_expand_cluster(pending, p.second->fscid);
}
- const auto now = ceph_clock_now();
- if (last_tick.is_zero()) {
+ mono_time now = mono_clock::now();
+ if (last_tick == decltype(last_tick)::min()) {
last_tick = now;
}
+ chrono::duration<double> since_last = now-last_tick;
- if (now - last_tick > (g_conf->mds_beacon_grace - g_conf->mds_beacon_interval)) {
+ if (since_last.count() >
+ (g_conf->mds_beacon_grace - g_conf->mds_beacon_interval)) {
// This case handles either local slowness (calls being delayed
// for whatever reason) or cluster election slowness (a long gap
// between calls while an election happened)
- dout(4) << __func__ << ": resetting beacon timeouts due to mon delay "
+ dout(1) << __func__ << ": resetting beacon timeouts due to mon delay "
"(slow election?) of " << now - last_tick << " seconds" << dendl;
- for (auto &i : last_beacon) {
- i.second.stamp = now;
+ for (auto &p : last_beacon) {
+ p.second.stamp = now;
}
}
last_tick = now;
- // check beacon timestamps
- utime_t cutoff = now;
- cutoff -= g_conf->mds_beacon_grace;
-
// make sure last_beacon is fully populated
- for (const auto &p : pending_fsmap.mds_roles) {
+ for (auto &p : pending.mds_roles) {
auto &gid = p.first;
- if (last_beacon.count(gid) == 0) {
- last_beacon[gid].stamp = now;
- last_beacon[gid].seq = 0;
- }
+ last_beacon.emplace(std::piecewise_construct,
+ std::forward_as_tuple(gid),
+ std::forward_as_tuple(mono_clock::now(), 0));
}
- // If the OSDMap is writeable, we can blacklist things, so we can
- // try failing any laggy MDS daemons. Consider each one for failure.
- if (mon->osdmon()->is_writeable()) {
- bool propose_osdmap = false;
- map<mds_gid_t, beacon_info_t>::iterator p = last_beacon.begin();
- while (p != last_beacon.end()) {
- mds_gid_t gid = p->first;
- auto beacon_info = p->second;
- ++p;
+ // check beacon timestamps
+ bool propose_osdmap = false;
+ bool osdmap_writeable = mon->osdmon()->is_writeable();
+ for (auto it = last_beacon.begin(); it != last_beacon.end(); ) {
+ mds_gid_t gid = it->first;
+ auto beacon_info = it->second;
+ chrono::duration<double> since_last = now-beacon_info.stamp;
+
+ if (!pending.gid_exists(gid)) {
+ // clean it out
+ it = last_beacon.erase(it);
+ continue;
+ }
- if (!pending_fsmap.gid_exists(gid)) {
- // clean it out
- last_beacon.erase(gid);
- continue;
- }
- if (beacon_info.stamp < cutoff) {
- maybe_replace_gid(gid, beacon_info, &do_propose, &propose_osdmap);
+ if (since_last.count() >= g_conf->mds_beacon_grace) {
+ auto &info = pending.get_info_gid(gid);
+ dout(1) << "no beacon from mds." << info.rank << "." << info.inc
+ << " (gid: " << gid << " addr: " << info.addr
+ << " state: " << ceph_mds_state_name(info.state) << ")"
+ << " since " << since_last.count() << "s" << dendl;
+ // If the OSDMap is writeable, we can blacklist things, so we can
+ // try failing any laggy MDS daemons. Consider each one for failure.
+ if (osdmap_writeable) {
+ maybe_replace_gid(pending, gid, info, &do_propose, &propose_osdmap);
}
}
- if (propose_osdmap) {
- request_proposal(mon->osdmon());
- }
+ ++it;
+ }
+ if (propose_osdmap) {
+ request_proposal(mon->osdmon());
}
- for (auto i : pending_fsmap.filesystems) {
- auto fs = i.second;
+ for (auto &p : pending.filesystems) {
+ auto &fs = p.second;
if (!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
- do_propose |= maybe_promote_standby(fs);
+ do_propose |= maybe_promote_standby(pending, fs);
}
}
* ainfo: the would-be leader
*/
bool MDSMonitor::try_standby_replay(
+ FSMap &fsmap,
const MDSMap::mds_info_t& finfo,
const Filesystem &leader_fs,
const MDSMap::mds_info_t& ainfo)
} else {
// Assign the new role to the standby
dout(10) << " setting to follow mds rank " << ainfo.rank << dendl;
- pending_fsmap.assign_standby_replay(finfo.global_id, leader_fs.fscid, ainfo.rank);
+ fsmap.assign_standby_replay(finfo.global_id, leader_fs.fscid, ainfo.rank);
return true;
}
}
MDSMonitor::MDSMonitor(Monitor *mn, Paxos *p, string service_name)
: PaxosService(mn, p, service_name)
{
- handlers = FileSystemCommandHandler::load();
+ handlers = FileSystemCommandHandler::load(p); // the handler loader now receives this service's Paxos instance
}
void MDSMonitor::on_restart()
{
// Clear out the leader-specific state.
- last_tick = utime_t();
+ last_tick = mono_clock::now(); // restart the delay measurement from now: tick() compares mono_clock::now() against last_tick
last_beacon.clear();
}