*
*/
+#include <regex>
#include <sstream>
#include <boost/utility.hpp>
-#include <boost/regex.hpp>
#include "MDSMonitor.h"
#include "FSCommands.h"
#include "Monitor.h"
#include "MonitorDBStore.h"
#include "OSDMonitor.h"
-#include "PGMonitor.h"
#include "common/strtol.h"
#include "common/perf_counters.h"
#include "messages/MMonCommand.h"
#include "messages/MGenericMessage.h"
-#include "include/assert.h"
+#include "include/ceph_assert.h"
#include "include/str_list.h"
#include "include/stringify.h"
#include "mds/mdstypes.h"
#define dout_subsys ceph_subsys_mon
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, get_fsmap())
+using namespace TOPNSPC::common;
+
static ostream& _prefix(std::ostream *_dout, Monitor *mon, const FSMap& fsmap) {
return *_dout << "mon." << mon->name << "@" << mon->rank
<< "(" << mon->get_state_name()
* Specialized implementation of cmd_getval to allow us to parse
* out strongly-typedef'd types
*/
// Explicit specializations of cmd_getval for mds_gid_t, mds_rank_t and
// MDSMap::DaemonState. Each one forwards to cmd_getval with the output
// argument cast to int64_t& and returns whatever that call returns.
// In this hunk the CephContext* parameter is dropped from every
// specialization and the three definitions are wrapped in
// namespace TOPNSPC::common.
// NOTE(review): the (int64_t&) casts presume each target type has the same
// size and representation as int64_t -- confirm, especially for DaemonState.
-template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
+namespace TOPNSPC::common {
+template<> bool cmd_getval(const cmdmap_t& cmdmap,
const std::string& k, mds_gid_t &val)
{
- return cmd_getval(cct, cmdmap, k, (int64_t&)val);
+ return cmd_getval(cmdmap, k, (int64_t&)val);
}
-template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
+template<> bool cmd_getval(const cmdmap_t& cmdmap,
const std::string& k, mds_rank_t &val)
{
- return cmd_getval(cct, cmdmap, k, (int64_t&)val);
+ return cmd_getval(cmdmap, k, (int64_t&)val);
}
-template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
+template<> bool cmd_getval(const cmdmap_t& cmdmap,
const std::string& k, MDSMap::DaemonState &val)
{
- return cmd_getval(cct, cmdmap, k, (int64_t&)val);
+ return cmd_getval(cmdmap, k, (int64_t&)val);
+}
}
-
// my methods
// Writes the text "print_map" followed by the output of m.print() to the
// dout debug stream, then terminates the entry with dendl.
// This hunk changes how the debug level is supplied: the old signature took
// it as a runtime argument (dbl); the new one takes it as a template
// parameter (dblV), so callers write print_map<N>(map).
-void MDSMonitor::print_map(const FSMap &m, int dbl)
+template <int dblV>
+void MDSMonitor::print_map(const FSMap& m)
{
- dout(dbl) << "print_map\n";
+ dout(dblV) << "print_map\n";
m.print(*_dout);
*_dout << dendl;
}
dout(10) << "create_initial" << dendl;
}
-void MDSMonitor::get_store_prefixes(std::set<string>& s)
+void MDSMonitor::get_store_prefixes(std::set<string>& s) const
{
s.insert(service_name);
s.insert(MDS_METADATA_PREFIX);
dout(10) << __func__ << " version " << version
<< ", my e " << get_fsmap().epoch << dendl;
- assert(version > get_fsmap().epoch);
+ ceph_assert(version > get_fsmap().epoch);
load_health();
bufferlist fsmap_bl;
fsmap_bl.clear();
int err = get_version(version, fsmap_bl);
- assert(err == 0);
+ ceph_assert(err == 0);
- assert(fsmap_bl.length() > 0);
+ ceph_assert(fsmap_bl.length() > 0);
dout(10) << __func__ << " got " << version << dendl;
PaxosFSMap::decode(fsmap_bl);
// new map
dout(0) << "new map" << dendl;
- print_map(get_fsmap(), 0);
- if (!g_conf->mon_mds_skip_sanity) {
+ print_map<0>(get_fsmap());
+ if (!g_conf()->mon_mds_skip_sanity) {
get_fsmap().sanity();
}
check_subs();
- update_logger();
}
void MDSMonitor::init()
dout(10) << "encode_pending e" << epoch << dendl;
// print map iff 'debug mon = 30' or higher
- print_map(get_pending_fsmap(), 30);
- if (!g_conf->mon_mds_skip_sanity) {
+ print_map<30>(pending);
+ if (!g_conf()->mon_mds_skip_sanity) {
pending.sanity();
}
}
// apply to paxos
- assert(get_last_committed() + 1 == pending.epoch);
+ ceph_assert(get_last_committed() + 1 == pending.epoch);
bufferlist pending_bl;
pending.encode(pending_bl, mon->get_quorum_con_features());
derr << "Missing health data for MDS " << gid << dendl;
continue;
}
- bufferlist::iterator bl_i = bl.begin();
+ auto bl_i = bl.cbegin();
health.decode(bl_i);
}
for (const auto &metric : health.metrics) {
- const int rank = info.rank;
+ const auto rank = info.rank;
health_check_t *check = &new_checks.get_or_add(
mds_metric_name(metric.type),
metric.sev,
- mds_metric_summary(metric.type));
+ mds_metric_summary(metric.type),
+ 1);
ostringstream ss;
- ss << "mds" << info.name << "(mds." << rank << "): " << metric.message;
+ ss << "mds." << info.name << "(mds." << rank << "): " << metric.message;
bool first = true;
for (auto &p : metric.metadata) {
if (first) {
}
pending.get_health_checks(&new_checks);
for (auto& p : new_checks.checks) {
- p.second.summary = boost::regex_replace(
+ p.second.summary = std::regex_replace(
p.second.summary,
- boost::regex("%num%"),
+ std::regex("%num%"),
stringify(p.second.detail.size()));
- p.second.summary = boost::regex_replace(
+ p.second.summary = std::regex_replace(
p.second.summary,
- boost::regex("%plurals%"),
+ std::regex("%plurals%"),
p.second.detail.size() > 1 ? "s" : "");
- p.second.summary = boost::regex_replace(
+ p.second.summary = std::regex_replace(
p.second.summary,
- boost::regex("%isorare%"),
+ std::regex("%isorare%"),
p.second.detail.size() > 1 ? "are" : "is");
- p.second.summary = boost::regex_replace(
+ p.second.summary = std::regex_replace(
p.second.summary,
- boost::regex("%hasorhave%"),
+ std::regex("%hasorhave%"),
p.second.detail.size() > 1 ? "have" : "has");
}
encode_health(new_checks, t);
}
-version_t MDSMonitor::get_trim_to()
+version_t MDSMonitor::get_trim_to() const
{
version_t floor = 0;
- if (g_conf->mon_mds_force_trim_to > 0 &&
- g_conf->mon_mds_force_trim_to < (int)get_last_committed()) {
- floor = g_conf->mon_mds_force_trim_to;
+ if (g_conf()->mon_mds_force_trim_to > 0 &&
+ g_conf()->mon_mds_force_trim_to < (int)get_last_committed()) {
+ floor = g_conf()->mon_mds_force_trim_to;
dout(10) << __func__ << " explicit mon_mds_force_trim_to = "
<< floor << dendl;
}
- unsigned max = g_conf->mon_max_mdsmap_epochs;
+ unsigned max = g_conf()->mon_max_mdsmap_epochs;
version_t last = get_last_committed();
if (last - get_first_committed() > max && floor < last - max)
return floor;
}
-void MDSMonitor::update_logger()
-{
- dout(10) << "update_logger" << dendl;
-
- const auto &fsmap = get_fsmap();
-
- uint64_t up = 0;
- uint64_t in = 0;
- uint64_t failed = 0;
- for (const auto &i : fsmap.filesystems) {
- const MDSMap &mds_map = i.second->mds_map;
-
- up += mds_map.get_num_up_mds();
- in += mds_map.get_num_in_mds();
- failed += mds_map.get_num_failed_mds();
- }
- mon->cluster_logger->set(l_cluster_num_mds_up, up);
- mon->cluster_logger->set(l_cluster_num_mds_in, in);
- mon->cluster_logger->set(l_cluster_num_mds_failed, failed);
- mon->cluster_logger->set(l_cluster_mds_epoch, fsmap.get_epoch());
-}
-
bool MDSMonitor::preprocess_query(MonOpRequestRef op)
{
op->mark_mdsmon_event(__func__);
- PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
- dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
+ auto m = op->get_req<PaxosServiceMessage>();
+ dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source()
+ << " " << m->get_orig_source_addrs() << dendl;
switch (m->get_type()) {
case MSG_MON_COMMAND:
try {
return preprocess_command(op);
- }
- catch (const bad_cmd_get& e) {
+ } catch (const bad_cmd_get& e) {
bufferlist bl;
mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
return true;
bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
{
op->mark_mdsmon_event(__func__);
- MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
+ auto m = op->get_req<MMDSBeacon>();
MDSMap::DaemonState state = m->get_state();
mds_gid_t gid = m->get_global_id();
version_t seq = m->get_seq();
const auto &fsmap = get_fsmap();
// check privileges, ignore if fails
- MonSession *session = m->get_session();
- assert(session);
+ MonSession *session = op->get_session();
+ if (!session)
+ goto ignore;
if (!session->is_capable("mds", MON_CAP_X)) {
dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
<< session->caps << dendl;
}
dout(5) << "preprocess_beacon " << *m
- << " from " << m->get_orig_source_inst()
+ << " from " << m->get_orig_source()
+ << " " << m->get_orig_source_addrs()
<< " " << m->get_compat()
<< dendl;
// check compat
if (!m->get_compat().writeable(fsmap.compat)) {
- dout(1) << " mds " << m->get_source_inst() << " can't write to fsmap " << fsmap.compat << dendl;
+ dout(1) << " mds " << m->get_orig_source()
+ << " " << m->get_orig_source_addrs()
+ << " can't write to fsmap " << fsmap.compat << dendl;
goto ignore;
}
MDSMap null_map;
null_map.epoch = fsmap.epoch;
null_map.compat = fsmap.compat;
- mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
+ auto m = make_message<MMDSMap>(mon->monmap->fsid, null_map);
+ mon->send_reply(op, m.detach());
return true;
} else {
return false; // not booted yet.
dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
info = fsmap.get_info_gid(gid);
+ if (state == MDSMap::STATE_DNE) {
+ return false;
+ }
+
// old seq?
if (info.state_seq > seq) {
dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
// ignore, already booted.
goto ignore;
}
+
+ // did the join_fscid change
+ if (m->get_fs().size()) {
+ fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
+ auto f = fsmap.get_filesystem(m->get_fs());
+ if (f) {
+ fscid = f->fscid;
+ }
+ if (info.join_fscid != fscid) {
+ dout(10) << __func__ << " standby mds_join_fs changed to " << fscid
+ << " (" << m->get_fs() << ")" << dendl;
+ _note_beacon(m);
+ return false;
+ }
+ } else {
+ if (info.join_fscid != FS_CLUSTER_ID_NONE) {
+ dout(10) << __func__ << " standby mds_join_fs was cleared" << dendl;
+ _note_beacon(m);
+ return false;
+ }
+ }
+
// is there a state change here?
if (info.state != state) {
// legal state change?
reply:
// note time and reply
- assert(effective_epoch > 0);
+ ceph_assert(effective_epoch > 0);
_note_beacon(m);
- mon->send_reply(op,
- new MMDSBeacon(mon->monmap->fsid, m->get_global_id(), m->get_name(),
- effective_epoch, state, seq,
- CEPH_FEATURES_SUPPORTED_DEFAULT));
+ {
+ auto beacon = make_message<MMDSBeacon>(mon->monmap->fsid,
+ m->get_global_id(), m->get_name(), effective_epoch,
+ state, seq, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ mon->send_reply(op, beacon.detach());
+ }
return true;
ignore:
bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op)
{
op->mark_mdsmon_event(__func__);
- MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
+ auto m = op->get_req<MMDSLoadTargets>();
dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
const auto &fsmap = get_fsmap();
// check privileges, ignore message if fails
- MonSession *session = m->get_session();
+ MonSession *session = op->get_session();
if (!session)
goto ignore;
if (!session->is_capable("mds", MON_CAP_X)) {
bool MDSMonitor::prepare_update(MonOpRequestRef op)
{
op->mark_mdsmon_event(__func__);
- PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
+ auto m = op->get_req<PaxosServiceMessage>();
dout(7) << "prepare_update " << *m << dendl;
switch (m->get_type()) {
case MSG_MON_COMMAND:
try {
return prepare_command(op);
- }
- catch (const bad_cmd_get& e) {
+ } catch (const bad_cmd_get& e) {
bufferlist bl;
mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
return true;
bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
{
op->mark_mdsmon_event(__func__);
- MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
+ auto m = op->get_req<MMDSBeacon>();
// -- this is an update --
- dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source_inst() << dendl;
- entity_addr_t addr = m->get_orig_source_inst().addr;
+ dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source()
+ << " " << m->get_orig_source_addrs() << dendl;
+ entity_addrvec_t addrs = m->get_orig_source_addrs();
mds_gid_t gid = m->get_global_id();
MDSMap::DaemonState state = m->get_state();
version_t seq = m->get_seq();
for (const auto &new_metric: new_health) {
if (old_types.count(new_metric.type) == 0) {
- dout(10) << "MDS health message (" << m->get_orig_source_inst().name
+ dout(10) << "MDS health message (" << m->get_orig_source()
<< "): " << new_metric.sev << " " << new_metric.message << dendl;
}
}
for (const auto &old_metric : old_health) {
if (new_types.count(old_metric.type) == 0) {
mon->clog->info() << "MDS health message cleared ("
- << m->get_orig_source_inst().name << "): " << old_metric.message;
+ << m->get_orig_source() << "): " << old_metric.message;
}
}
// boot?
if (state == MDSMap::STATE_BOOT) {
// zap previous instance of this name?
- if (g_conf->mds_enforce_unique_name) {
+ if (g_conf()->mds_enforce_unique_name) {
bool failed_mds = false;
while (mds_gid_t existing = pending.find_mds_gid_by_name(m->get_name())) {
if (!mon->osdmon()->is_writeable()) {
failed_mds = true;
}
if (failed_mds) {
- assert(mon->osdmon()->is_writeable());
+ ceph_assert(mon->osdmon()->is_writeable());
request_proposal(mon->osdmon());
}
}
MDSMap::mds_info_t new_info;
new_info.global_id = gid;
new_info.name = m->get_name();
- new_info.addr = addr;
+ new_info.addrs = addrs;
new_info.mds_features = m->get_mds_features();
new_info.state = MDSMap::STATE_STANDBY;
new_info.state_seq = seq;
- new_info.standby_for_rank = m->get_standby_for_rank();
- new_info.standby_for_name = m->get_standby_for_name();
- new_info.standby_for_fscid = m->get_standby_for_fscid();
- new_info.standby_replay = m->get_standby_replay();
pending.insert(new_info);
- }
-
- // Resolve standby_for_name to a rank
- const MDSMap::mds_info_t &info = pending.get_info_gid(gid);
- if (!info.standby_for_name.empty()) {
- const MDSMap::mds_info_t *leaderinfo = pending.find_by_name(
- info.standby_for_name);
- if (leaderinfo && (leaderinfo->rank >= 0)) {
- const auto &fscid = pending.mds_roles.at(leaderinfo->global_id);
-
- pending.modify_daemon(gid, [fscid, leaderinfo](
- MDSMap::mds_info_t *info) {
- info->standby_for_rank = leaderinfo->rank;
- info->standby_for_fscid = fscid;
- });
+ if (m->get_fs().size()) {
+ fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
+ auto f = pending.get_filesystem(m->get_fs());
+ if (f) {
+ fscid = f->fscid;
+ }
+ new_info.join_fscid = fscid;
}
}
* know which FS it was part of. Nor does this matter. Sending an empty
* MDSMap is sufficient for getting the MDS to respawn.
*/
- wait_for_finished_proposal(op, new FunctionContext([op, this](int r){
+ wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
if (r >= 0) {
const auto& fsmap = get_fsmap();
MDSMap null_map;
null_map.epoch = fsmap.epoch;
null_map.compat = fsmap.compat;
- mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
+ auto m = make_message<MMDSMap>(mon->monmap->fsid, null_map);
+ mon->send_reply(op, m.detach());
} else {
dispatch(op); // try again
}
return true;
}
- const MDSMap::mds_info_t &info = pending.get_info_gid(gid);
- // Old MDS daemons don't mention that they're standby replay until
- // after they've sent their boot beacon, so update this field.
- if (info.standby_replay != m->get_standby_replay()) {
- pending.modify_daemon(info.global_id, [&m](
- MDSMap::mds_info_t *i)
- {
- i->standby_replay = m->get_standby_replay();
- });
- }
-
+ const auto& info = pending.get_info_gid(gid);
if (info.state == MDSMap::STATE_STOPPING &&
state != MDSMap::STATE_STOPPING &&
state != MDSMap::STATE_STOPPED) {
}
if (info.laggy()) {
- dout(1) << "prepare_beacon clearing laggy flag on " << addr << dendl;
- pending.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info)
+ dout(1) << "prepare_beacon clearing laggy flag on " << addrs << dendl;
+ pending.modify_daemon(info.global_id, [](auto& info)
{
- info->clear_laggy();
+ info.clear_laggy();
}
);
}
-
+
dout(5) << "prepare_beacon mds." << info.rank
<< " " << ceph_mds_state_name(info.state)
<< " -> " << ceph_mds_state_name(state)
- << " standby_for_rank=" << m->get_standby_for_rank()
<< dendl;
+
+ fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
+ if (m->get_fs().size()) {
+ auto f = pending.get_filesystem(m->get_fs());
+ if (f) {
+ fscid = f->fscid;
+ }
+ }
+ pending.modify_daemon(gid, [fscid](auto& info) {
+ info.join_fscid = fscid;
+ });
+
if (state == MDSMap::STATE_STOPPED) {
const auto fscid = pending.mds_roles.at(gid);
const auto &fs = pending.get_filesystem(fscid);
mon->clog->info() << info.human_name() << " finished "
- << "deactivating rank " << info.rank << " in filesystem "
+ << "stopping rank " << info.rank << " in filesystem "
<< fs->mds_map.fs_name << " (now has "
<< fs->mds_map.get_num_in_mds() - 1 << " ranks)";
auto erased = pending.stop(gid);
erased.push_back(gid);
- for (const auto &erased_gid : erased) {
+ for (const auto& erased_gid : erased) {
last_beacon.erase(erased_gid);
if (pending_daemon_health.count(erased_gid)) {
pending_daemon_health.erase(erased_gid);
<< info.rank << " damaged" << dendl;
utime_t until = ceph_clock_now();
- until += g_conf->get_val<double>("mon_mds_blacklist_interval");
- const auto blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
+ until += g_conf().get_val<double>("mon_mds_blacklist_interval");
+ const auto blacklist_epoch = mon->osdmon()->blacklist(info.addrs, until);
request_proposal(mon->osdmon());
pending.damaged(gid, blacklist_epoch);
last_beacon.erase(gid);
// Respond to MDS, so that it knows it can continue to shut down
- mon->send_reply(op,
- new MMDSBeacon(
+ auto beacon = make_message<MMDSBeacon>(
mon->monmap->fsid, m->get_global_id(),
m->get_name(), pending.get_epoch(), state, seq,
- CEPH_FEATURES_SUPPORTED_DEFAULT));
+ CEPH_FEATURES_SUPPORTED_DEFAULT);
+ mon->send_reply(op, beacon.detach());
} else if (state == MDSMap::STATE_DNE) {
if (!mon->osdmon()->is_writeable()) {
dout(1) << __func__ << ": DNE from rank " << info.rank
}
fail_mds_gid(pending, gid);
- assert(mon->osdmon()->is_writeable());
+ ceph_assert(mon->osdmon()->is_writeable());
request_proposal(mon->osdmon());
// Respond to MDS, so that it knows it can continue to shut down
- mon->send_reply(op,
- new MMDSBeacon(
- mon->monmap->fsid, m->get_global_id(),
- m->get_name(), pending.get_epoch(), state, seq,
- CEPH_FEATURES_SUPPORTED_DEFAULT));
+ auto beacon = make_message<MMDSBeacon>(mon->monmap->fsid,
+ m->get_global_id(), m->get_name(), pending.get_epoch(), state, seq,
+ CEPH_FEATURES_SUPPORTED_DEFAULT);
+ mon->send_reply(op, beacon.detach());
} else if (info.state == MDSMap::STATE_STANDBY && state != info.state) {
// Standby daemons should never modify their own
// state. Reject any attempts to do so.
// Made it through special cases and validations, record the
// daemon's reported state to the FSMap.
- pending.modify_daemon(gid, [state, seq](MDSMap::mds_info_t *info) {
- info->state = state;
- info->state_seq = seq;
+ pending.modify_daemon(gid, [state, seq](auto& info) {
+ info.state = state;
+ info.state_seq = seq;
});
}
}
dout(5) << "prepare_beacon pending map now:" << dendl;
print_map(pending);
- wait_for_finished_proposal(op, new FunctionContext([op, this](int r){
+ wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
if (r >= 0)
_updated(op); // success
else if (r == -ECANCELED) {
auto &pending = get_pending_fsmap_writeable();
op->mark_mdsmon_event(__func__);
- MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
+ auto m = op->get_req<MMDSLoadTargets>();
mds_gid_t gid = m->global_id;
if (pending.gid_has_rank(gid)) {
dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
{
const auto &fsmap = get_fsmap();
op->mark_mdsmon_event(__func__);
- MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
+ auto m = op->get_req<MMDSBeacon>();
dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
- mon->clog->debug() << m->get_orig_source_inst() << " "
- << ceph_mds_state_name(m->get_state());
+ mon->clog->debug() << m->get_orig_source() << " "
+ << m->get_orig_source_addrs() << " "
+ << ceph_mds_state_name(m->get_state());
if (m->get_state() == MDSMap::STATE_STOPPED) {
// send the map manually (they're out of the map, so they won't get it automatic)
MDSMap null_map;
null_map.epoch = fsmap.epoch;
null_map.compat = fsmap.compat;
- mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
+ auto m = make_message<MMDSMap>(mon->monmap->fsid, null_map);
+ mon->send_reply(op, m.detach());
} else {
- mon->send_reply(op, new MMDSBeacon(mon->monmap->fsid,
- m->get_global_id(),
- m->get_name(),
- fsmap.get_epoch(),
- m->get_state(),
- m->get_seq(),
- CEPH_FEATURES_SUPPORTED_DEFAULT));
+ auto beacon = make_message<MMDSBeacon>(mon->monmap->fsid,
+ m->get_global_id(), m->get_name(), fsmap.get_epoch(),
+ m->get_state(), m->get_seq(), CEPH_FEATURES_SUPPORTED_DEFAULT);
+ mon->send_reply(op, beacon.detach());
}
}
// Calls tick() and then, only when is_leader() is true, writes the current
// FSMap (get_fsmap()) to mon->clog at debug level, prefixed with "fsmap ".
// This hunk removes the update_logger() call that used to follow tick().
void MDSMonitor::on_active()
{
tick();
-update_logger();
if (is_leader()) {
mon->clog->debug() << "fsmap " << get_fsmap();
}
}
-void MDSMonitor::get_health(list<pair<health_status_t, string> >& summary,
- list<pair<health_status_t, string> > *detail,
- CephContext* cct) const
-{
- const auto &fsmap = get_fsmap();
-
- fsmap.get_health(summary, detail);
-
- // For each MDS GID...
- const auto &info_map = fsmap.get_mds_info();
- for (const auto &i : info_map) {
- const auto &gid = i.first;
- const auto &info = i.second;
-
- // Decode MDSHealth
- bufferlist bl;
- mon->store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
- if (!bl.length()) {
- derr << "Missing health data for MDS " << gid << dendl;
- continue;
- }
- MDSHealth health;
- bufferlist::iterator bl_i = bl.begin();
- health.decode(bl_i);
-
- for (const auto &metric : health.metrics) {
- const int rank = info.rank;
- std::ostringstream message;
- message << "mds" << rank << ": " << metric.message;
- summary.push_back(std::make_pair(metric.sev, message.str()));
-
- if (detail) {
- // There is no way for us to clealy associate detail entries with summary entries (#7192), so
- // we duplicate the summary message in the detail string and tag the metadata on.
- std::ostringstream detail_message;
- detail_message << message.str();
- if (metric.metadata.size()) {
- detail_message << "(";
- auto k = metric.metadata.begin();
- while (k != metric.metadata.end()) {
- detail_message << k->first << ": " << k->second;
- if (boost::next(k) != metric.metadata.end()) {
- detail_message << ", ";
- }
- ++k;
- }
- detail_message << ")";
- }
- detail->push_back(std::make_pair(metric.sev, detail_message.str()));
- }
- }
- }
-}
-
void MDSMonitor::dump_info(Formatter *f)
{
f->open_object_section("fsmap");
bool MDSMonitor::preprocess_command(MonOpRequestRef op)
{
op->mark_mdsmon_event(__func__);
- MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
+ auto m = op->get_req<MMonCommand>();
int r = -1;
bufferlist rdata;
stringstream ss, ds;
- map<string, cmd_vartype> cmdmap;
const auto &fsmap = get_fsmap();
+ cmdmap_t cmdmap;
if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
// ss has reason for failure
string rs = ss.str();
}
string prefix;
- cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+ cmd_getval(cmdmap, "prefix", prefix);
string format;
- cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
+ cmd_getval(cmdmap, "format", format, string("plain"));
std::unique_ptr<Formatter> f(Formatter::create(format));
- MonSession *session = m->get_session();
+ MonSession *session = op->get_session();
if (!session) {
mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
return true;
ds << fsmap;
}
r = 0;
- } else if (prefix == "mds dump") {
- int64_t epocharg;
- epoch_t epoch;
-
- const FSMap *fsmapp = &get_fsmap();
- FSMap dummy;
- if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
- epoch = epocharg;
- bufferlist b;
- int err = get_version(epoch, b);
- if (err == -ENOENT) {
- r = -ENOENT;
- goto out;
+ } else if (prefix == "mds ok-to-stop") {
+ vector<string> ids;
+ if (!cmd_getval(cmdmap, "ids", ids)) {
+ r = -EINVAL;
+ ss << "must specify mds id";
+ goto out;
+ }
+ if (fsmap.is_any_degraded()) {
+ ss << "one or more filesystems is currently degraded";
+ r = -EBUSY;
+ goto out;
+ }
+ set<mds_gid_t> stopping;
+ for (auto& id : ids) {
+ ostringstream ess;
+ mds_gid_t gid = gid_from_arg(fsmap, id, ess);
+ if (gid == MDS_GID_NONE) {
+ // the mds doesn't exist, but no file systems are unhappy, so losing it
+ // can't have any effect.
+ continue;
+ }
+ stopping.insert(gid);
+ }
+ set<mds_gid_t> active;
+ set<mds_gid_t> standby;
+ for (auto gid : stopping) {
+ if (fsmap.gid_has_rank(gid)) {
+ // ignore standby-replay daemons (at this level)
+ if (!fsmap.is_standby_replay(gid)) {
+ auto standby = fsmap.get_standby_replay(gid);
+ if (standby == MDS_GID_NONE ||
+ stopping.count(standby)) {
+ // no standby-replay, or we're also stopping the standby-replay
+ // for this mds
+ active.insert(gid);
+ }
+ }
} else {
- assert(err == 0);
- assert(b.length());
- dummy.decode(b);
- fsmapp = &dummy;
+ // net loss of a standby
+ standby.insert(gid);
}
}
-
- stringstream ds;
- const MDSMap *mdsmapp = nullptr;
- MDSMap blank;
- blank.epoch = fsmapp->epoch;
- if (fsmapp->legacy_client_fscid != FS_CLUSTER_ID_NONE) {
- mdsmapp = &fsmapp->filesystems.at(fsmapp->legacy_client_fscid)->mds_map;
- } else {
- mdsmapp = ␣
- }
- if (f != NULL) {
- f->open_object_section("mdsmap");
- mdsmapp->dump(f.get());
- f->close_section();
- f->flush(ds);
- r = 0;
- } else {
- mdsmapp->print(ds);
- r = 0;
+ if (fsmap.get_num_standby() - standby.size() < active.size()) {
+ r = -EBUSY;
+ ss << "insufficent standby MDS daemons to stop active gids "
+ << stringify(active)
+ << " and/or standby gids " << stringify(standby);;
+ goto out;
}
-
- rdata.append(ds);
- ss << "dumped fsmap epoch " << fsmapp->get_epoch();
+ r = 0;
+ ss << "should be safe to stop " << ids;
} else if (prefix == "fs dump") {
int64_t epocharg;
epoch_t epoch;
const FSMap *fsmapp = &fsmap;
FSMap dummy;
- if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
+ if (cmd_getval(cmdmap, "epoch", epocharg)) {
epoch = epocharg;
bufferlist b;
int err = get_version(epoch, b);
r = -ENOENT;
goto out;
} else {
- assert(err == 0);
- assert(b.length());
+ ceph_assert(err == 0);
+ ceph_assert(b.length());
dummy.decode(b);
fsmapp = &dummy;
}
f.reset(Formatter::create("json-pretty"));
string who;
- bool all = !cmd_getval(g_ceph_context, cmdmap, "who", who);
+ bool all = !cmd_getval(cmdmap, "who", who);
dout(1) << "all = " << all << dendl;
if (all) {
r = 0;
if (!f)
f.reset(Formatter::create("json-pretty"));
string field;
- cmd_getval(g_ceph_context, cmdmap, "property", field);
+ cmd_getval(cmdmap, "property", field);
count_metadata(field, f.get());
f->flush(ds);
r = 0;
- } else if (prefix == "mds getmap") {
- epoch_t e;
- int64_t epocharg;
- bufferlist b;
- if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
- e = epocharg;
- int err = get_version(e, b);
- if (err == -ENOENT) {
- r = -ENOENT;
- } else {
- assert(err == 0);
- assert(b.length());
- FSMap mm;
- mm.decode(b);
- mm.encode(rdata, m->get_connection()->get_features());
- ss << "got fsmap epoch " << mm.get_epoch();
- r = 0;
- }
- } else {
- fsmap.encode(rdata, m->get_connection()->get_features());
- ss << "got fsmap epoch " << fsmap.get_epoch();
- r = 0;
- }
} else if (prefix == "mds compat show") {
if (f) {
f->open_object_section("mds_compat");
r = 0;
} else if (prefix == "fs get") {
string fs_name;
- cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
+ cmd_getval(cmdmap, "fs_name", fs_name);
const auto &fs = fsmap.get_filesystem(fs_name);
if (fs == nullptr) {
ss << "filesystem '" << fs_name << "' not found";
bool MDSMonitor::fail_mds_gid(FSMap &fsmap, mds_gid_t gid)
{
- const MDSMap::mds_info_t &info = fsmap.get_info_gid(gid);
+ const auto& info = fsmap.get_info_gid(gid);
dout(1) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
ceph_assert(mon->osdmon()->is_writeable());
epoch_t blacklist_epoch = 0;
if (info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY) {
utime_t until = ceph_clock_now();
- until += g_conf->get_val<double>("mon_mds_blacklist_interval");
- blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
+ until += g_conf().get_val<double>("mon_mds_blacklist_interval");
+ blacklist_epoch = mon->osdmon()->blacklist(info.addrs, until);
}
fsmap.erase(gid, blacklist_epoch);
if (r == 0) {
// See if a GID is assigned to this role
const auto &fs = fsmap.get_filesystem(role.fscid);
- assert(fs != nullptr); // parse_role ensures it exists
+ ceph_assert(fs != nullptr); // parse_role ensures it exists
if (fs->mds_map.is_up(role.rank)) {
dout(10) << __func__ << ": validated rank/GID " << role
<< " as a rank" << dendl;
int MDSMonitor::fail_mds(FSMap &fsmap, std::ostream &ss,
const std::string &arg, MDSMap::mds_info_t *failed_info)
{
- assert(failed_info != nullptr);
+ ceph_assert(failed_info != nullptr);
mds_gid_t gid = gid_from_arg(fsmap, arg, ss);
if (gid == MDS_GID_NONE) {
fail_mds_gid(fsmap, gid);
ss << "failed mds gid " << gid;
- assert(mon->osdmon()->is_writeable());
+ ceph_assert(mon->osdmon()->is_writeable());
request_proposal(mon->osdmon());
return 0;
}
bool MDSMonitor::prepare_command(MonOpRequestRef op)
{
op->mark_mdsmon_event(__func__);
- MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
+ auto m = op->get_req<MMonCommand>();
int r = -EINVAL;
stringstream ss;
bufferlist rdata;
- map<string, cmd_vartype> cmdmap;
+ cmdmap_t cmdmap;
if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
string rs = ss.str();
mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
}
string prefix;
- cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+ cmd_getval(cmdmap, "prefix", prefix);
/* Refuse access if message not associated with a valid session */
- MonSession *session = m->get_session();
+ MonSession *session = op->get_session();
if (!session) {
mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
return true;
goto out;
}
- // Only handle legacy commands if there is a filesystem configured
- if (pending.legacy_client_fscid == FS_CLUSTER_ID_NONE) {
- if (pending.filesystems.size() == 0) {
- ss << "No filesystem configured: use `ceph fs new` to create a filesystem";
- } else {
- ss << "No filesystem set for use with legacy commands";
- }
- r = -EINVAL;
- goto out;
- }
-
- r = legacy_filesystem_command(pending, op, prefix, cmdmap, ss);
-
if (r == -ENOSYS && ss.str().empty()) {
ss << "unrecognized command";
}
FSMap &fsmap,
MonOpRequestRef op,
std::string const &prefix,
- map<string, cmd_vartype> &cmdmap,
+ const cmdmap_t& cmdmap,
std::stringstream &ss)
{
dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
op->mark_mdsmon_event(__func__);
int r = 0;
string whostr;
- cmd_getval(g_ceph_context, cmdmap, "who", whostr);
+ cmd_getval(cmdmap, "role", whostr);
- if (prefix == "mds stop" ||
- prefix == "mds deactivate") {
- mds_role_t role;
- r = fsmap.parse_role(whostr, &role, ss);
- if (r < 0 ) {
- return r;
- }
- const auto &fs = fsmap.get_filesystem(role.fscid);
-
- if (!fs->mds_map.is_active(role.rank)) {
- r = -EEXIST;
- ss << "mds." << role << " not active ("
- << ceph_mds_state_name(fs->mds_map.get_state(role.rank)) << ")";
- } else if (fs->mds_map.get_root() == role.rank ||
- fs->mds_map.get_tableserver() == role.rank) {
- r = -EINVAL;
- ss << "can't tell the root (" << fs->mds_map.get_root()
- << ") or tableserver (" << fs->mds_map.get_tableserver()
- << ") to deactivate";
- } else if (role.rank != fs->mds_map.get_last_in_mds()) {
- r = -EINVAL;
- ss << "mds." << role << " doesn't have the max rank ("
- << fs->mds_map.get_last_in_mds() << ")";
- } else if (fs->mds_map.get_num_in_mds() <= size_t(fs->mds_map.get_max_mds())) {
- r = -EBUSY;
- ss << "must decrease max_mds or else MDS will immediately reactivate";
- } else {
- r = 0;
- mds_gid_t gid = fs->mds_map.up.at(role.rank);
- ss << "telling mds." << role << " "
- << fsmap.get_info_gid(gid).addr << " to deactivate";
-
- fsmap.modify_daemon(gid, [](MDSMap::mds_info_t *info) {
- info->state = MDSMap::STATE_STOPPING;
- });
- }
- } else if (prefix == "mds set_state") {
+ if (prefix == "mds set_state") {
mds_gid_t gid;
- if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
+ if (!cmd_getval(cmdmap, "gid", gid)) {
ss << "error parsing 'gid' value '"
- << cmd_vartype_stringify(cmdmap["gid"]) << "'";
+ << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
return -EINVAL;
}
MDSMap::DaemonState state;
- if (!cmd_getval(g_ceph_context, cmdmap, "state", state)) {
+ if (!cmd_getval(cmdmap, "state", state)) {
ss << "error parsing 'state' string value '"
- << cmd_vartype_stringify(cmdmap["state"]) << "'";
+ << cmd_vartype_stringify(cmdmap.at("state")) << "'";
return -EINVAL;
}
if (fsmap.gid_exists(gid)) {
- fsmap.modify_daemon(gid, [state](MDSMap::mds_info_t *info) {
- info->state = state;
+ fsmap.modify_daemon(gid, [state](auto& info) {
+ info.state = state;
});
ss << "set mds gid " << gid << " to state " << state << " "
<< ceph_mds_state_name(state);
}
} else if (prefix == "mds fail") {
string who;
- cmd_getval(g_ceph_context, cmdmap, "who", who);
+ cmd_getval(cmdmap, "role_or_gid", who);
MDSMap::mds_info_t failed_info;
r = fail_mds(fsmap, ss, who, &failed_info);
}
} else if (prefix == "mds rm") {
mds_gid_t gid;
- if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
+ if (!cmd_getval(cmdmap, "gid", gid)) {
ss << "error parsing 'gid' value '"
- << cmd_vartype_stringify(cmdmap["gid"]) << "'";
+ << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
return -EINVAL;
}
if (!fsmap.gid_exists(gid)) {
- ss << "mds gid " << gid << " dne";
+ ss << "mds gid " << gid << " does not exist";
r = 0;
} else {
const auto &info = fsmap.get_info_gid(gid);
}
}
} else if (prefix == "mds rmfailed") {
- string confirm;
- if (!cmd_getval(g_ceph_context, cmdmap, "confirm", confirm) ||
- confirm != "--yes-i-really-mean-it") {
+ bool confirm = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
+ if (!confirm) {
ss << "WARNING: this can make your filesystem inaccessible! "
"Add --yes-i-really-mean-it if you are sure you wish to continue.";
return -EPERM;
}
std::string role_str;
- cmd_getval(g_ceph_context, cmdmap, "who", role_str);
+ cmd_getval(cmdmap, "role", role_str);
mds_role_t role;
int r = fsmap.parse_role(role_str, &role, ss);
if (r < 0) {
return 0;
} else if (prefix == "mds compat rm_compat") {
int64_t f;
- if (!cmd_getval(g_ceph_context, cmdmap, "feature", f)) {
+ if (!cmd_getval(cmdmap, "feature", f)) {
ss << "error parsing feature value '"
- << cmd_vartype_stringify(cmdmap["feature"]) << "'";
+ << cmd_vartype_stringify(cmdmap.at("feature")) << "'";
return -EINVAL;
}
if (fsmap.compat.compat.contains(f)) {
r = 0;
} else if (prefix == "mds compat rm_incompat") {
int64_t f;
- if (!cmd_getval(g_ceph_context, cmdmap, "feature", f)) {
+ if (!cmd_getval(cmdmap, "feature", f)) {
ss << "error parsing feature value '"
- << cmd_vartype_stringify(cmdmap["feature"]) << "'";
+ << cmd_vartype_stringify(cmdmap.at("feature")) << "'";
return -EINVAL;
}
if (fsmap.compat.incompat.contains(f)) {
r = 0;
} else if (prefix == "mds repaired") {
std::string role_str;
- cmd_getval(g_ceph_context, cmdmap, "rank", role_str);
+ cmd_getval(cmdmap, "role", role_str);
mds_role_t role;
r = fsmap.parse_role(role_str, &role, ss);
if (r < 0) {
bool modified = fsmap.undamaged(role.fscid, role.rank);
if (modified) {
- dout(1) << "repaired: restoring rank " << role << dendl;
+ ss << "repaired: restoring rank " << role;
} else {
- dout(1) << "repaired: no-op on rank " << role << dendl;
+ ss << "nothing to do: rank is not damaged";
}
r = 0;
- } else {
- return -ENOSYS;
- }
-
- return r;
-}
-
-/**
- * Helper to legacy_filesystem_command
- */
-void MDSMonitor::modify_legacy_filesystem(FSMap &fsmap,
- std::function<void(std::shared_ptr<Filesystem> )> fn)
-{
- fsmap.modify_filesystem(
- fsmap.legacy_client_fscid,
- fn
- );
-}
-
-
-
-/**
- * Handle a command that affects the filesystem (i.e. a filesystem
- * must exist for the command to act upon).
- *
- * @retval 0 Command was successfully handled and has side effects
- * @retval -EAGAIN Messages has been requeued for retry
- * @retval -ENOSYS Unknown command
- * @retval < 0 An error has occurred; **ss** may have been set.
- */
-int MDSMonitor::legacy_filesystem_command(
- FSMap &fsmap,
- MonOpRequestRef op,
- std::string const &prefix,
- map<string, cmd_vartype> &cmdmap,
- std::stringstream &ss)
-{
- dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
- op->mark_mdsmon_event(__func__);
- int r = 0;
- string whostr;
- cmd_getval(g_ceph_context, cmdmap, "who", whostr);
-
- assert (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE);
-
- if (prefix == "mds set_max_mds") {
- // NOTE: deprecated by "fs set max_mds"
- int64_t maxmds;
- if (!cmd_getval(g_ceph_context, cmdmap, "maxmds", maxmds) || maxmds <= 0) {
+ } else if (prefix == "mds freeze") {
+ std::string who;
+ cmd_getval(cmdmap, "role_or_gid", who);
+ mds_gid_t gid = gid_from_arg(fsmap, who, ss);
+ if (gid == MDS_GID_NONE) {
return -EINVAL;
}
- const MDSMap& mdsmap =
- fsmap.filesystems.at(fsmap.legacy_client_fscid)->mds_map;
-
- if (!mdsmap.allows_multimds() &&
- maxmds > mdsmap.get_max_mds() &&
- maxmds > 1) {
- ss << "multi-MDS clusters are not enabled; set 'allow_multimds' to enable";
- return -EINVAL;
- }
-
- if (maxmds > MAX_MDS) {
- ss << "may not have more than " << MAX_MDS << " MDS ranks";
- return -EINVAL;
- }
-
- modify_legacy_filesystem(fsmap,
- [maxmds](std::shared_ptr<Filesystem> fs)
+ bool freeze = false;
{
- fs->mds_map.set_max_mds(maxmds);
- });
+ std::string str;
+ cmd_getval(cmdmap, "val", str);
+ if ((r = parse_bool(str, &freeze, ss)) != 0) {
+ return r;
+ }
+ }
- r = 0;
- ss << "max_mds = " << maxmds;
- } else if (prefix == "mds cluster_down") {
- // NOTE: deprecated by "fs set cluster_down"
- modify_legacy_filesystem(fsmap,
- [](std::shared_ptr<Filesystem> fs)
- {
- fs->mds_map.set_flag(CEPH_MDSMAP_DOWN);
- });
- ss << "marked fsmap DOWN";
- r = 0;
- } else if (prefix == "mds cluster_up") {
- // NOTE: deprecated by "fs set cluster_up"
- modify_legacy_filesystem(fsmap,
- [](std::shared_ptr<Filesystem> fs)
- {
- fs->mds_map.clear_flag(CEPH_MDSMAP_DOWN);
- });
- ss << "unmarked fsmap DOWN";
+ auto f = [freeze,gid,&ss](auto& info) {
+ if (freeze) {
+ ss << "freezing mds." << gid;
+ info.freeze();
+ } else {
+ ss << "unfreezing mds." << gid;
+ info.unfreeze();
+ }
+ };
+ fsmap.modify_daemon(gid, f);
r = 0;
} else {
return -ENOSYS;
return r;
}
-
void MDSMonitor::check_subs()
{
- std::list<std::string> types;
-
// Subscriptions may be to "mdsmap" (MDS and legacy clients),
// "mdsmap.<namespace>", or to "fsmap" for the full state of all
// filesystems. Build a list of all the types we service
// subscriptions for.
- types.push_back("fsmap");
- types.push_back("fsmap.user");
- types.push_back("mdsmap");
+
+ std::vector<std::string> types = {
+ "fsmap",
+ "fsmap.user",
+ "mdsmap",
+ };
+
for (const auto &p : get_fsmap().filesystems) {
const auto &fscid = p.first;
- std::ostringstream oss;
- oss << "mdsmap." << fscid;
- types.push_back(oss.str());
+ CachedStackStringStream cos;
+ *cos << "mdsmap." << fscid;
+ types.push_back(std::string(cos->strv()));
}
for (const auto &type : types) {
- if (mon->session_map.subs.count(type) == 0)
+ auto& subs = mon->session_map.subs;
+ auto subs_it = subs.find(type);
+ if (subs_it == subs.end())
continue;
- xlist<Subscription*>::iterator p = mon->session_map.subs[type]->begin();
- while (!p.end()) {
- Subscription *sub = *p;
- ++p;
+ auto sub_it = subs_it->second->begin();
+ while (!sub_it.end()) {
+ auto sub = *sub_it;
+ ++sub_it; // N.B. check_sub may remove sub!
check_sub(sub);
}
}
return;
}
- const bool is_mds = sub->session->inst.name.is_mds();
+ const bool is_mds = sub->session->name.is_mds();
mds_gid_t mds_gid = MDS_GID_NONE;
fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
if (is_mds) {
// What (if any) namespace are you assigned to?
auto mds_info = fsmap.get_mds_info();
for (const auto &p : mds_info) {
- if (p.second.addr == sub->session->inst.addr) {
+ if (p.second.addrs == sub->session->addrs) {
mds_gid = p.first;
fscid = fsmap.mds_roles.at(mds_gid);
}
} else {
// You're a client. Did you request a particular
// namespace?
- if (sub->type.find("mdsmap.") == 0) {
+ if (sub->type.compare(0, 7, "mdsmap.") == 0) {
auto namespace_id_str = sub->type.substr(std::string("mdsmap.").size());
dout(10) << __func__ << ": namespace_id " << namespace_id_str << dendl;
std::string err;
null_map.compat = fsmap.compat;
if (fscid == FS_CLUSTER_ID_NONE) {
// For a client, we should have already dropped out
- assert(is_mds);
+ ceph_assert(is_mds);
auto it = fsmap.standby_daemons.find(mds_gid);
if (it != fsmap.standby_daemons.end()) {
mds_map = &fsmap.get_filesystem(fscid)->mds_map;
}
- assert(mds_map != nullptr);
+ ceph_assert(mds_map != nullptr);
dout(10) << __func__ << " selected MDS map epoch " <<
mds_map->epoch << " for namespace " << fscid << " for subscriber "
- << sub->session->inst.name << " who wants epoch " << sub->next << dendl;
+ << sub->session->name << " who wants epoch " << sub->next << dendl;
if (sub->next > mds_map->epoch) {
return;
}
- auto msg = new MMDSMap(mon->monmap->fsid, mds_map);
+ auto msg = make_message<MMDSMap>(mon->monmap->fsid, *mds_map);
- sub->session->con->send_message(msg);
+ sub->session->con->send_message(msg.detach());
if (sub->onetime) {
mon->session_map.remove_sub(sub);
} else {
MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
bufferlist bl;
- ::encode(pending_metadata, bl);
+ encode(pending_metadata, bl);
t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
paxos->trigger_propose();
}
if (!update)
return;
bufferlist bl;
- ::encode(pending_metadata, bl);
+ encode(pending_metadata, bl);
t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
}
bufferlist bl;
int r = mon->store->get(MDS_METADATA_PREFIX, "last_metadata", bl);
if (r) {
- dout(1) << "Unable to load 'last_metadata'" << dendl;
+ dout(5) << "Unable to load 'last_metadata'" << dendl;
return r;
}
- bufferlist::iterator it = bl.begin();
- ::decode(m, it);
+ auto it = bl.cbegin();
+ ceph::decode(m, it);
return 0;
}
int MDSMonitor::dump_metadata(const FSMap& fsmap, const std::string &who,
Formatter *f, ostream& err)
{
- assert(f);
+ ceph_assert(f);
mds_gid_t gid = gid_from_arg(fsmap, who, err);
if (gid == MDS_GID_NONE) {
int MDSMonitor::print_nodes(Formatter *f)
{
- assert(f);
+ ceph_assert(f);
const auto &fsmap = get_fsmap();
return r;
}
- map<string, list<int> > mdses; // hostname => rank
+ map<string, list<string> > mdses; // hostname => mds
for (const auto &p : metadata) {
const mds_gid_t& gid = p.first;
const Metadata& m = p.second;
continue;
}
const MDSMap::mds_info_t& mds_info = fsmap.get_info_gid(gid);
- // FIXME: include filesystem name with rank here
- mdses[hostname->second].push_back(mds_info.rank);
+ mdses[hostname->second].push_back(mds_info.name);
}
dump_services(f, mdses, "mds");
/**
* If a cluster is undersized (with respect to max_mds), then
- * attempt to find daemons to grow it.
+ * attempt to find daemons to grow it. If the cluster is oversized
+ * (with respect to max_mds) then shrink it by stopping its highest rank.
*/
-bool MDSMonitor::maybe_expand_cluster(FSMap &fsmap, fs_cluster_id_t fscid)
+bool MDSMonitor::maybe_resize_cluster(FSMap &fsmap, fs_cluster_id_t fscid)
{
- auto fs = fsmap.get_filesystem(fscid);
+ auto ¤t_mds_map = get_fsmap().get_filesystem(fscid)->mds_map;
+ auto&& fs = fsmap.get_filesystem(fscid);
auto &mds_map = fs->mds_map;
- if (fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
- return false;
- }
-
int in = mds_map.get_num_in_mds();
int max = mds_map.get_max_mds();
dout(20) << __func__ << " in " << in << " max " << max << dendl;
- if (in < max) {
+ /* Check that both the current epoch mds_map is resizeable as well as the
+ * current batch of changes in pending. This is important if an MDS is
+ * becoming active in the next epoch.
+ */
+ if (!current_mds_map.is_resizeable() ||
+ !mds_map.is_resizeable()) {
+ dout(5) << __func__ << " mds_map is not currently resizeable" << dendl;
+ return false;
+ }
+
+ if (in < max && !mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
mds_rank_t mds = mds_rank_t(0);
- string name;
while (mds_map.is_in(mds)) {
mds++;
}
- mds_gid_t newgid = fsmap.find_replacement_for({fscid, mds},
- name, g_conf->mon_force_standby_active);
- if (newgid == MDS_GID_NONE) {
+ auto info = fsmap.find_replacement_for({fscid, mds});
+ if (!info) {
return false;
}
- const auto &new_info = fsmap.get_info_gid(newgid);
- dout(1) << "assigned standby " << new_info.addr
+ dout(1) << "assigned standby " << info->addrs
<< " as mds." << mds << dendl;
-
- mon->clog->info() << new_info.human_name() << " assigned to "
+ mon->clog->info() << info->human_name() << " assigned to "
"filesystem " << mds_map.fs_name << " as rank "
<< mds << " (now has " << mds_map.get_num_in_mds() + 1
<< " ranks)";
- fsmap.promote(newgid, fs, mds);
+ fsmap.promote(info->global_id, *fs, mds);
return true;
+ } else if (in > max) {
+ mds_rank_t target = in - 1;
+ const auto &info = mds_map.get_info(target);
+ if (mds_map.is_active(target)) {
+ dout(1) << "stopping " << target << dendl;
+ mon->clog->info() << "stopping " << info.human_name();
+ auto f = [](auto& info) {
+ info.state = MDSMap::STATE_STOPPING;
+ };
+ fsmap.modify_daemon(info.global_id, f);
+ return true;
+ } else {
+ dout(20) << "skipping stop of " << target << dendl;
+ return false;
+ }
}
return false;
/**
- * If a daemon is laggy, and a suitable replacement
- * is available, fail this daemon (remove from map) and pass its
- * role to another daemon.
+ * Fail a daemon and replace it with a suitable standby.
+ *
+ * A frozen daemon is left alone. A standby or standby-replay daemon is
+ * simply failed. A daemon holding a rank is failed and rep_info is promoted
+ * into that rank; if rep_info is null or the filesystem is flagged
+ * CEPH_MDSMAP_NOT_JOINABLE, nothing is changed.
+ *
+ * @param fsmap map to modify
+ * @param gid daemon to drop; looked up with fsmap.mds_roles.at()
+ * @param rep_info replacement to promote into the rank, may be null
+ * @param osd_propose must not be null; OR-ed with the result of fail_mds_gid()
+ * @return true if the daemon was failed, false if fsmap was left untouched
*/
-void MDSMonitor::maybe_replace_gid(FSMap &fsmap, mds_gid_t gid,
- const MDSMap::mds_info_t& info, bool *mds_propose, bool *osd_propose)
+bool MDSMonitor::drop_mds(FSMap &fsmap, mds_gid_t gid, const mds_info_t* rep_info, bool *osd_propose)
{
- assert(mds_propose != nullptr);
- assert(osd_propose != nullptr);
+ ceph_assert(osd_propose != nullptr);
const auto fscid = fsmap.mds_roles.at(gid);
+ const auto& info = fsmap.get_info_gid(gid);
+ const auto rank = info.rank; // copied by value: still used after fail_mds_gid() below
+ const auto state = info.state; // copied by value, like rank
- // We will only take decisive action (replacing/removing a daemon)
- // if we have some indicating that some other daemon(s) are successfully
- // getting beacons through recently.
- mono_time latest_beacon = mono_clock::zero();
- for (const auto &p : last_beacon) {
- latest_beacon = std::max(p.second.stamp, latest_beacon);
- }
- mono_time now = mono_clock::now();
- chrono::duration<double> since = now-latest_beacon;
- const bool may_replace = since.count() <
- std::max(g_conf->mds_beacon_interval, g_conf->mds_beacon_grace * 0.5);
-
- // are we in?
- // and is there a non-laggy standby that can take over for us?
- mds_gid_t sgid;
- if (info.rank >= 0 &&
- info.state != MDSMap::STATE_STANDBY &&
- info.state != MDSMap::STATE_STANDBY_REPLAY &&
- may_replace &&
- !fsmap.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_DOWN) &&
- (sgid = fsmap.find_replacement_for({fscid, info.rank}, info.name,
- g_conf->mon_force_standby_active)) != MDS_GID_NONE)
- {
-
- MDSMap::mds_info_t si = fsmap.get_info_gid(sgid);
- dout(1) << " replacing " << gid << " " << info.addr << " mds."
- << info.rank << "." << info.inc
- << " " << ceph_mds_state_name(info.state)
- << " with " << sgid << "/" << si.name << " " << si.addr << dendl;
-
- mon->clog->warn() << info.human_name()
- << " is not responding, replacing it "
- << "as rank " << info.rank
- << " with standby " << si.human_name();
+ if (info.is_frozen()) { // frozen daemons are never dropped
+ return false;
+ } else if (state == MDSMap::STATE_STANDBY_REPLAY ||
+ state == MDSMap::STATE_STANDBY) {
+ dout(1) << " failing and removing standby " << gid << " " << info.addrs
+ << " mds." << rank
+ << "." << info.inc << " " << ceph_mds_state_name(state)
+ << dendl;
+ *osd_propose |= fail_mds_gid(fsmap, gid); // only ever sets the flag, never clears it
+ return true;
+ } else if (rank >= 0 && rep_info) { // holds a rank and the caller supplied a replacement
+ auto fs = fsmap.filesystems.at(fscid);
+ if (fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
+ return false; // leave the daemon in place rather than promote into this filesystem
+ }
+ // are we in?
+ // and is there a non-laggy standby that can take over for us?
+ dout(1) << " replacing " << gid << " " << info.addrs
+ << " mds." << rank << "." << info.inc
+ << " " << ceph_mds_state_name(state)
+ << " with " << rep_info->global_id << "/" << rep_info->name << " " << rep_info->addrs
+ << dendl;
- // Remember what NS the old one was in
- const fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
+ mon->clog->warn() << "Replacing " << info.human_name()
+ << " as rank " << rank
+ << " with standby " << rep_info->human_name();
// Remove the old one
*osd_propose |= fail_mds_gid(fsmap, gid);
// Promote the replacement
- auto fs = fsmap.filesystems.at(fscid);
- fsmap.promote(sgid, fs, info.rank);
-
- *mds_propose = true;
- } else if ((info.state == MDSMap::STATE_STANDBY_REPLAY ||
- info.state == MDSMap::STATE_STANDBY) && may_replace) {
- dout(1) << " failing and removing " << gid << " " << info.addr << " mds." << info.rank
- << "." << info.inc << " " << ceph_mds_state_name(info.state)
- << dendl;
- mon->clog->info() << "Standby " << info.human_name() << " is not "
- "responding, dropping it";
- fail_mds_gid(fsmap, gid);
- *mds_propose = true;
- } else if (!info.laggy()) {
- dout(1) << " marking " << gid << " " << info.addr << " mds." << info.rank << "." << info.inc
- << " " << ceph_mds_state_name(info.state)
- << " laggy" << dendl;
- fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info) {
- info->laggy_since = ceph_clock_now();
- });
- *mds_propose = true;
+ fsmap.promote(rep_info->global_id, *fs, rank);
+
+ return true;
}
+ return false; // rank holder without a replacement (or no rank): nothing changed
}
-bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, std::shared_ptr<Filesystem> &fs)
/**
+ * Check beacon liveness and standby affinity for every daemon in fsmap.
+ *
+ * Steps, in order:
+ *  - if the gap since last_tick exceeds (mds_beacon_grace - mds_beacon_interval),
+ *    reset every beacon stamp to now so a slow monitor does not fail daemons;
+ *  - make sure every gid in fsmap.mds_roles has a last_beacon entry;
+ *  - forget beacons of gids no longer in fsmap, mark daemons whose beacon is
+ *    older than mds_beacon_grace as laggy, and (if the OSDMap is writeable and
+ *    some other beacon arrived recently) drop them via drop_mds();
+ *  - if the OSDMap is writeable, replace at most one daemon per filesystem
+ *    with a standby that has better join_fscid affinity.
+ *
+ * @param fsmap map to examine and modify
+ * @param propose_osdmap OR-ed with the result of fail_mds_gid(), directly or
+ *        via drop_mds(); dereferenced unconditionally on those paths
+ * @return true if fsmap was modified
*/
+bool MDSMonitor::check_health(FSMap& fsmap, bool* propose_osdmap)
{
- assert(!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN));
-
bool do_propose = false;
+ const auto now = mono_clock::now();
+ const bool osdmap_writeable = mon->osdmon()->is_writeable();
+ // Read the beacon settings once so the whole pass uses one snapshot.
+ const auto mds_beacon_grace = g_conf().get_val<double>("mds_beacon_grace");
+ const auto mds_beacon_interval = g_conf().get_val<double>("mds_beacon_interval");
- // have a standby take over?
- set<mds_rank_t> failed;
- fs->mds_map.get_failed_mds_set(failed);
- if (!failed.empty()) {
- set<mds_rank_t>::iterator p = failed.begin();
- while (p != failed.end()) {
- mds_rank_t f = *p++;
- mds_gid_t sgid = fsmap.find_replacement_for({fs->fscid, f}, {},
- g_conf->mon_force_standby_active);
- if (sgid) {
- const MDSMap::mds_info_t si = fsmap.get_info_gid(sgid);
- dout(1) << " taking over failed mds." << f << " with " << sgid
- << "/" << si.name << " " << si.addr << dendl;
- mon->clog->info() << "Standby " << si.human_name()
- << " assigned to filesystem " << fs->mds_map.fs_name
- << " as rank " << f;
-
- fsmap.promote(sgid, fs, f);
- do_propose = true;
- }
- }
- } else if (!fs->mds_map.is_degraded()) {
- // There were no failures to replace, so try using any available standbys
- // as standby-replay daemons. Don't do this when the cluster is degraded
- // as a standby-replay daemon may try to read a journal being migrated.
+ if (mono_clock::is_zero(last_tick)) {
+ last_tick = now;
+ }
- // Take a copy of the standby GIDs so that we can iterate over
- // them while perhaps-modifying standby_daemons during the loop
- // (if we promote anyone they are removed from standby_daemons)
- std::vector<mds_gid_t> standby_gids;
- for (const auto &j : fsmap.standby_daemons) {
- standby_gids.push_back(j.first);
+ {
+ auto since_last = std::chrono::duration<double>(now-last_tick);
+
+ if (since_last.count() > (mds_beacon_grace-mds_beacon_interval)) {
+ // This case handles either local slowness (calls being delayed
+ // for whatever reason) or cluster election slowness (a long gap
+ // between calls while an election happened)
+ dout(1) << __func__ << ": resetting beacon timeouts due to mon delay "
+ "(slow election?) of " << since_last.count() << " seconds" << dendl;
+ for (auto& p : last_beacon) {
+ p.second.stamp = now;
+ }
}
+ }
- for (const auto &gid : standby_gids) {
- const auto &info = fsmap.standby_daemons.at(gid);
- assert(info.state == MDSMap::STATE_STANDBY);
+ // make sure last_beacon is fully populated
+ for (auto& p : fsmap.mds_roles) {
+ auto& gid = p.first;
+ last_beacon.emplace(std::piecewise_construct,
+ std::forward_as_tuple(gid),
+ std::forward_as_tuple(now, 0));
+ }
- if (!info.standby_replay) {
- continue;
- }
+ // We will only take decisive action (replacing/removing a daemon)
+ // if we have some indication that some other daemon(s) are successfully
+ // getting beacons through recently.
+ mono_time latest_beacon = mono_clock::zero();
+ for (const auto& p : last_beacon) {
+ latest_beacon = std::max(p.second.stamp, latest_beacon);
+ }
+ auto since = chrono::duration<double>(now-latest_beacon);
+ const bool may_replace = since.count() <
+ std::max(mds_beacon_interval, mds_beacon_grace * 0.5);
- /*
- * This mds is standby but has no rank assigned.
- * See if we can find it somebody to shadow
- */
- dout(20) << "gid " << gid << " is standby and following nobody" << dendl;
-
- // standby for someone specific?
- if (info.standby_for_rank >= 0) {
- // The mds_info_t may or may not tell us exactly which filesystem
- // the standby_for_rank refers to: lookup via legacy_client_fscid
- mds_role_t target_role = {
- info.standby_for_fscid == FS_CLUSTER_ID_NONE ?
- fsmap.legacy_client_fscid : info.standby_for_fscid,
- info.standby_for_rank};
-
- // It is possible that the map contains a standby_for_fscid
- // that doesn't correspond to an existing filesystem, especially
- // if we loaded from a version with a bug (#17466)
- if (info.standby_for_fscid != FS_CLUSTER_ID_NONE
- && !fsmap.filesystem_exists(info.standby_for_fscid)) {
- derr << "gid " << gid << " has invalid standby_for_fscid "
- << info.standby_for_fscid << dendl;
- continue;
- }
+ // check beacon timestamps
+ std::vector<mds_gid_t> to_remove;
+ for (auto it = last_beacon.begin(); it != last_beacon.end(); ) {
+ auto& [gid, beacon_info] = *it;
+ auto since_last = chrono::duration<double>(now-beacon_info.stamp);
- // If we managed to resolve a full target role
- if (target_role.fscid != FS_CLUSTER_ID_NONE) {
- const auto &fs = fsmap.get_filesystem(target_role.fscid);
- if (fs->mds_map.is_followable(target_role.rank)) {
- do_propose |= try_standby_replay(fsmap, info, *fs,
- fs->mds_map.get_info(target_role.rank));
- }
- }
+ if (!fsmap.gid_exists(gid)) {
+ // gid no longer exists, remove from tracked beacons
+ it = last_beacon.erase(it);
+ continue;
+ }
- continue;
+ if (since_last.count() >= mds_beacon_grace) {
+ auto& info = fsmap.get_info_gid(gid);
+ dout(1) << "no beacon from mds." << info.rank << "." << info.inc
+ << " (gid: " << gid << " addr: " << info.addrs
+ << " state: " << ceph_mds_state_name(info.state) << ")"
+ << " since " << since_last.count() << dendl;
+ // If the OSDMap is writeable, we can blacklist things, so we can
+ // try failing any laggy MDS daemons. Consider each one for failure.
+ if (!info.laggy()) {
+ dout(1) << " marking " << gid << " " << info.addrs
+ << " mds." << info.rank << "." << info.inc
+ << " " << ceph_mds_state_name(info.state)
+ << " laggy" << dendl;
+ fsmap.modify_daemon(info.global_id, [](auto& info) {
+ info.laggy_since = ceph_clock_now();
+ });
+ do_propose = true;
}
+ if (osdmap_writeable && may_replace) {
+ to_remove.push_back(gid); // drop_mds may invalidate iterator
+ }
+ }
- // check everyone
- for (const auto &p : fsmap.filesystems) {
- if (info.standby_for_fscid != FS_CLUSTER_ID_NONE &&
- info.standby_for_fscid != p.first)
- continue;
-
- bool assigned = false;
- const auto &fs = p.second;
- const MDSMap &mds_map = fs->mds_map;
- for (const auto &mds_i : mds_map.mds_info) {
- const MDSMap::mds_info_t &cand_info = mds_i.second;
- if (cand_info.rank >= 0 && mds_map.is_followable(cand_info.rank)) {
- if ((info.standby_for_name.length() && info.standby_for_name != cand_info.name) ||
- info.standby_for_rank != MDS_RANK_NONE) {
- continue; // we're supposed to follow someone else
- }
+ ++it;
+ }
- if (try_standby_replay(fsmap, info, *fs, cand_info)) {
- assigned = true;
- break;
+ for (const auto& gid : to_remove) {
+ // Deliberate copy: info is still read for the cluster log after drop_mds().
+ auto info = fsmap.get_info_gid(gid);
+ const mds_info_t* rep_info = nullptr;
+ if (info.rank >= 0) {
+ auto fscid = fsmap.gid_fscid(gid);
+ rep_info = fsmap.find_replacement_for({fscid, info.rank});
+ }
+ bool dropped = drop_mds(fsmap, gid, rep_info, propose_osdmap);
+ if (dropped) {
+ mon->clog->info() << "MDS " << info.human_name()
+ << " is removed because it is dead or otherwise unavailable.";
+ do_propose = true;
+ }
+ }
+
+ if (osdmap_writeable) {
+ for (auto& [fscid, fs] : fsmap.filesystems) {
+ if (!fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE) &&
+ fs->mds_map.is_resizeable()) {
+ // Check if a rank or standby-replay should be replaced with a stronger
+ // affinity standby. This looks at ranks and standby-replay:
+ for (const auto& [gid, info] : fs->mds_map.get_mds_info()) {
+ const auto join_fscid = info.join_fscid;
+ if (join_fscid == fscid)
+ continue;
+ const auto rank = info.rank;
+ const auto state = info.state;
+ const mds_info_t* rep_info = nullptr;
+ if (state == MDSMap::STATE_STANDBY_REPLAY) {
+ rep_info = fsmap.get_available_standby(fscid);
+ } else if (state == MDSMap::STATE_ACTIVE) {
+ rep_info = fsmap.find_replacement_for({fscid, rank});
+ } else {
+ /* N.B. !is_degraded() */
+ ceph_abort_msg("invalid state in MDSMap");
+ }
+ if (!rep_info) {
+ break;
+ }
+ bool better_affinity = false;
+ if (join_fscid == FS_CLUSTER_ID_NONE) {
+ better_affinity = (rep_info->join_fscid == fscid);
+ } else {
+ better_affinity = (rep_info->join_fscid == fscid) ||
+ (rep_info->join_fscid == FS_CLUSTER_ID_NONE);
+ }
+ if (better_affinity) {
+ if (state == MDSMap::STATE_STANDBY_REPLAY) {
+ mon->clog->info() << "Dropping low affinity standby-replay "
+ << info.human_name()
+ << " in favor of higher affinity standby.";
+ *propose_osdmap |= fail_mds_gid(fsmap, gid);
+ /* Now let maybe_promote_standby do the promotion. */
+ } else {
+ mon->clog->info() << "Dropping low affinity active "
+ << info.human_name()
+ << " in favor of higher affinity standby.";
+ do_propose |= drop_mds(fsmap, gid, rep_info, propose_osdmap);
}
+ break; /* don't replace more than one per tick per fs */
}
}
- if (assigned) {
- do_propose = true;
- break;
- }
}
}
}
-
return do_propose;
}
-void MDSMonitor::tick()
/**
+ * Assign standbys to the failed ranks of a filesystem. Then, if the
+ * filesystem is not degraded and allows standby-replay, assign further
+ * available standbys as standby-replay followers of followable ranks.
+ *
+ * @param fsmap map to modify
+ * @param fs filesystem to examine; nothing is done when it is flagged
+ *        CEPH_MDSMAP_NOT_JOINABLE
+ * @return true if fsmap was modified
*/
+bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, Filesystem& fs)
{
- // make sure mds's are still alive
- // ...if i am an active leader
-
- if (!is_active() || !is_leader()) return;
-
- auto &pending = get_pending_fsmap_writeable();
+ if (fs.mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
+ return false;
+ }
bool do_propose = false;
- do_propose |= pending.check_health();
-
- // expand mds cluster (add new nodes to @in)?
- for (auto &p : pending.filesystems) {
- do_propose |= maybe_expand_cluster(pending, p.second->fscid);
- }
+ // have a standby take over?
+ set<mds_rank_t> failed;
+ fs.mds_map.get_failed_mds_set(failed);
+ for (const auto& rank : failed) {
+ auto info = fsmap.find_replacement_for({fs.fscid, rank});
+ if (info) {
+ dout(1) << " taking over failed mds." << rank << " with " << info->global_id
+ << "/" << info->name << " " << info->addrs << dendl;
+ mon->clog->info() << "Standby " << info->human_name()
+ << " assigned to filesystem " << fs.mds_map.fs_name
+ << " as rank " << rank;
- mono_time now = mono_clock::now();
- if (last_tick == decltype(last_tick)::min()) {
- last_tick = now;
+ fsmap.promote(info->global_id, fs, rank);
+ do_propose = true;
+ }
}
- chrono::duration<double> since_last = now-last_tick;
- if (since_last.count() >
- (g_conf->mds_beacon_grace - g_conf->mds_beacon_interval)) {
- // This case handles either local slowness (calls being delayed
- // for whatever reason) or cluster election slowness (a long gap
- // between calls while an election happened)
- dout(1) << __func__ << ": resetting beacon timeouts due to mon delay "
- "(slow election?) of " << now - last_tick << " seconds" << dendl;
- for (auto &p : last_beacon) {
- p.second.stamp = now;
+ if (!fs.mds_map.is_degraded() && fs.mds_map.allows_standby_replay()) {
+ // There were no failures to replace, so try using any available standbys
+ // as standby-replay daemons. Don't do this when the cluster is degraded
+ // as a standby-replay daemon may try to read a journal being migrated.
+ for (;;) { // one standby-replay assignment per iteration
+ auto info = fsmap.get_available_standby(fs.fscid);
+ if (!info) break; // no standby left to assign
+ dout(20) << "standby available mds." << info->global_id << dendl;
+ bool changed = false;
+ for (const auto& rank : fs.mds_map.in) {
+ dout(20) << "examining " << rank << dendl;
+ if (fs.mds_map.is_followable(rank)) {
+ dout(1) << " setting mds." << info->global_id
+ << " to follow mds rank " << rank << dendl;
+ fsmap.assign_standby_replay(info->global_id, fs.fscid, rank);
+ do_propose = true;
+ changed = true;
+ break; // this standby is used up; fetch the next one
+ }
+ }
+ if (!changed) break; // no rank in fs.mds_map.in was followable
}
}
- last_tick = now;
+ return do_propose;
+}
- // make sure last_beacon is fully populated
- for (auto &p : pending.mds_roles) {
- auto &gid = p.first;
- last_beacon.emplace(std::piecewise_construct,
- std::forward_as_tuple(gid),
- std::forward_as_tuple(mono_clock::now(), 0));
- }
/**
+ * Periodic maintenance of the pending FSMap. Returns immediately unless this
+ * monitor is active and is the leader.
+ *
+ * In order: runs the pending map's own check_health(), then
+ * MDSMonitor::check_health(), then maybe_resize_cluster() and
+ * maybe_promote_standby() for every filesystem. Afterwards it requests an
+ * OSD monitor proposal if propose_osdmap was set, proposes the pending map if
+ * any step reported a change, and records the time in last_tick.
*/
+void MDSMonitor::tick()
+{
+ if (!is_active() || !is_leader()) return;
+ auto &pending = get_pending_fsmap_writeable();
- // check beacon timestamps
+ bool do_propose = false;
bool propose_osdmap = false;
- bool osdmap_writeable = mon->osdmon()->is_writeable();
- for (auto it = last_beacon.begin(); it != last_beacon.end(); ) {
- mds_gid_t gid = it->first;
- auto beacon_info = it->second;
- chrono::duration<double> since_last = now-beacon_info.stamp;
- if (!pending.gid_exists(gid)) {
- // clean it out
- it = last_beacon.erase(it);
- continue;
- }
+ do_propose |= pending.check_health();
+ /* Check health and affinity of ranks */
+ do_propose |= check_health(pending, &propose_osdmap);
- if (since_last.count() >= g_conf->mds_beacon_grace) {
- auto &info = pending.get_info_gid(gid);
- dout(1) << "no beacon from mds." << info.rank << "." << info.inc
- << " (gid: " << gid << " addr: " << info.addr
- << " state: " << ceph_mds_state_name(info.state) << ")"
- << " since " << since_last.count() << "s" << dendl;
- // If the OSDMap is writeable, we can blacklist things, so we can
- // try failing any laggy MDS daemons. Consider each one for failure.
- if (osdmap_writeable) {
- maybe_replace_gid(pending, gid, info, &do_propose, &propose_osdmap);
- }
- }
+ /* Resize the cluster according to max_mds. */
+ for (auto& p : pending.filesystems) {
+ do_propose |= maybe_resize_cluster(pending, p.second->fscid);
+ }
- ++it;
+ /* Replace any failed ranks. */
+ for (auto& p : pending.filesystems) {
+ do_propose |= maybe_promote_standby(pending, *p.second);
}
+
if (propose_osdmap) {
request_proposal(mon->osdmon());
}
- for (auto &p : pending.filesystems) {
- auto &fs = p.second;
- if (!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
- do_propose |= maybe_promote_standby(pending, fs);
- }
- }
-
if (do_propose) {
propose_pending();
}
-}
-/**
- * finfo: the would-be follower
- * leader_fs: the Filesystem containing the would-be leader
- * ainfo: the would-be leader
- */
-bool MDSMonitor::try_standby_replay(
- FSMap &fsmap,
- const MDSMap::mds_info_t& finfo,
- const Filesystem &leader_fs,
- const MDSMap::mds_info_t& ainfo)
-{
- // someone else already following?
- if (leader_fs.has_standby_replay(ainfo.global_id)) {
- dout(20) << " mds." << ainfo.rank << " already has a follower" << dendl;
- return false;
- } else {
- // Assign the new role to the standby
- dout(10) << " setting to follow mds rank " << ainfo.rank << dendl;
- fsmap.assign_standby_replay(finfo.global_id, leader_fs.fscid, ainfo.rank);
- return true;
- }
+ last_tick = mono_clock::now(); // check_health() measures the gap since this stamp
}
MDSMonitor::MDSMonitor(Monitor *mn, Paxos *p, string service_name)