1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
17 #include <boost/utility.hpp>
19 #include "MDSMonitor.h"
20 #include "FSCommands.h"
22 #include "MonitorDBStore.h"
23 #include "OSDMonitor.h"
25 #include "common/strtol.h"
26 #include "common/perf_counters.h"
27 #include "common/config.h"
28 #include "common/cmdparse.h"
29 #include "messages/MMDSMap.h"
30 #include "messages/MFSMap.h"
31 #include "messages/MFSMapUser.h"
32 #include "messages/MMDSLoadTargets.h"
33 #include "messages/MMonCommand.h"
34 #include "messages/MGenericMessage.h"
36 #include "include/ceph_assert.h"
37 #include "include/str_list.h"
38 #include "include/stringify.h"
39 #include "mds/mdstypes.h"
42 using namespace TOPNSPC::common
;
50 using std::ostringstream
;
54 using std::string_view
;
55 using std::stringstream
;
59 using ceph::bufferlist
;
62 using ceph::ErasureCodeInterfaceRef
;
63 using ceph::ErasureCodeProfile
;
64 using ceph::Formatter
;
65 using ceph::JSONFormatter
;
66 using ceph::make_message
;
67 using ceph::mono_clock
;
68 using ceph::mono_time
;
70 #define dout_subsys ceph_subsys_mon
72 #define dout_prefix _prefix(_dout, mon, get_fsmap())
// Log-prefix helper behind the dout_prefix macro defined just above.
// Streams "mon.<name>@<rank>(<state name>).mds e<fsmap epoch> " into *_dout
// and returns that same stream so the caller can keep appending to it.
//   _dout  stream to write the prefix to (dereferenced unconditionally)
//   mon    monitor whose name, rank and state name are printed
//   fsmap  map whose get_epoch() value is printed
73 static ostream
& _prefix(std::ostream
*_dout
, Monitor
&mon
, const FSMap
& fsmap
) {
74 return *_dout
<< "mon." << mon
.name
<< "@" << mon
.rank
75 << "(" << mon
.get_state_name()
76 << ").mds e" << fsmap
.get_epoch() << " ";
// Store prefixes used by this service in addition to service_name.
// Both are registered in get_store_prefixes(); MDS_HEALTH_PREFIX is the
// prefix under which encode_pending() puts and erases per-daemon health
// records (keyed by the stringified daemon id).
79 static const string
MDS_METADATA_PREFIX("mds_metadata");
80 static const string
MDS_HEALTH_PREFIX("mds_health");
84 * Specialized implementation of cmd_getval to allow us to parse
85 * out strongly-typedef'd types
// cmd_getval specializations for MDS-specific types. Each one forwards to
// cmd_getval for int64_t by viewing the caller's variable through an
// int64_t reference, and returns whatever that call returns.
// NOTE(review): the reference cast is only sound if the target object is at
// least as large as int64_t and layout-compatible with it; confirm this for
// all three types, in particular the MDSMap::DaemonState enum.
87 namespace TOPNSPC::common
{
// mds_gid_t variant.
88 template<> bool cmd_getval(const cmdmap_t
& cmdmap
,
89 std::string_view k
, mds_gid_t
&val
)
91 return cmd_getval(cmdmap
, k
, (int64_t&)val
);
// mds_rank_t variant.
94 template<> bool cmd_getval(const cmdmap_t
& cmdmap
,
95 std::string_view k
, mds_rank_t
&val
)
97 return cmd_getval(cmdmap
, k
, (int64_t&)val
);
// MDSMap::DaemonState variant.
100 template<> bool cmd_getval(const cmdmap_t
& cmdmap
,
101 std::string_view k
, MDSMap::DaemonState
&val
)
103 return cmd_getval(cmdmap
, k
, (int64_t&)val
);
// Debug-print helper for an FSMap. The output is gated by dout(dblV);
// callers in this file invoke it as print_map<0> and print_map<30>, so dblV
// is presumably a template parameter giving the debug level (TODO confirm
// against the header). The visible statement emits the "print_map" heading.
109 void MDSMonitor::print_map(const FSMap
& m
)
111 dout(dblV
) << "print_map\n";
// Initial-state hook for this service. The visible body only emits a
// level-10 debug line naming the function.
117 void MDSMonitor::create_initial()
119 dout(10) << "create_initial" << dendl
;
// Adds the store prefixes this service uses to the caller's set:
// service_name, MDS_METADATA_PREFIX and MDS_HEALTH_PREFIX.
//   s  output set; existing entries are kept, the three prefixes are inserted
122 void MDSMonitor::get_store_prefixes(std::set
<string
>& s
) const
124 s
.insert(service_name
);
125 s
.insert(MDS_METADATA_PREFIX
);
126 s
.insert(MDS_HEALTH_PREFIX
);
// Load the most recently committed FSMap.
// Reads get_last_committed(), compares it with the current FSMap epoch
// (presumably returning early when they are equal - the branch body is not
// visible here), asserts that the committed version is newer, fetches that
// version's encoded map and decodes it through PaxosFSMap::decode().
// The new map is then printed at debug level 0 and, unless
// mon_mds_skip_sanity is set, sanity-checked.
//   need_bootstrap  not referenced in the visible code
129 void MDSMonitor::update_from_paxos(bool *need_bootstrap
)
131 version_t version
= get_last_committed();
132 if (version
== get_fsmap().epoch
)
135 dout(10) << __func__
<< " version " << version
136 << ", my e " << get_fsmap().epoch
<< dendl
;
137 ceph_assert(version
> get_fsmap().epoch
);
// Read the encoded map for this version; a failed or empty read is fatal.
144 int err
= get_version(version
, fsmap_bl
);
145 ceph_assert(err
== 0);
147 ceph_assert(fsmap_bl
.length() > 0);
148 dout(10) << __func__
<< " got " << version
<< dendl
;
150 PaxosFSMap::decode(fsmap_bl
);
// A malformed buffer is reported through derr.
151 } catch (const ceph::buffer::malformed_input
& e
) {
152 derr
<< "unable to decode FSMap: " << e
.what() << dendl
;
157 dout(0) << "new map" << dendl
;
158 print_map
<0>(get_fsmap());
159 if (!g_conf()->mon_mds_skip_sanity
) {
160 get_fsmap().sanity();
// Service initialisation: loads stored metadata into pending_metadata.
// The result of load_metadata() is explicitly discarded via the (void) cast.
166 void MDSMonitor::init()
168 (void)load_metadata(pending_metadata
);
// Start a new pending FSMap via PaxosFSMap::create_pending().
// When the OSD monitor is readable, the pending map is sanitized with a
// predicate that reports whether a pool id still exists in the current
// OSDMap (osdmap.have_pg_pool). Finally logs the pending epoch at level 10.
171 void MDSMonitor::create_pending()
173 auto &fsmap
= PaxosFSMap::create_pending();
175 if (mon
.osdmon()->is_readable()) {
176 const auto &osdmap
= mon
.osdmon()->osdmap
;
177 fsmap
.sanitize([&osdmap
](int64_t pool
){return osdmap
.have_pg_pool(pool
);});
180 dout(10) << "create_pending e" << fsmap
.epoch
<< dendl
;
// Serialize the pending FSMap and the related health state into transaction t.
// Visible steps, in order:
//  - log the pending epoch, print the map at level 30, and run
//    pending.sanity(true) unless mon_mds_skip_sanity is set;
//  - stamp mds_map.modified with the current time on every filesystem whose
//    mds_map.epoch equals the pending epoch;
//  - assert the pending epoch is last_committed + 1, encode the map with the
//    quorum's connection features, and store it with put_version() and
//    put_last_committed();
//  - write each pending_daemon_health entry under MDS_HEALTH_PREFIX, erase
//    the keys listed in pending_daemon_health_rm, then clear that set;
//  - call remove_from_metadata();
//  - build a health_check_map_t from the per-daemon health metrics, let the
//    pending map add its own checks, substitute the placeholders in each
//    summary, and hand the result to encode_health().
//   t  transaction that receives all puts and erases
183 void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
185 auto &pending
= get_pending_fsmap_writeable();
186 auto &epoch
= pending
.epoch
;
188 dout(10) << "encode_pending e" << epoch
<< dendl
;
190 // print map iff 'debug mon = 30' or higher
191 print_map
<30>(pending
);
192 if (!g_conf()->mon_mds_skip_sanity
) {
193 pending
.sanity(true);
196 // Set 'modified' on maps modified this epoch
197 for (auto &p
: pending
.filesystems
) {
198 if (p
.second
->mds_map
.epoch
== epoch
) {
199 p
.second
->mds_map
.modified
= ceph_clock_now();
204 ceph_assert(get_last_committed() + 1 == pending
.epoch
);
205 bufferlist pending_bl
;
206 pending
.encode(pending_bl
, mon
.get_quorum_con_features());
208 /* put everything in the transaction */
209 put_version(t
, pending
.epoch
, pending_bl
);
210 put_last_committed(t
, pending
.epoch
);
212 // Encode MDSHealth data
213 for (std::map
<uint64_t, MDSHealth
>::iterator i
= pending_daemon_health
.begin();
214 i
!= pending_daemon_health
.end(); ++i
) {
216 i
->second
.encode(bl
);
217 t
->put(MDS_HEALTH_PREFIX
, stringify(i
->first
), bl
);
// Drop stored health records for daemons queued for removal.
220 for (std::set
<uint64_t>::iterator i
= pending_daemon_health_rm
.begin();
221 i
!= pending_daemon_health_rm
.end(); ++i
) {
222 t
->erase(MDS_HEALTH_PREFIX
, stringify(*i
));
224 pending_daemon_health_rm
.clear();
225 remove_from_metadata(pending
, t
);
// Build cluster health checks from each daemon's reported metrics.
228 health_check_map_t new_checks
;
229 const auto &info_map
= pending
.get_mds_info();
230 for (const auto &i
: info_map
) {
231 const auto &gid
= i
.first
;
232 const auto &info
= i
.second
;
// NOTE(review): pending_daemon_health_rm was cleared a few statements above,
// so this count() looks like it can never be non-zero here unless
// remove_from_metadata() repopulates the set - confirm.
233 if (pending_daemon_health_rm
.count(gid
)) {
// Use the in-memory health for this gid when there is one; the store read
// below is presumably the fallback for daemons with no pending entry.
237 auto p
= pending_daemon_health
.find(gid
);
238 if (p
!= pending_daemon_health
.end()) {
242 mon
.store
->get(MDS_HEALTH_PREFIX
, stringify(gid
), bl
);
244 derr
<< "Missing health data for MDS " << gid
<< dendl
;
247 auto bl_i
= bl
.cbegin();
250 for (const auto &metric
: health
.metrics
) {
251 if (metric
.type
== MDS_HEALTH_DUMMY
) {
254 const auto rank
= info
.rank
;
255 health_check_t
*check
= &new_checks
.get_or_add(
256 mds_metric_name(metric
.type
),
258 mds_metric_summary(metric
.type
),
// One detail line per metric: daemon name, rank and the metric message,
// followed by the metric's metadata as "key: value" pairs.
261 ss
<< "mds." << info
.name
<< "(mds." << rank
<< "): " << metric
.message
;
263 for (auto &p
: metric
.metadata
) {
269 ss
<< p
.first
<< ": " << p
.second
;
272 check
->detail
.push_back(ss
.str());
275 pending
.get_health_checks(&new_checks
);
// Fill in the summary placeholders; singular or plural wording is chosen by
// whether the check has more than one detail line.
276 for (auto& p
: new_checks
.checks
) {
277 p
.second
.summary
= std::regex_replace(
280 stringify(p
.second
.detail
.size()));
281 p
.second
.summary
= std::regex_replace(
283 std::regex("%plurals%"),
284 p
.second
.detail
.size() > 1 ? "s" : "");
285 p
.second
.summary
= std::regex_replace(
287 std::regex("%isorare%"),
288 p
.second
.detail
.size() > 1 ? "are" : "is");
289 p
.second
.summary
= std::regex_replace(
291 std::regex("%hasorhave%"),
292 p
.second
.detail
.size() > 1 ? "have" : "has");
294 encode_health(new_checks
, t
);
// Compute the version up to which old FSMap epochs may be trimmed.
// mon_mds_force_trim_to is used as the floor when it is positive and not
// beyond the last committed version. Separately, when more than
// mon_max_mdsmap_epochs versions are retained and the floor is below
// (last - max), a further adjustment is made (its body is not visible here;
// presumably it raises the floor to last - max). The resulting floor is
// logged at level 20 and presumably returned.
// NOTE(review): get_last_committed() is narrowed to int for the comparison;
// confirm that is intended.
297 version_t
MDSMonitor::get_trim_to() const
300 if (g_conf()->mon_mds_force_trim_to
> 0 &&
301 g_conf()->mon_mds_force_trim_to
<= (int)get_last_committed()) {
302 floor
= g_conf()->mon_mds_force_trim_to
;
303 dout(10) << __func__
<< " explicit mon_mds_force_trim_to = "
307 unsigned max
= g_conf()->mon_max_mdsmap_epochs
;
308 version_t last
= get_last_committed();
310 if (last
- get_first_committed() > max
&& floor
< last
- max
) {
314 dout(20) << __func__
<< " = " << floor
<< dendl
;
// First-stage dispatcher for incoming requests.
// Marks the op, logs the message with its source and addresses, then
// switches on the message type:
//  - the first visible branch calls preprocess_beacon() (its case label is
//    not visible here);
//  - MSG_MON_COMMAND    -> preprocess_command(); a bad_cmd_get exception is
//    answered with reply_command(-EINVAL, e.what());
//  - MSG_MDS_OFFLOAD_TARGETS -> preprocess_offload_targets().
// Returns the handler's result for the branches visible here.
318 bool MDSMonitor::preprocess_query(MonOpRequestRef op
)
320 op
->mark_mdsmon_event(__func__
);
321 auto m
= op
->get_req
<PaxosServiceMessage
>();
322 dout(10) << "preprocess_query " << *m
<< " from " << m
->get_orig_source()
323 << " " << m
->get_orig_source_addrs() << dendl
;
325 switch (m
->get_type()) {
328 return preprocess_beacon(op
);
330 case MSG_MON_COMMAND
:
332 return preprocess_command(op
);
333 } catch (const bad_cmd_get
& e
) {
335 mon
.reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
339 case MSG_MDS_OFFLOAD_TARGETS
:
340 return preprocess_offload_targets(op
);
// Record that a beacon was just received from the sending daemon.
// Looks up (creating if absent, via operator[]) the last_beacon entry for
// the message's global id and sets its stamp to the current monotonic time.
// The beacon's sequence number is read into seq; its use is not visible here.
//   m  beacon message; dereferenced unconditionally
348 void MDSMonitor::_note_beacon(MMDSBeacon
*m
)
350 mds_gid_t gid
= mds_gid_t(m
->get_global_id());
351 version_t seq
= m
->get_seq();
353 dout(5) << "_note_beacon " << *m
<< " noting time" << dendl
;
354 auto &beacon
= last_beacon
[gid
];
355 beacon
.stamp
= mono_clock::now();
// Read-only stage of MDS beacon handling, evaluated against the committed
// FSMap. Per the comments in the body, returning false means the beacon
// needs the update path (e.g. "not booted yet", "no longer laggy, need to
// update map", health metrics changed).
// Visible checks, in order:
//  - the session must hold the "mds" MON_CAP_X capability;
//  - the beacon's fsid must match the monitor map's fsid;
//  - the source address must carry a port;
//  - if the gid is unknown to the committed map and the state is not
//    STATE_BOOT, a null MDSMap is sent back as the reply;
//  - stale sequence numbers are logged as ignored;
//  - the epoch the daemon should have seen is derived (standby_epochs for
//    daemons without a filesystem, otherwise the filesystem's mds_map.epoch)
//    and compared with the epoch reported in the beacon;
//  - changes of join_fscid, state or health metrics are detected.
// When nothing needs updating, a beacon reply carrying effective_epoch is
// sent back.
359 bool MDSMonitor::preprocess_beacon(MonOpRequestRef op
)
361 op
->mark_mdsmon_event(__func__
);
362 auto m
= op
->get_req
<MMDSBeacon
>();
363 MDSMap::DaemonState state
= m
->get_state();
364 mds_gid_t gid
= m
->get_global_id();
365 version_t seq
= m
->get_seq();
366 MDSMap::mds_info_t info
;
367 epoch_t effective_epoch
= 0;
369 const auto &fsmap
= get_fsmap();
371 // check privileges, ignore if fails
372 MonSession
*session
= op
->get_session();
375 if (!session
->is_capable("mds", MON_CAP_X
)) {
376 dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
377 << session
->caps
<< dendl
;
381 if (m
->get_fsid() != mon
.monmap
->fsid
) {
382 dout(0) << "preprocess_beacon on fsid " << m
->get_fsid() << " != " << mon
.monmap
->fsid
<< dendl
;
386 dout(5) << "preprocess_beacon " << *m
387 << " from " << m
->get_orig_source()
388 << " " << m
->get_orig_source_addrs()
389 << " " << m
->get_compat()
392 // make sure the address has a port
393 if (m
->get_orig_source_addr().get_port() == 0) {
394 dout(1) << " ignoring boot message without a port" << dendl
;
402 // booted, but not in map?
403 if (!fsmap
.gid_exists(gid
)) {
404 if (state
!= MDSMap::STATE_BOOT
) {
405 dout(7) << "mds_beacon " << *m
<< " is not in fsmap (state "
406 << ceph_mds_state_name(state
) << ")" << dendl
;
408 /* We can't send an MDSMap this MDS was a part of because we no longer
409 * know which FS it was part of. Nor does this matter. Sending an empty
410 * MDSMap is sufficient for getting the MDS to respawn.
412 auto m
= make_message
<MMDSMap
>(mon
.monmap
->fsid
, MDSMap::create_null_mdsmap());
413 mon
.send_reply(op
, m
.detach());
416 /* check if we've already recorded its entry in pending */
417 const auto& pending
= get_pending_fsmap();
418 if (pending
.gid_exists(gid
)) {
419 /* MDS is already booted. */
422 return false; // not booted yet.
426 dout(10) << __func__
<< ": GID exists in map: " << gid
<< dendl
;
427 info
= fsmap
.get_info_gid(gid
);
429 if (state
== MDSMap::STATE_DNE
) {
434 if (info
.state_seq
> seq
) {
435 dout(7) << "mds_beacon " << *m
<< " has old seq, ignoring" << dendl
;
439 // Work out the latest epoch that this daemon should have seen
441 fs_cluster_id_t fscid
= fsmap
.mds_roles
.at(gid
);
442 if (fscid
== FS_CLUSTER_ID_NONE
) {
443 effective_epoch
= fsmap
.standby_epochs
.at(gid
);
445 effective_epoch
= fsmap
.get_filesystem(fscid
)->mds_map
.epoch
;
447 if (effective_epoch
!= m
->get_last_epoch_seen()) {
448 dout(10) << "mds_beacon " << *m
449 << " ignoring requested state, because mds hasn't seen latest map" << dendl
;
456 return false; // no longer laggy, need to update map.
458 if (state
== MDSMap::STATE_BOOT
) {
459 // ignore, already booted.
463 // did the join_fscid change
464 if (m
->get_fs().size()) {
465 fs_cluster_id_t fscid
= FS_CLUSTER_ID_NONE
;
466 auto f
= fsmap
.get_filesystem(m
->get_fs());
470 if (info
.join_fscid
!= fscid
) {
471 dout(10) << __func__
<< " standby mds_join_fs changed to " << fscid
472 << " (" << m
->get_fs() << ")" << dendl
;
477 if (info
.join_fscid
!= FS_CLUSTER_ID_NONE
) {
478 dout(10) << __func__
<< " standby mds_join_fs was cleared" << dendl
;
484 // is there a state change here?
485 if (info
.state
!= state
) {
490 // Comparing known daemon health with m->get_health()
491 // and return false (i.e. require proposal) if they
492 // do not match, to update our stored
// NOTE(review): operator[] on pending_daemon_health inserts a default entry
// for a gid that has none, even though this is the read-only stage - confirm
// that side effect is acceptable.
493 if (!(pending_daemon_health
[gid
] == m
->get_health())) {
494 dout(10) << __func__
<< " health metrics for gid " << gid
<< " were updated" << dendl
;
500 // note time and reply
501 ceph_assert(effective_epoch
> 0);
504 auto beacon
= make_message
<MMDSBeacon
>(mon
.monmap
->fsid
,
505 m
->get_global_id(), m
->get_name(), effective_epoch
,
506 state
, seq
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
507 mon
.send_reply(op
, beacon
.detach());
512 // I won't reply this beacon, drop it.
// Read-only stage for MMDSLoadTargets messages.
// Requires the session to hold the "mds" MON_CAP_X capability (otherwise the
// failure is logged at level 0). Then checks whether the sender's gid exists
// in the committed FSMap and its reported targets already equal the stored
// export_targets; the outcome of that test is not visible here, but
// presumably it means nothing needs to be proposed.
517 bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op
)
519 op
->mark_mdsmon_event(__func__
);
520 auto m
= op
->get_req
<MMDSLoadTargets
>();
521 dout(10) << "preprocess_offload_targets " << *m
<< " from " << m
->get_orig_source() << dendl
;
523 const auto &fsmap
= get_fsmap();
525 // check privileges, ignore message if fails
526 MonSession
*session
= op
->get_session();
529 if (!session
->is_capable("mds", MON_CAP_X
)) {
530 dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
531 << session
->caps
<< dendl
;
535 if (fsmap
.gid_exists(m
->global_id
) &&
536 m
->targets
== fsmap
.get_info_gid(m
->global_id
).export_targets
)
// Second-stage dispatcher: routes a request that needs a map change to the
// matching prepare_* handler based on the message type.
//  - the first visible branch calls prepare_beacon() (its case label is not
//    visible here);
//  - MSG_MON_COMMAND    -> prepare_command(); a bad_cmd_get exception is
//    answered with reply_command(-EINVAL, e.what()) and false is returned;
//  - MSG_MDS_OFFLOAD_TARGETS -> prepare_offload_targets().
// Per the inline comments, a false return means there is nothing to propose.
547 bool MDSMonitor::prepare_update(MonOpRequestRef op
)
549 op
->mark_mdsmon_event(__func__
);
550 auto m
= op
->get_req
<PaxosServiceMessage
>();
551 dout(7) << "prepare_update " << *m
<< dendl
;
553 switch (m
->get_type()) {
556 return prepare_beacon(op
);
558 case MSG_MON_COMMAND
:
560 return prepare_command(op
);
561 } catch (const bad_cmd_get
& e
) {
563 mon
.reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
564 return false; /* nothing to propose */
567 case MSG_MDS_OFFLOAD_TARGETS
:
568 return prepare_offload_targets(op
);
574 return false; /* nothing to propose! */
// Update stage of MDS beacon handling; works on the writeable pending FSMap.
// Visible steps:
//  - diff the daemon's old and new health metrics by type, logging newly
//    appearing ones and reporting cleared ones to the cluster log, then store
//    the new health in pending_daemon_health[gid];
//  - STATE_BOOT: when mds_enforce_unique_name is set, fail any existing
//    daemon with the same name (waiting for a writeable OSDMap first), then
//    insert the new daemon as STATE_STANDBY, start its beacon timer and
//    update its metadata;
//  - any other state: the gid must still exist in pending (otherwise a null
//    map is sent); a changed compat set fails the daemon; illegal state
//    changes reported by standby or ranked daemons are rejected and logged;
//    the laggy flag and join_fscid are updated; STATE_STOPPED removes the
//    daemon, STATE_DAMAGED blocklists the rank holder and marks the rank
//    damaged; otherwise the reported state and seq are recorded.
//  - replies are deferred with wait_for_finished_proposal(); the callbacks
//    call _updated() or re-dispatch the op.
// Several paths first require mon.osdmon() to be writeable and otherwise
// queue a C_RetryMessage via wait_for_writeable().
577 bool MDSMonitor::prepare_beacon(MonOpRequestRef op
)
579 op
->mark_mdsmon_event(__func__
);
580 auto m
= op
->get_req
<MMDSBeacon
>();
581 // -- this is an update --
582 dout(12) << "prepare_beacon " << *m
<< " from " << m
->get_orig_source()
583 << " " << m
->get_orig_source_addrs() << dendl
;
584 entity_addrvec_t addrs
= m
->get_orig_source_addrs();
585 mds_gid_t gid
= m
->get_global_id();
586 MDSMap::DaemonState state
= m
->get_state();
587 version_t seq
= m
->get_seq();
589 auto &pending
= get_pending_fsmap_writeable();
591 dout(15) << __func__
<< " got health from gid " << gid
<< " with " << m
->get_health().metrics
.size() << " metrics." << dendl
;
593 // Calculate deltas of health metrics created and removed
594 // Do this by type rather than MDSHealthMetric equality, because messages can
595 // change a lot when they include e.g. a number of items.
596 const auto &old_health
= pending_daemon_health
[gid
].metrics
;
597 const auto &new_health
= m
->get_health().metrics
;
599 std::set
<mds_metric_t
> old_types
;
600 for (const auto &i
: old_health
) {
601 old_types
.insert(i
.type
);
604 std::set
<mds_metric_t
> new_types
;
605 for (const auto &i
: new_health
) {
606 if (i
.type
== MDS_HEALTH_DUMMY
) {
609 new_types
.insert(i
.type
);
// Log metric types that were not present in the previous report.
612 for (const auto &new_metric
: new_health
) {
613 if (new_metric
.type
== MDS_HEALTH_DUMMY
) {
616 if (old_types
.count(new_metric
.type
) == 0) {
617 dout(10) << "MDS health message (" << m
->get_orig_source()
618 << "): " << new_metric
.sev
<< " " << new_metric
.message
<< dendl
;
622 // Log the disappearance of health messages at INFO
623 for (const auto &old_metric
: old_health
) {
624 if (new_types
.count(old_metric
.type
) == 0) {
625 mon
.clog
->info() << "MDS health message cleared ("
626 << m
->get_orig_source() << "): " << old_metric
.message
;
631 pending_daemon_health
[gid
] = m
->get_health();
633 const auto& cs
= m
->get_compat();
634 if (state
== MDSMap::STATE_BOOT
) {
635 // zap previous instance of this name?
636 if (g_conf()->mds_enforce_unique_name
) {
637 bool failed_mds
= false;
638 while (mds_gid_t existing
= pending
.find_mds_gid_by_name(m
->get_name())) {
639 if (!mon
.osdmon()->is_writeable()) {
640 mon
.osdmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
643 const auto& existing_info
= pending
.get_info_gid(existing
);
644 mon
.clog
->info() << existing_info
.human_name() << " restarted";
645 fail_mds_gid(pending
, existing
);
649 ceph_assert(mon
.osdmon()->is_writeable());
650 request_proposal(mon
.osdmon());
654 // Add this daemon to the map
655 if (pending
.mds_roles
.count(gid
) == 0) {
656 MDSMap::mds_info_t new_info
;
657 new_info
.global_id
= gid
;
658 new_info
.name
= m
->get_name();
659 new_info
.addrs
= addrs
;
660 new_info
.mds_features
= m
->get_mds_features();
661 new_info
.state
= MDSMap::STATE_STANDBY
;
662 new_info
.state_seq
= seq
;
663 new_info
.compat
= cs
;
664 if (m
->get_fs().size()) {
665 fs_cluster_id_t fscid
= FS_CLUSTER_ID_NONE
;
666 auto f
= pending
.get_filesystem(m
->get_fs());
670 new_info
.join_fscid
= fscid
;
672 pending
.insert(new_info
);
675 // initialize the beacon timer
676 auto &beacon
= last_beacon
[gid
];
677 beacon
.stamp
= mono_clock::now();
680 update_metadata(m
->get_global_id(), m
->get_sys_info());
684 if (!pending
.gid_exists(gid
)) {
685 /* gid has been removed from pending, send null map */
686 dout(5) << "mds_beacon " << *m
<< " is not in fsmap (state "
687 << ceph_mds_state_name(state
) << ")" << dendl
;
689 /* We can't send an MDSMap this MDS was a part of because we no longer
690 * know which FS it was part of. Nor does this matter. Sending an empty
691 * MDSMap is sufficient for getting the MDS to respawn.
696 const auto& info
= pending
.get_info_gid(gid
);
698 // did the reported compat change? That's illegal!
699 if (cs
.compare(info
.compat
) != 0) {
700 if (!mon
.osdmon()->is_writeable()) {
701 mon
.osdmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
704 mon
.clog
->warn() << info
.human_name() << " compat changed unexpectedly";
705 fail_mds_gid(pending
, gid
);
706 request_proposal(mon
.osdmon());
710 if (state
== MDSMap::STATE_DNE
) {
711 dout(1) << __func__
<< ": DNE from " << info
<< dendl
;
715 // legal state change?
716 if ((info
.state
== MDSMap::STATE_STANDBY
&& state
!= info
.state
) ||
717 (info
.state
== MDSMap::STATE_STANDBY_REPLAY
&& state
!= info
.state
&& state
!= MDSMap::STATE_DAMAGED
)) {
718 // Standby daemons should never modify their own state.
719 // Except that standby-replay can indicate the rank is damaged due to failure to replay.
720 // Reject any attempts to do so.
721 derr
<< "standby " << gid
<< " attempted to change state to "
722 << ceph_mds_state_name(state
) << ", rejecting" << dendl
;
724 } else if (info
.state
!= MDSMap::STATE_STANDBY
&& state
!= info
.state
&&
725 !MDSMap::state_transition_valid(info
.state
, state
)) {
726 // Validate state transitions for daemons that hold a rank
727 derr
<< "daemon " << gid
<< " (rank " << info
.rank
<< ") "
728 << "reported invalid state transition "
729 << ceph_mds_state_name(info
.state
) << " -> "
730 << ceph_mds_state_name(state
) << dendl
;
735 dout(1) << "prepare_beacon clearing laggy flag on " << addrs
<< dendl
;
736 pending
.modify_daemon(info
.global_id
, [](auto& info
)
743 dout(5) << "prepare_beacon mds." << info
.rank
744 << " " << ceph_mds_state_name(info
.state
)
745 << " -> " << ceph_mds_state_name(state
)
748 fs_cluster_id_t fscid
= FS_CLUSTER_ID_NONE
;
749 if (m
->get_fs().size()) {
750 auto f
= pending
.get_filesystem(m
->get_fs());
755 pending
.modify_daemon(gid
, [fscid
](auto& info
) {
756 info
.join_fscid
= fscid
;
759 if (state
== MDSMap::STATE_STOPPED
) {
760 const auto fscid
= pending
.mds_roles
.at(gid
);
761 const auto &fs
= pending
.get_filesystem(fscid
);
763 mon
.clog
->info() << info
.human_name() << " finished "
764 << "stopping rank " << info
.rank
<< " in filesystem "
765 << fs
->mds_map
.fs_name
<< " (now has "
766 << fs
->mds_map
.get_num_in_mds() - 1 << " ranks)";
768 auto erased
= pending
.stop(gid
);
769 erased
.push_back(gid
);
771 for (const auto& erased_gid
: erased
) {
772 last_beacon
.erase(erased_gid
);
773 if (pending_daemon_health
.count(erased_gid
)) {
774 pending_daemon_health
.erase(erased_gid
);
775 pending_daemon_health_rm
.insert(erased_gid
);
778 } else if (state
== MDSMap::STATE_DAMAGED
) {
779 if (!mon
.osdmon()->is_writeable()) {
780 dout(1) << __func__
<< ": DAMAGED from rank " << info
.rank
781 << " waiting for osdmon writeable to blocklist it" << dendl
;
782 mon
.osdmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
786 auto rank
= info
.rank
;
788 // Record this MDS rank as damaged, so that other daemons
789 // won't try to run it.
790 dout(0) << __func__
<< ": marking rank " << rank
<< " damaged" << dendl
;
792 auto fs
= pending
.get_filesystem(gid
);
793 auto rankgid
= fs
->mds_map
.get_gid(rank
);
794 auto rankinfo
= pending
.get_info_gid(rankgid
);
795 auto followergid
= fs
->mds_map
.get_standby_replay(rank
);
797 ceph_assert(gid
== rankgid
|| gid
== followergid
);
799 utime_t until
= ceph_clock_now();
800 until
+= g_conf().get_val
<double>("mon_mds_blocklist_interval");
801 const auto blocklist_epoch
= mon
.osdmon()->blocklist(rankinfo
.addrs
, until
);
802 if (followergid
!= MDS_GID_NONE
) {
803 fail_mds_gid(pending
, followergid
);
804 last_beacon
.erase(followergid
);
806 request_proposal(mon
.osdmon());
807 pending
.damaged(rankgid
, blocklist_epoch
);
808 last_beacon
.erase(rankgid
);
810 /* MDS expects beacon reply back */
812 if (info
.state
!= MDSMap::STATE_ACTIVE
&& state
== MDSMap::STATE_ACTIVE
) {
813 const auto &fscid
= pending
.mds_roles
.at(gid
);
814 const auto &fs
= pending
.get_filesystem(fscid
);
815 mon
.clog
->info() << info
.human_name() << " is now active in "
816 << "filesystem " << fs
->mds_map
.fs_name
<< " as rank "
820 // Made it through special cases and validations, record the
821 // daemon's reported state to the FSMap.
822 pending
.modify_daemon(gid
, [state
, seq
](auto& info
) {
824 info
.state_seq
= seq
;
829 dout(5) << "prepare_beacon pending map now:" << dendl
;
832 wait_for_finished_proposal(op
, new LambdaContext([op
, this](int r
){
834 _updated(op
); // success
835 else if (r
== -ECANCELED
) {
838 dispatch(op
); // try again
845 if (!mon
.osdmon()->is_writeable()) {
846 dout(1) << __func__
<< ": waiting for writeable OSDMap to evict" << dendl
;
847 mon
.osdmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
851 fail_mds_gid(pending
, gid
);
852 request_proposal(mon
.osdmon());
853 dout(5) << __func__
<< ": pending map now:" << dendl
;
859 wait_for_finished_proposal(op
, new LambdaContext([op
, this](int r
){
861 auto m
= make_message
<MMDSMap
>(mon
.monmap
->fsid
, MDSMap::create_null_mdsmap());
862 mon
.send_reply(op
, m
.detach());
864 dispatch(op
); // try again
// Apply an MMDSLoadTargets message to the pending FSMap.
// If the sending gid holds a rank in the pending map, its export targets are
// replaced with the ones from the message; otherwise a "not in map" line is
// logged. The local flag "propose" starts as false; it is presumably set on
// the update path and returned (neither is visible here).
871 bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op
)
873 auto &pending
= get_pending_fsmap_writeable();
874 bool propose
= false;
876 op
->mark_mdsmon_event(__func__
);
877 auto m
= op
->get_req
<MMDSLoadTargets
>();
878 mds_gid_t gid
= m
->global_id
;
879 if (pending
.gid_has_rank(gid
)) {
880 dout(10) << "prepare_offload_targets " << gid
<< " " << m
->targets
<< dendl
;
881 pending
.update_export_targets(gid
, m
->targets
);
884 dout(10) << "prepare_offload_targets " << gid
<< " not in map" << dendl
;
// Decide whether a proposal should be made now.
// Pure pass-through: forwards delay to PaxosService::should_propose() and
// returns its result.
//   delay  in/out value handed to the base-class implementation by reference
890 bool MDSMonitor::should_propose(double& delay
)
892 // delegate to PaxosService to assess whether we should propose
893 return PaxosService::should_propose(delay
);
// Completion step for a beacon whose map change has been committed
// (prepare_beacon calls this from its proposal-finished callback, marked
// "success" there).
// Logs the beacon's source, addresses and state, then replies:
//  - for STATE_STOPPED, with a null MDSMap, since the daemon is no longer in
//    the map;
//  - otherwise (the else keyword is not visible here) with a beacon carrying
//    the current FSMap epoch and the state and seq taken from the request.
896 void MDSMonitor::_updated(MonOpRequestRef op
)
898 const auto &fsmap
= get_fsmap();
899 op
->mark_mdsmon_event(__func__
);
900 auto m
= op
->get_req
<MMDSBeacon
>();
901 dout(10) << "_updated " << m
->get_orig_source() << " " << *m
<< dendl
;
902 mon
.clog
->debug() << m
->get_orig_source() << " "
903 << m
->get_orig_source_addrs() << " "
904 << ceph_mds_state_name(m
->get_state());
906 if (m
->get_state() == MDSMap::STATE_STOPPED
) {
907 // send the map manually (they're out of the map, so they won't get it automatically)
// Note: this inner m (the MMDSMap reply) shadows the request m in this scope.
908 auto m
= make_message
<MMDSMap
>(mon
.monmap
->fsid
, MDSMap::create_null_mdsmap());
909 mon
.send_reply(op
, m
.detach());
911 auto beacon
= make_message
<MMDSBeacon
>(mon
.monmap
->fsid
,
912 m
->get_global_id(), m
->get_name(), fsmap
.get_epoch(),
913 m
->get_state(), m
->get_seq(), CEPH_FEATURES_SUPPORTED_DEFAULT
);
914 mon
.send_reply(op
, beacon
.detach());
// Hook invoked when this service becomes active. The visible statement
// writes the current FSMap to the cluster log at debug level; any condition
// guarding it is not visible here.
918 void MDSMonitor::on_active()
923 mon
.clog
->debug() << "fsmap " << get_fsmap();
// Dump service state into formatter f: opens an "fsmap" object section and
// emits the first and last committed versions as "mdsmap_first_committed"
// and "mdsmap_last_committed".
//   f  formatter to write to; dereferenced unconditionally
927 void MDSMonitor::dump_info(Formatter
*f
)
929 f
->open_object_section("fsmap");
933 f
->dump_unsigned("mdsmap_first_committed", get_first_committed());
934 f
->dump_unsigned("mdsmap_last_committed", get_last_committed());
// Handle read-only monitor commands for the MDS/FS subsystem.
// Parses the JSON command into cmdmap (replying -EINVAL with the parser's
// message on failure), reads "prefix" and "format" (default "plain"), and
// replies -EACCES "access denied" on a session check whose condition is not
// visible here. All queries run against a copy of the FSMap filtered by the
// file system names the session is allowed to see.
// Prefixes handled in the visible code:
//   "mds stat", "mds ok-to-stop", "fs dump", "mds metadata", "mds versions",
//   "mds count-metadata", "fs compat show", "mds compat show", "fs get",
//   "fs ls", "fs feature ls", "fs lsflags".
// The final reply is sent with reply_command(op, r, rs, rdata, ...).
937 bool MDSMonitor::preprocess_command(MonOpRequestRef op
)
939 op
->mark_mdsmon_event(__func__
);
940 auto m
= op
->get_req
<MMonCommand
>();
946 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
947 // ss has reason for failure
948 string rs
= ss
.str();
949 mon
.reply_command(op
, -EINVAL
, rs
, rdata
, get_last_committed());
954 cmd_getval(cmdmap
, "prefix", prefix
);
955 string format
= cmd_getval_or
<string
>(cmdmap
, "format", "plain");
956 std::unique_ptr
<Formatter
> f(Formatter::create(format
));
958 MonSession
*session
= op
->get_session();
960 mon
.reply_command(op
, -EACCES
, "access denied", rdata
, get_last_committed());
964 // to use const qualifier filter fsmap beforehand
965 FSMap _fsmap_copy
= get_fsmap();
966 _fsmap_copy
.filter(session
->get_allowed_fs_names());
967 const auto& fsmap
= _fsmap_copy
;
969 if (prefix
== "mds stat") {
971 f
->open_object_section("mds_stat");
979 } else if (prefix
== "mds ok-to-stop") {
981 if (!cmd_getval(cmdmap
, "ids", ids
)) {
983 ss
<< "must specify mds id";
986 if (fsmap
.is_any_degraded()) {
987 ss
<< "one or more filesystems is currently degraded";
// Resolve every requested id to a gid; unknown daemons are skipped.
991 set
<mds_gid_t
> stopping
;
992 for (auto& id
: ids
) {
994 mds_gid_t gid
= gid_from_arg(fsmap
, id
, ess
);
995 if (gid
== MDS_GID_NONE
) {
996 // the mds doesn't exist, but no file systems are unhappy, so losing it
997 // can't have any effect.
1000 stopping
.insert(gid
);
1002 set
<mds_gid_t
> active
;
1003 set
<mds_gid_t
> standby
;
1004 for (auto gid
: stopping
) {
1005 if (fsmap
.gid_has_rank(gid
)) {
1006 // ignore standby-replay daemons (at this level)
1007 if (!fsmap
.is_standby_replay(gid
)) {
// NOTE(review): this local "standby" (a gid) shadows the outer
// set<mds_gid_t> standby for the rest of this scope.
1008 auto standby
= fsmap
.get_standby_replay(gid
);
1009 if (standby
== MDS_GID_NONE
||
1010 stopping
.count(standby
)) {
1011 // no standby-replay, or we're also stopping the standby-replay
1017 // net loss of a standby
1018 standby
.insert(gid
);
// Refuse when the standbys that would remain cannot cover the active
// daemons being stopped.
1021 if (fsmap
.get_num_standby() - standby
.size() < active
.size()) {
1023 ss
<< "insufficent standby MDS daemons to stop active gids "
1024 << stringify(active
)
1025 << " and/or standby gids " << stringify(standby
);;
1029 ss
<< "should be safe to stop " << ids
;
1030 } else if (prefix
== "fs dump") {
1034 const FSMap
*fsmapp
= &fsmap
;
// With an explicit "epoch" argument, that stored version is read instead
// of the current map.
1036 if (cmd_getval(cmdmap
, "epoch", epocharg
)) {
1039 int err
= get_version(epoch
, b
);
1040 if (err
== -ENOENT
) {
1044 ceph_assert(err
== 0);
1045 ceph_assert(b
.length());
1053 f
->open_object_section("fsmap");
1054 fsmapp
->dump(f
.get());
1064 ss
<< "dumped fsmap epoch " << fsmapp
->get_epoch();
1065 } else if (prefix
== "mds metadata") {
1067 f
.reset(Formatter::create("json-pretty"));
// No "who" argument means every daemon's metadata is dumped.
1070 bool all
= !cmd_getval(cmdmap
, "who", who
);
1071 dout(1) << "all = " << all
<< dendl
;
1074 // Dump all MDSs' metadata
1075 const auto all_info
= fsmap
.get_mds_info();
1077 f
->open_array_section("mds_metadata");
1078 for(const auto &i
: all_info
) {
1079 const auto &info
= i
.second
;
1081 f
->open_object_section("mds");
1082 f
->dump_string("name", info
.name
);
1083 std::ostringstream get_err
;
1084 r
= dump_metadata(fsmap
, info
.name
, f
.get(), get_err
);
1085 if (r
== -EINVAL
|| r
== -ENOENT
) {
1086 // Drop error, list what metadata we do have
1087 dout(1) << get_err
.str() << dendl
;
1089 } else if (r
!= 0) {
1090 derr
<< "Unexpected error reading metadata: " << cpp_strerror(r
)
1092 ss
<< get_err
.str();
1100 // Dump a single daemon's metadata
1101 f
->open_object_section("mds_metadata");
1102 r
= dump_metadata(fsmap
, who
, f
.get(), ss
);
1106 } else if (prefix
== "mds versions") {
1108 f
.reset(Formatter::create("json-pretty"));
1109 count_metadata("ceph_version", f
.get());
1112 } else if (prefix
== "mds count-metadata") {
1114 f
.reset(Formatter::create("json-pretty"));
1116 cmd_getval(cmdmap
, "property", field
);
1117 count_metadata(field
, f
.get());
1120 } else if (prefix
== "fs compat show") {
1122 cmd_getval(cmdmap
, "fs_name", fs_name
);
1123 const auto &fs
= fsmap
.get_filesystem(fs_name
);
1124 if (fs
== nullptr) {
1125 ss
<< "filesystem '" << fs_name
<< "' not found";
1131 f
->open_object_section("mds_compat");
1132 fs
->mds_map
.compat
.dump(f
.get());
1136 ds
<< fs
->mds_map
.compat
;
1139 } else if (prefix
== "mds compat show") {
1141 f
->open_object_section("mds_compat");
1142 fsmap
.default_compat
.dump(f
.get());
1146 ds
<< fsmap
.default_compat
;
1149 } else if (prefix
== "fs get") {
1151 cmd_getval(cmdmap
, "fs_name", fs_name
);
1152 const auto &fs
= fsmap
.get_filesystem(fs_name
);
1153 if (fs
== nullptr) {
1154 ss
<< "filesystem '" << fs_name
<< "' not found";
1158 f
->open_object_section("filesystem");
1168 } else if (prefix
== "fs ls") {
1170 f
->open_array_section("filesystems");
1171 for (const auto &p
: fsmap
.filesystems
) {
1172 const auto &fs
= p
.second
;
1173 f
->open_object_section("filesystem");
1175 const MDSMap
&mds_map
= fs
->mds_map
;
1176 f
->dump_string("name", mds_map
.fs_name
);
1177 /* Output both the names and IDs of pools, for use by
1178 * humans and machines respectively */
1179 f
->dump_string("metadata_pool", mon
.osdmon()->osdmap
.get_pool_name(
1180 mds_map
.metadata_pool
));
1181 f
->dump_int("metadata_pool_id", mds_map
.metadata_pool
);
1182 f
->open_array_section("data_pool_ids");
1183 for (const auto &id
: mds_map
.data_pools
) {
1184 f
->dump_int("data_pool_id", id
);
1188 f
->open_array_section("data_pools");
1189 for (const auto &id
: mds_map
.data_pools
) {
1190 const auto &name
= mon
.osdmon()->osdmap
.get_pool_name(id
);
1191 f
->dump_string("data_pool", name
);
// Text form of the same listing, written to ds.
1200 for (const auto &p
: fsmap
.filesystems
) {
1201 const auto &fs
= p
.second
;
1202 const MDSMap
&mds_map
= fs
->mds_map
;
1203 const string
&md_pool_name
= mon
.osdmon()->osdmap
.get_pool_name(
1204 mds_map
.metadata_pool
);
1206 ds
<< "name: " << mds_map
.fs_name
<< ", metadata pool: "
1207 << md_pool_name
<< ", data pools: [";
1208 for (const auto &id
: mds_map
.data_pools
) {
1209 const string
&pool_name
= mon
.osdmon()->osdmap
.get_pool_name(id
);
1210 ds
<< pool_name
<< " ";
1212 ds
<< "]" << std::endl
;
1215 if (fsmap
.filesystems
.empty()) {
1216 ds
<< "No filesystems enabled" << std::endl
;
1220 } else if (prefix
== "fs feature ls") {
1222 f
->open_array_section("cephfs_features");
// The loop bound is inclusive: indices 0 through CEPHFS_FEATURE_MAX.
1223 for (size_t i
= 0; i
<= CEPHFS_FEATURE_MAX
; ++i
) {
1224 f
->open_object_section("feature");
1225 f
->dump_int("index", i
);
1226 f
->dump_string("name", cephfs_feature_name(i
));
1232 for (size_t i
= 0; i
<= CEPHFS_FEATURE_MAX
; ++i
) {
1233 ds
<< i
<< " " << cephfs_feature_name(i
) << std::endl
;
1237 } else if (prefix
== "fs lsflags") {
1239 cmd_getval(cmdmap
, "fs_name", fs_name
);
1240 const auto &fs
= fsmap
.get_filesystem(fs_name
);
1242 ss
<< "filesystem '" << fs_name
<< "' not found";
1245 const MDSMap
&mds_map
= fs
->mds_map
;
1247 mds_map
.dump_flags_state(f
.get());
1251 mds_map
.print_flags(ds
);
1262 mon
.reply_command(op
, r
, rs
, rdata
, get_last_committed());
// Remove daemon gid from fsmap and drop its monitor-side bookkeeping.
// Requires the OSD monitor to be writeable (asserted). A daemon that holds a
// rank (rank >= 0) and is not in STATE_STANDBY_REPLAY has its addresses
// blocklisted until now + mon_mds_blocklist_interval. The daemon is then
// erased from fsmap with the resulting blocklist epoch, its last_beacon
// entry is removed, and any pending health entry is moved to the removal set.
//   fsmap  map to modify
//   gid    daemon to fail; looked up with get_info_gid() before any check
// Returns true when a blocklist entry was made (blocklist_epoch != 0).
1268 bool MDSMonitor::fail_mds_gid(FSMap
&fsmap
, mds_gid_t gid
)
1270 const auto& info
= fsmap
.get_info_gid(gid
);
1271 dout(1) << "fail_mds_gid " << gid
<< " mds." << info
.name
<< " role " << info
.rank
<< dendl
;
1273 ceph_assert(mon
.osdmon()->is_writeable());
1275 epoch_t blocklist_epoch
= 0;
1276 if (info
.rank
>= 0 && info
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
1277 utime_t until
= ceph_clock_now();
1278 until
+= g_conf().get_val
<double>("mon_mds_blocklist_interval");
1279 blocklist_epoch
= mon
.osdmon()->blocklist(info
.addrs
, until
);
1282 fsmap
.erase(gid
, blocklist_epoch
);
1283 last_beacon
.erase(gid
);
1284 if (pending_daemon_health
.count(gid
)) {
1285 pending_daemon_health
.erase(gid
);
1286 pending_daemon_health_rm
.insert(gid
);
1289 return blocklist_epoch
!= 0;
// Resolve a user-supplied MDS reference to a gid.
// The argument is tried, in this order, as:
//   1. a role (via fsmap.parse_role); if that rank is up, its gid is returned;
//   2. an integer gid, accepted only if it exists in fsmap;
//   3. a daemon name (via fsmap.find_by_name).
//   fsmap  map to resolve against
//   arg    role, gid or daemon name as text
//   ss     receives the "does not exist, or is not up" message for a failed
//          name lookup; parse_role errors go to a private stream instead
// Returns the resolved gid, or MDS_GID_NONE when nothing matches.
1292 mds_gid_t
MDSMonitor::gid_from_arg(const FSMap
&fsmap
, const std::string
&arg
, std::ostream
&ss
)
1294 // Try parsing as a role
1296 std::ostringstream ignore_err
; // Don't spam 'ss' with parse_role errors
1297 int r
= fsmap
.parse_role(arg
, &role
, ignore_err
);
1299 // See if a GID is assigned to this role
1300 const auto &fs
= fsmap
.get_filesystem(role
.fscid
);
1301 ceph_assert(fs
!= nullptr); // parse_role ensures it exists
1302 if (fs
->mds_map
.is_up(role
.rank
)) {
1303 dout(10) << __func__
<< ": validated rank/GID " << role
1304 << " as a rank" << dendl
;
1305 return fs
->mds_map
.get_mds_info(role
.rank
).global_id
;
1309 // Try parsing as a gid
// NOTE(review): the result of strict_strtoll is stored in an unsigned
// long long; presumably it returns a signed value, so a negative input would
// wrap - confirm the intended handling.
1311 unsigned long long maybe_gid
= strict_strtoll(arg
.c_str(), 10, &err
);
1313 // Not a role or a GID, try as a daemon name
1314 const MDSMap::mds_info_t
*mds_info
= fsmap
.find_by_name(arg
);
1316 ss
<< "MDS named '" << arg
1317 << "' does not exist, or is not up";
1318 return MDS_GID_NONE
;
1320 dout(10) << __func__
<< ": resolved MDS name '" << arg
1321 << "' to GID " << mds_info
->global_id
<< dendl
;
1322 return mds_info
->global_id
;
1324 // Not a role, but parses as an integer, might be a GID
1325 dout(10) << __func__
<< ": treating MDS reference '" << arg
1326 << "' as an integer " << maybe_gid
<< dendl
;
1328 if (fsmap
.gid_exists(mds_gid_t(maybe_gid
))) {
1329 return mds_gid_t(maybe_gid
);
1333 dout(1) << __func__
<< ": rank/GID " << arg
1334 << " not a existent rank or GID" << dendl
;
1335 return MDS_GID_NONE
;
1338 int MDSMonitor::fail_mds(FSMap
&fsmap
, std::ostream
&ss
,
1339 const std::string
&arg
, MDSMap::mds_info_t
*failed_info
)
1341 ceph_assert(failed_info
!= nullptr);
1343 mds_gid_t gid
= gid_from_arg(fsmap
, arg
, ss
);
1344 if (gid
== MDS_GID_NONE
) {
1347 if (!mon
.osdmon()->is_writeable()) {
1351 // Take a copy of the info before removing the MDS from the map,
1352 // so that the caller knows which mds (if any) they ended up removing.
1353 *failed_info
= fsmap
.get_info_gid(gid
);
1355 fail_mds_gid(fsmap
, gid
);
1356 ss
<< "failed mds gid " << gid
;
1357 ceph_assert(mon
.osdmon()->is_writeable());
1358 request_proposal(mon
.osdmon());
// Handle a state-changing monitor command carried by op.
//
// Visible steps:
//   - the command JSON is parsed into cmdmap; a parse failure is answered
//     with -EINVAL
//   - the prefix is read from cmdmap and the session is fetched; an
//     -EACCES reply with the text access denied is sent on the
//     no-valid-session path
//   - each registered handler is asked via can_handle(), and the chosen one
//     is run via handle() against the writeable pending FSMap
//   - filesystem_command() follows the handler loop (presumably the
//     fallback when no handler took the command - confirm)
//   - -ENOSYS with an empty message is reported as an unrecognized command
//   - the reply is either deferred until the proposal finishes
//     (wait_for_finished_proposal with a Monitor::C_Command, plus
//     force_immediate_propose() for batched handlers) or sent immediately
//
// NOTE(review): several declarations, conditions, returns and labels are
// absent from this listing (the embedded original line numbers skip).
// Confirm the exact branch structure against the complete file.
1362 bool MDSMonitor::prepare_command(MonOpRequestRef op
)
1364 op
->mark_mdsmon_event(__func__
);
1365 auto m
= op
->get_req
<MMonCommand
>();
1371 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
1372 string rs
= ss
.str();
1373 mon
.reply_command(op
, -EINVAL
, rs
, rdata
, get_last_committed());
1378 cmd_getval(cmdmap
, "prefix", prefix
);
1380 /* Refuse access if message not associated with a valid session */
1381 MonSession
*session
= op
->get_session();
1383 mon
.reply_command(op
, -EACCES
, "access denied", rdata
, get_last_committed());
1387 auto &pending
= get_pending_fsmap_writeable();
1389 bool batched_propose
= false;
// Offer the command to each registered handler in turn.
1390 for (const auto &h
: handlers
) {
1391 r
= h
->can_handle(prefix
, op
, pending
, cmdmap
, ss
);
1393 ; // pass, since we got the right handler.
1394 } else if (r
== 0) {
1400 batched_propose
= h
->batched_propose();
1401 if (batched_propose
) {
1404 r
= h
->handle(&mon
, pending
, op
, cmdmap
, ss
);
1405 if (batched_propose
) {
1410 // message has been enqueued for retry; return.
1411 dout(4) << __func__
<< " enqueue for retry by prepare_command" << dendl
;
1415 // On successful updates, print the updated map
1418 // Successful or not, we're done: respond.
1423 r
= filesystem_command(pending
, op
, prefix
, cmdmap
, ss
);
1426 } else if (r
== -EAGAIN
) {
1427 // Do not reply, the message has been enqueued for retry
1428 dout(4) << __func__
<< " enqueue for retry by filesystem_command" << dendl
;
1430 } else if (r
!= -ENOSYS
) {
1434 if (r
== -ENOSYS
&& ss
.str().empty()) {
1435 ss
<< "unrecognized command";
1439 dout(4) << __func__
<< " done, r=" << r
<< dendl
;
1440 /* Compose response */
1445 // success.. delay reply
1446 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, r
, rs
,
1447 get_last_committed() + 1));
1448 if (batched_propose
) {
1449 force_immediate_propose();
1453 // reply immediately
1454 mon
.reply_command(op
, r
, rs
, rdata
, get_last_committed());
// Execute one of the legacy mds-prefixed commands against fsmap.
//
// Prefixes handled in this function, as visible below:
//   mds set_state, mds fail, mds rm, mds rmfailed,
//   mds compat rm_compat, mds compat rm_incompat, mds repaired, mds freeze
//
// Common pattern: arguments are read from cmdmap with cmd_getval(); the
// target daemon or role is checked against the filesystems the session is
// allowed to see (get_allowed_fs_names) and, for most commands, against
// write capability on that filesystem (fs_name_capable with MON_CAP_W,
// answered with Permission denied.). Human-readable results and errors are
// written to ss.
//
// The one return value visible here is -EAGAIN from the mds fail branch:
// fail_mds() reported that the OSD monitor is not writeable, so the op is
// queued for retry with wait_for_writeable and must not be proposed yet.
//
// NOTE(review): two parameters of the signature and most return statements
// are absent from this listing (the embedded original line numbers skip).
// Confirm return codes against the complete file.
1459 int MDSMonitor::filesystem_command(
1462 std::string
const &prefix
,
1463 const cmdmap_t
& cmdmap
,
1464 std::stringstream
&ss
)
1466 dout(4) << __func__
<< " prefix='" << prefix
<< "'" << dendl
;
1467 op
->mark_mdsmon_event(__func__
);
1470 cmd_getval(cmdmap
, "role", whostr
);
// mds set_state: overwrite the state of an existing daemon.
1472 if (prefix
== "mds set_state") {
1474 if (!cmd_getval(cmdmap
, "gid", gid
)) {
1475 ss
<< "error parsing 'gid' value '"
1476 << cmd_vartype_stringify(cmdmap
.at("gid")) << "'";
1479 MDSMap::DaemonState state
;
1480 if (!cmd_getval(cmdmap
, "state", state
)) {
1481 ss
<< "error parsing 'state' string value '"
1482 << cmd_vartype_stringify(cmdmap
.at("state")) << "'";
1485 if (fsmap
.gid_exists(gid
, op
->get_session()->get_allowed_fs_names())) {
1486 fsmap
.modify_daemon(gid
, [state
](auto& info
) {
1489 ss
<< "set mds gid " << gid
<< " to state " << state
<< " "
1490 << ceph_mds_state_name(state
);
// mds fail: resolve the daemon, check visibility and write capability,
// then fail it through fail_mds().
1493 } else if (prefix
== "mds fail") {
1495 cmd_getval(cmdmap
, "role_or_gid", who
);
1497 MDSMap::mds_info_t failed_info
;
1498 mds_gid_t gid
= gid_from_arg(fsmap
, who
, ss
);
1499 if (gid
== MDS_GID_NONE
) {
1500 ss
<< "MDS named '" << who
<< "' does not exist, is not up or you "
1501 << "lack the permission to see.";
1504 if(!fsmap
.gid_exists(gid
, op
->get_session()->get_allowed_fs_names())) {
1505 ss
<< "MDS named '" << who
<< "' does not exist, is not up or you "
1506 << "lack the permission to see.";
1509 string_view fs_name
= fsmap
.fs_name_from_gid(gid
);
1510 if (!op
->get_session()->fs_name_capable(fs_name
, MON_CAP_W
)) {
1511 ss
<< "Permission denied.";
1515 r
= fail_mds(fsmap
, ss
, who
, &failed_info
);
1516 if (r
< 0 && r
== -EAGAIN
) {
1517 mon
.osdmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
1518 return -EAGAIN
; // don't propose yet; wait for message to be retried
1519 } else if (r
== 0) {
1520 // Only log if we really did something (not when was already gone)
1521 if (failed_info
.global_id
!= MDS_GID_NONE
) {
1522 mon
.clog
->info() << failed_info
.human_name() << " marked failed by "
1523 << op
->get_session()->entity_name
;
// mds rm: erase a daemon entry by gid (refused for an active daemon, per
// the message below).
1526 } else if (prefix
== "mds rm") {
1528 if (!cmd_getval(cmdmap
, "gid", gid
)) {
1529 ss
<< "error parsing 'gid' value '"
1530 << cmd_vartype_stringify(cmdmap
.at("gid")) << "'";
1533 if (!fsmap
.gid_exists(gid
, op
->get_session()->get_allowed_fs_names())) {
1534 ss
<< "mds gid " << gid
<< " does not exist";
1537 string_view fs_name
= fsmap
.fs_name_from_gid(gid
);
1538 if (!op
->get_session()->fs_name_capable(fs_name
, MON_CAP_W
)) {
1539 ss
<< "Permission denied.";
1542 const auto &info
= fsmap
.get_info_gid(gid
);
1543 MDSMap::DaemonState state
= info
.state
;
1545 ss
<< "cannot remove active mds." << info
.name
1546 << " rank " << info
.rank
;
1549 fsmap
.erase(gid
, {});
1550 ss
<< "removed mds gid " << gid
;
// mds rmfailed: drop a rank from the failed set of its filesystem; reads the
// yes_i_really_mean_it confirmation flag.
1553 } else if (prefix
== "mds rmfailed") {
1554 bool confirm
= false;
1555 cmd_getval(cmdmap
, "yes_i_really_mean_it", confirm
);
1557 ss
<< "WARNING: this can make your filesystem inaccessible! "
1558 "Add --yes-i-really-mean-it if you are sure you wish to continue.";
1562 std::string role_str
;
1563 cmd_getval(cmdmap
, "role", role_str
);
1565 const auto fs_names
= op
->get_session()->get_allowed_fs_names();
1566 int r
= fsmap
.parse_role(role_str
, &role
, ss
, fs_names
);
1568 ss
<< "invalid role '" << role_str
<< "'";
1571 string_view fs_name
= fsmap
.get_filesystem(role
.fscid
)->mds_map
.get_fs_name();
1572 if (!op
->get_session()->fs_name_capable(fs_name
, MON_CAP_W
)) {
1573 ss
<< "Permission denied.";
1577 fsmap
.modify_filesystem(
1579 [role
](std::shared_ptr
<Filesystem
> fs
)
1581 fs
->mds_map
.failed
.erase(role
.rank
);
1584 ss
<< "removed failed mds." << role
;
1586 /* TODO: convert to fs commands to update defaults */
1587 } else if (prefix
== "mds compat rm_compat") {
1589 if (!cmd_getval(cmdmap
, "feature", f
)) {
1590 ss
<< "error parsing feature value '"
1591 << cmd_vartype_stringify(cmdmap
.at("feature")) << "'";
1594 if (fsmap
.default_compat
.compat
.contains(f
)) {
1595 ss
<< "removing compat feature " << f
;
1596 fsmap
.default_compat
.compat
.remove(f
);
1598 ss
<< "compat feature " << f
<< " not present in " << fsmap
.default_compat
;
1601 } else if (prefix
== "mds compat rm_incompat") {
1603 if (!cmd_getval(cmdmap
, "feature", f
)) {
1604 ss
<< "error parsing feature value '"
1605 << cmd_vartype_stringify(cmdmap
.at("feature")) << "'";
1608 if (fsmap
.default_compat
.incompat
.contains(f
)) {
1609 ss
<< "removing incompat feature " << f
;
1610 fsmap
.default_compat
.incompat
.remove(f
);
1612 ss
<< "incompat feature " << f
<< " not present in " << fsmap
.default_compat
;
// mds repaired: clear the damaged marking of a rank via FSMap::undamaged().
1615 } else if (prefix
== "mds repaired") {
1616 std::string role_str
;
1617 cmd_getval(cmdmap
, "role", role_str
);
1619 const auto fs_names
= op
->get_session()->get_allowed_fs_names();
1620 r
= fsmap
.parse_role(role_str
, &role
, ss
, fs_names
);
1624 string_view fs_name
= fsmap
.get_filesystem(role
.fscid
)->mds_map
.get_fs_name();
1625 if (!op
->get_session()->fs_name_capable(fs_name
, MON_CAP_W
)) {
1626 ss
<< "Permission denied.";
1630 bool modified
= fsmap
.undamaged(role
.fscid
, role
.rank
);
1632 ss
<< "repaired: restoring rank " << role
;
1634 ss
<< "nothing to do: rank is not damaged";
// mds freeze: parse the boolean val argument and apply it to the daemon
// through modify_daemon.
1638 } else if (prefix
== "mds freeze") {
1640 cmd_getval(cmdmap
, "role_or_gid", who
);
1641 mds_gid_t gid
= gid_from_arg(fsmap
, who
, ss
);
1642 if (gid
== MDS_GID_NONE
) {
1646 string_view fs_name
= fsmap
.fs_name_from_gid(gid
);
1647 if (!op
->get_session()->fs_name_capable(fs_name
, MON_CAP_W
)) {
1648 ss
<< "Permission denied.";
1652 bool freeze
= false;
1655 cmd_getval(cmdmap
, "val", str
);
1656 if ((r
= parse_bool(str
, &freeze
, ss
)) != 0) {
1661 auto f
= [freeze
,gid
,&ss
](auto& info
) {
1663 ss
<< "freezing mds." << gid
;
1666 ss
<< "unfreezing mds." << gid
;
1670 fsmap
.modify_daemon(gid
, f
);
// Walk every subscription this service is responsible for.
//
// A list of subscription type names is built and extended with one
// mdsmap.<fscid> entry per filesystem in the current FSMap. For each type
// that has an entry in mon.session_map.subs, the subscriber list is
// iterated; the iterator is advanced early because, per the note below,
// check_sub may remove the subscription being visited.
//
// NOTE(review): the initial entries of the type list and the per-subscription
// call are among the lines missing from this listing; confirm against the
// complete file.
1679 void MDSMonitor::check_subs()
1681 // Subscriptions may be to "mdsmap" (MDS and legacy clients),
1682 // "mdsmap.<namespace>", or to "fsmap" for the full state of all
1683 // filesystems. Build a list of all the types we service
1684 // subscriptions for.
1686 std::vector
<std::string
> types
= {
1692 for (const auto &p
: get_fsmap().filesystems
) {
1693 const auto &fscid
= p
.first
;
1694 CachedStackStringStream cos
;
1695 *cos
<< "mdsmap." << fscid
;
1696 types
.push_back(std::string(cos
->strv()));
1699 for (const auto &type
: types
) {
1700 auto& subs
= mon
.session_map
.subs
;
1701 auto subs_it
= subs
.find(type
);
1702 if (subs_it
== subs
.end())
1704 auto sub_it
= subs_it
->second
->begin();
1705 while (!sub_it
.end()) {
1707 ++sub_it
; // N.B. check_sub may remove sub!
// Service a single subscription.
//
// Works on a private copy of the FSMap, filtered to the filesystems the
// subscribing session is allowed to see. Three families of subscription
// type are handled:
//   - fsmap:       sends an MFSMap built from the filtered map
//   - fsmap.user:  sends an MFSMapUser carrying the epoch, the
//                  legacy_client_fscid and the fscid and name of each
//                  visible filesystem
//   - types beginning with mdsmap: selects one MDSMap and sends it in an
//                  MMDSMap. The session is matched to a daemon by comparing
//                  addresses; a mdsmap.<id> type has its id parsed as the
//                  filesystem id; a bare mdsmap type uses
//                  legacy_client_fscid. When no filesystem applies
//                  (FS_CLUSTER_ID_NONE, asserted to be an MDS), a null
//                  MDSMap is used, populated with the daemon's standby entry
//                  if it has one.
// In every family the subscription is afterwards either removed from the
// session map or has its next epoch set to one past the epoch just sent.
//
// NOTE(review): the early returns and the conditions selecting between
// removal and epoch advance are among the lines missing from this listing;
// confirm against the complete file.
1714 void MDSMonitor::check_sub(Subscription
*sub
)
1716 dout(20) << __func__
<< ": " << sub
->type
<< dendl
;
1718 // to use const qualifier filter fsmap beforehand
1719 FSMap _fsmap_copy
= get_fsmap();
1720 _fsmap_copy
.filter(sub
->session
->get_allowed_fs_names());
1721 const auto& fsmap
= _fsmap_copy
;
1722 if (sub
->next
> fsmap
.get_epoch()) {
1726 if (sub
->type
== "fsmap") {
1727 sub
->session
->con
->send_message(new MFSMap(mon
.monmap
->fsid
, fsmap
));
1729 mon
.session_map
.remove_sub(sub
);
1731 sub
->next
= fsmap
.get_epoch() + 1;
1733 } else if (sub
->type
== "fsmap.user") {
1735 fsmap_u
.epoch
= fsmap
.get_epoch();
1736 fsmap_u
.legacy_client_fscid
= fsmap
.legacy_client_fscid
;
1737 for (const auto &p
: fsmap
.filesystems
) {
1738 FSMapUser::fs_info_t
& fs_info
= fsmap_u
.filesystems
[p
.second
->fscid
];
1739 fs_info
.cid
= p
.second
->fscid
;
1740 fs_info
.name
= p
.second
->mds_map
.fs_name
;
1742 sub
->session
->con
->send_message(new MFSMapUser(mon
.monmap
->fsid
, fsmap_u
));
1744 mon
.session_map
.remove_sub(sub
);
1746 sub
->next
= fsmap
.get_epoch() + 1;
1748 } else if (sub
->type
.compare(0, 6, "mdsmap") == 0) {
1749 const bool is_mds
= sub
->session
->name
.is_mds();
1750 mds_gid_t mds_gid
= MDS_GID_NONE
;
1751 fs_cluster_id_t fscid
= FS_CLUSTER_ID_NONE
;
1753 // What (if any) namespace are you assigned to?
1754 auto mds_info
= fsmap
.get_mds_info();
1755 for (const auto &p
: mds_info
) {
1756 if (p
.second
.addrs
== sub
->session
->addrs
) {
1758 fscid
= fsmap
.mds_roles
.at(mds_gid
);
1762 // You're a client. Did you request a particular
1764 if (sub
->type
.compare(0, 7, "mdsmap.") == 0) {
1765 auto namespace_id_str
= sub
->type
.substr(std::string("mdsmap.").size());
1766 dout(10) << __func__
<< ": namespace_id " << namespace_id_str
<< dendl
;
1768 fscid
= strict_strtoll(namespace_id_str
.c_str(), 10, &err
);
1770 // Client asked for a non-existent namespace, send them nothing
1771 dout(1) << "Invalid client subscription '" << sub
->type
1776 // Unqualified request for "mdsmap": give it the one marked
1777 // for use by legacy clients.
1778 if (fsmap
.legacy_client_fscid
!= FS_CLUSTER_ID_NONE
) {
1779 fscid
= fsmap
.legacy_client_fscid
;
1781 dout(1) << "Client subscribed for legacy filesystem but "
1782 "none is configured" << dendl
;
1786 if (!fsmap
.filesystem_exists(fscid
)) {
1787 // Client asked for a non-existent namespace, send them nothing
1788 // TODO: something more graceful for when a client has a filesystem
1789 // mounted, and the fileysstem is deleted. Add a "shut down you fool"
1791 dout(1) << "Client subscribed to non-existent namespace '" <<
1792 fscid
<< "'" << dendl
;
1796 dout(10) << __func__
<< ": is_mds=" << is_mds
<< ", fscid=" << fscid
<< dendl
;
1798 // Work out the effective latest epoch
1799 const MDSMap
*mds_map
= nullptr;
1800 MDSMap null_map
= MDSMap::create_null_mdsmap();
1801 if (fscid
== FS_CLUSTER_ID_NONE
) {
1802 // For a client, we should have already dropped out
1803 ceph_assert(is_mds
);
1805 auto it
= fsmap
.standby_daemons
.find(mds_gid
);
1806 if (it
!= fsmap
.standby_daemons
.end()) {
1807 // For an MDS, we need to feed it an MDSMap with its own state in
1808 null_map
.mds_info
[mds_gid
] = it
->second
;
1809 null_map
.epoch
= fsmap
.standby_epochs
.at(mds_gid
);
1811 null_map
.epoch
= fsmap
.epoch
;
1813 mds_map
= &null_map
;
1815 // Check the effective epoch
1816 mds_map
= &fsmap
.get_filesystem(fscid
)->mds_map
;
1819 ceph_assert(mds_map
!= nullptr);
1820 dout(10) << __func__
<< " selected MDS map epoch " <<
1821 mds_map
->epoch
<< " for namespace " << fscid
<< " for subscriber "
1822 << sub
->session
->name
<< " who wants epoch " << sub
->next
<< dendl
;
1824 if (sub
->next
> mds_map
->epoch
) {
1827 auto msg
= make_message
<MMDSMap
>(mon
.monmap
->fsid
, *mds_map
);
1829 sub
->session
->con
->send_message(msg
.detach());
1831 mon
.session_map
.remove_sub(sub
);
1833 sub
->next
= mds_map
->get_epoch() + 1;
1839 void MDSMonitor::update_metadata(mds_gid_t gid
,
1840 const map
<string
, string
>& metadata
)
1842 dout(20) << __func__
<< ": mds." << gid
<< ": " << metadata
<< dendl
;
1843 if (metadata
.empty()) {
1844 dout(5) << __func__
<< ": mds." << gid
<< ": no metadata!" << dendl
;
1847 pending_metadata
[gid
] = metadata
;
1849 MonitorDBStore::TransactionRef t
= paxos
.get_pending_transaction();
1851 encode(pending_metadata
, bl
);
1852 t
->put(MDS_METADATA_PREFIX
, "last_metadata", bl
);
1855 void MDSMonitor::remove_from_metadata(const FSMap
&fsmap
, MonitorDBStore::TransactionRef t
)
1857 bool update
= false;
1858 for (auto it
= pending_metadata
.begin(); it
!= pending_metadata
.end(); ) {
1859 if (!fsmap
.gid_exists(it
->first
)) {
1860 it
= pending_metadata
.erase(it
);
1869 encode(pending_metadata
, bl
);
1870 t
->put(MDS_METADATA_PREFIX
, "last_metadata", bl
);
1873 int MDSMonitor::load_metadata(map
<mds_gid_t
, Metadata
>& m
)
1876 int r
= mon
.store
->get(MDS_METADATA_PREFIX
, "last_metadata", bl
);
1878 dout(5) << "Unable to load 'last_metadata'" << dendl
;
1882 auto it
= bl
.cbegin();
1883 ceph::decode(m
, it
);
1887 void MDSMonitor::count_metadata(const std::string
&field
, map
<string
,int> *out
)
1889 map
<mds_gid_t
,Metadata
> meta
;
1890 load_metadata(meta
);
1891 for (auto& p
: meta
) {
1892 auto q
= p
.second
.find(field
);
1893 if (q
== p
.second
.end()) {
1894 (*out
)["unknown"]++;
1896 (*out
)[q
->second
]++;
1901 void MDSMonitor::count_metadata(const std::string
&field
, Formatter
*f
)
1903 map
<string
,int> by_val
;
1904 count_metadata(field
, &by_val
);
1905 f
->open_object_section(field
.c_str());
1906 for (auto& p
: by_val
) {
1907 f
->dump_int(p
.first
.c_str(), p
.second
);
1912 void MDSMonitor::get_versions(std::map
<string
, list
<string
> > &versions
)
1914 map
<mds_gid_t
,Metadata
> meta
;
1915 load_metadata(meta
);
1916 const auto &fsmap
= get_fsmap();
1917 std::map
<mds_gid_t
, mds_info_t
> map
= fsmap
.get_mds_info();
1918 dout(10) << __func__
<< " mds meta=" << meta
<< dendl
;
1919 for (auto& p
: meta
) {
1920 auto q
= p
.second
.find("ceph_version_short");
1921 if (q
== p
.second
.end()) continue;
1922 versions
[q
->second
].push_back(string("mds.") + map
[p
.first
].name
);
// Dump the stored metadata of one MDS daemon as string fields on f.
//
// who is resolved to a GID with gid_from_arg (which may write to err). The
// persisted metadata table is loaded; if that fails, a message is written to
// err. When the table has an entry for the GID, every key and value in it is
// emitted with dump_string.
//
// NOTE(review): the return statements are among the lines missing from this
// listing (the embedded original line numbers skip), so the error codes for
// an unresolved daemon, a failed load and a missing entry cannot be stated
// here; confirm against the complete file.
1926 int MDSMonitor::dump_metadata(const FSMap
& fsmap
, const std::string
&who
,
1927 Formatter
*f
, ostream
& err
)
1931 mds_gid_t gid
= gid_from_arg(fsmap
, who
, err
);
1932 if (gid
== MDS_GID_NONE
) {
1936 map
<mds_gid_t
, Metadata
> metadata
;
1937 if (int r
= load_metadata(metadata
)) {
1938 err
<< "Unable to load 'last_metadata'";
1942 if (!metadata
.count(gid
)) {
1945 const Metadata
& m
= metadata
[gid
];
1946 for (Metadata::const_iterator p
= m
.begin(); p
!= m
.end(); ++p
) {
1947 f
->dump_string(p
->first
.c_str(), p
->second
);
1952 int MDSMonitor::print_nodes(Formatter
*f
)
1956 const auto &fsmap
= get_fsmap();
1958 map
<mds_gid_t
, Metadata
> metadata
;
1959 if (int r
= load_metadata(metadata
)) {
1963 map
<string
, list
<string
> > mdses
; // hostname => mds
1964 for (const auto &p
: metadata
) {
1965 const mds_gid_t
& gid
= p
.first
;
1966 const Metadata
& m
= p
.second
;
1967 Metadata::const_iterator hostname
= m
.find("hostname");
1968 if (hostname
== m
.end()) {
1969 // not likely though
1972 if (!fsmap
.gid_exists(gid
)) {
1973 dout(5) << __func__
<< ": GID " << gid
<< " not existent" << dendl
;
1976 const MDSMap::mds_info_t
& mds_info
= fsmap
.get_info_gid(gid
);
1977 mdses
[hostname
->second
].push_back(mds_info
.name
);
1980 dump_services(f
, mdses
, "mds");
/**
 * If a cluster is undersized (with respect to max_mds), then
 * attempt to find daemons to grow it. If the cluster is oversized
 * (with respect to max_mds) then shrink it by stopping its highest rank.
 */
// Grow or shrink one filesystem toward its max_mds setting.
//
// in is the number of ranks currently in the map and max is max_mds. Nothing
// is attempted unless the filesystem exists in the committed FSMap and both
// the committed and the pending MDSMap report is_resizeable().
//   - in < max and the map is joinable: the lowest rank not yet in the map
//     is selected, a replacement standby is looked up for it and promoted
//   - in > max: the highest rank (in - 1) is targeted; if it is active its
//     daemon state is set to STATE_STOPPING, otherwise the stop is skipped
//
// NOTE(review): the return statements, the rank increment inside the while
// loop and the check on the standby lookup result are among the lines
// missing from this listing; confirm against the complete file.
1989 bool MDSMonitor::maybe_resize_cluster(FSMap
&fsmap
, fs_cluster_id_t fscid
)
1991 auto&& fs
= fsmap
.get_filesystem(fscid
);
1992 auto &mds_map
= fs
->mds_map
;
1994 int in
= mds_map
.get_num_in_mds();
1995 int max
= mds_map
.get_max_mds();
1997 dout(20) << __func__
<< " in " << in
<< " max " << max
<< dendl
;
1999 /* Check that both the current epoch mds_map is resizeable as well as the
2000 * current batch of changes in pending. This is important if an MDS is
2001 * becoming active in the next epoch.
2003 if (!get_fsmap().filesystem_exists(fscid
) ||
2004 !get_fsmap().get_filesystem(fscid
)->mds_map
.is_resizeable() ||
2005 !mds_map
.is_resizeable()) {
2006 dout(5) << __func__
<< " mds_map is not currently resizeable" << dendl
;
2010 if (in
< max
&& !mds_map
.test_flag(CEPH_MDSMAP_NOT_JOINABLE
)) {
2011 mds_rank_t mds
= mds_rank_t(0);
2012 while (mds_map
.is_in(mds
)) {
2015 auto info
= fsmap
.find_replacement_for({fscid
, mds
});
2020 dout(1) << "assigned standby " << info
->addrs
2021 << " as mds." << mds
<< dendl
;
2022 mon
.clog
->info() << info
->human_name() << " assigned to "
2023 "filesystem " << mds_map
.fs_name
<< " as rank "
2024 << mds
<< " (now has " << mds_map
.get_num_in_mds() + 1
2026 fsmap
.promote(info
->global_id
, *fs
, mds
);
2028 } else if (in
> max
) {
2029 mds_rank_t target
= in
- 1;
2030 const auto &info
= mds_map
.get_info(target
);
2031 if (mds_map
.is_active(target
)) {
2032 dout(1) << "stopping " << target
<< dendl
;
2033 mon
.clog
->info() << "stopping " << info
.human_name();
2034 auto f
= [](auto& info
) {
2035 info
.state
= MDSMap::STATE_STOPPING
;
2037 fsmap
.modify_daemon(info
.global_id
, f
);
2040 dout(20) << "skipping stop of " << target
<< dendl
;
/**
 * Fail a daemon and replace it with a suitable standby.
 */
// Remove daemon gid from fsmap and, when it held a rank and a replacement
// was supplied, promote that replacement into the rank.
//
// Branches visible below, in order:
//   - the daemon is frozen (the body of this branch is not visible here)
//   - the daemon is standby or standby-replay: it is failed with
//     fail_mds_gid and no replacement is involved
//   - the daemon holds a rank (rank >= 0) and rep_info is non-null: after a
//     check of the NOT_JOINABLE flag, the daemon is failed and rep_info is
//     promoted to the same rank of the same filesystem
//
// osd_propose must not be null; it is OR-ed with the result of
// fail_mds_gid, which reports whether a blocklist epoch was obtained.
//
// NOTE(review): the return statements and the bodies of the frozen and
// NOT_JOINABLE branches are among the lines missing from this listing;
// confirm against the complete file.
2052 bool MDSMonitor::drop_mds(FSMap
&fsmap
, mds_gid_t gid
, const mds_info_t
* rep_info
, bool *osd_propose
)
2054 ceph_assert(osd_propose
!= nullptr);
2056 const auto fscid
= fsmap
.mds_roles
.at(gid
);
2057 const auto& info
= fsmap
.get_info_gid(gid
);
2058 const auto rank
= info
.rank
;
2059 const auto state
= info
.state
;
2061 if (info
.is_frozen()) {
2063 } else if (state
== MDSMap::STATE_STANDBY_REPLAY
||
2064 state
== MDSMap::STATE_STANDBY
) {
2065 dout(1) << " failing and removing standby " << gid
<< " " << info
.addrs
2067 << "." << info
.inc
<< " " << ceph_mds_state_name(state
)
2069 *osd_propose
|= fail_mds_gid(fsmap
, gid
);
2071 } else if (rank
>= 0 && rep_info
) {
2072 auto fs
= fsmap
.filesystems
.at(fscid
);
2073 if (fs
->mds_map
.test_flag(CEPH_MDSMAP_NOT_JOINABLE
)) {
2077 // and is there a non-laggy standby that can take over for us?
2078 dout(1) << " replacing " << gid
<< " " << info
.addrs
2079 << " mds." << rank
<< "." << info
.inc
2080 << " " << ceph_mds_state_name(state
)
2081 << " with " << rep_info
->global_id
<< "/" << rep_info
->name
<< " " << rep_info
->addrs
2084 mon
.clog
->warn() << "Replacing " << info
.human_name()
2085 << " as rank " << rank
2086 << " with standby " << rep_info
->human_name();
2088 // Remove the old one
2089 *osd_propose
|= fail_mds_gid(fsmap
, gid
);
2091 // Promote the replacement
2092 fsmap
.promote(rep_info
->global_id
, *fs
, rank
);
// Examine beacon timestamps and rank affinity, updating fsmap as needed.
//
// Steps visible in this function:
//   - if the time since last_tick exceeds mds_beacon_grace minus
//     mds_beacon_interval, every recorded beacon stamp is reset to now; the
//     delay is attributed to the monitor rather than to the daemons
//   - last_beacon gets an entry for every gid in fsmap.mds_roles
//   - may_replace is true only when the most recent beacon from any daemon
//     is newer than max(mds_beacon_interval, mds_beacon_grace * 0.5)
//   - entries whose gid has left fsmap are erased from last_beacon
//   - a daemon silent for at least mds_beacon_grace is logged; removal is
//     deferred while a monitor is down or the quorum is younger than
//     mds_beacon_mon_down_grace and the silence is still shorter than that
//     grace; otherwise it is marked laggy if it was not already, and queued
//     for removal when the OSD map is writeable and may_replace holds
//   - queued daemons are passed to drop_mds, with a replacement candidate
//     looked up when they hold a rank
//   - when the OSD map is writeable, each joinable and resizeable filesystem
//     is scanned for a standby-replay or active daemon that a standby with
//     better join_fscid affinity could replace; at most one per filesystem
//     per call, per the comment on the break below
//
// propose_osdmap is OR-ed with the results of fail_mds_gid and is also
// passed on to drop_mds.
//
// NOTE(review): return statements, loop increments and several branch lines
// are absent from this listing (the embedded original line numbers skip).
// Confirm the exact control flow against the complete file.
2099 bool MDSMonitor::check_health(FSMap
& fsmap
, bool* propose_osdmap
)
2101 bool do_propose
= false;
2102 const auto now
= mono_clock::now();
2103 const bool osdmap_writeable
= mon
.osdmon()->is_writeable();
2104 const auto mds_beacon_grace
= g_conf().get_val
<double>("mds_beacon_grace");
2105 const auto mds_beacon_interval
= g_conf().get_val
<double>("mds_beacon_interval");
2107 if (mono_clock::is_zero(last_tick
)) {
2112 auto since_last
= std::chrono::duration
<double>(now
-last_tick
);
2114 if (since_last
.count() > (mds_beacon_grace
-mds_beacon_interval
)) {
2115 // This case handles either local slowness (calls being delayed
2116 // for whatever reason) or cluster election slowness (a long gap
2117 // between calls while an election happened)
2118 dout(1) << __func__
<< ": resetting beacon timeouts due to mon delay "
2119 "(slow election?) of " << since_last
.count() << " seconds" << dendl
;
2120 for (auto& p
: last_beacon
) {
2121 p
.second
.stamp
= now
;
2126 // make sure last_beacon is fully populated
2127 for (auto& p
: fsmap
.mds_roles
) {
2128 auto& gid
= p
.first
;
2129 last_beacon
.emplace(std::piecewise_construct
,
2130 std::forward_as_tuple(gid
),
2131 std::forward_as_tuple(now
, 0));
2134 // We will only take decisive action (replacing/removing a daemon)
2135 // if we have some indication that some other daemon(s) are successfully
2136 // getting beacons through recently.
2137 mono_time latest_beacon
= mono_clock::zero();
2138 for (const auto& p
: last_beacon
) {
2139 latest_beacon
= std::max(p
.second
.stamp
, latest_beacon
);
2141 auto since
= std::chrono::duration
<double>(now
-latest_beacon
);
2142 const bool may_replace
= since
.count() <
2143 std::max(g_conf()->mds_beacon_interval
, g_conf()->mds_beacon_grace
* 0.5);
2145 // check beacon timestamps
2146 std::vector
<mds_gid_t
> to_remove
;
2147 const bool mon_down
= mon
.is_mon_down();
2148 const auto mds_beacon_mon_down_grace
=
2149 g_conf().get_val
<std::chrono::seconds
>("mds_beacon_mon_down_grace");
2150 const auto quorum_age
= std::chrono::seconds(mon
.quorum_age());
2151 const bool new_quorum
= quorum_age
< mds_beacon_mon_down_grace
;
2152 for (auto it
= last_beacon
.begin(); it
!= last_beacon
.end(); ) {
2153 auto& [gid
, beacon_info
] = *it
;
2154 auto since_last
= std::chrono::duration
<double>(now
-beacon_info
.stamp
);
2156 if (!fsmap
.gid_exists(gid
)) {
2157 // gid no longer exists, remove from tracked beacons
2158 it
= last_beacon
.erase(it
);
2162 if (since_last
.count() >= g_conf()->mds_beacon_grace
) {
2163 auto& info
= fsmap
.get_info_gid(gid
);
2164 dout(1) << "no beacon from mds." << info
.rank
<< "." << info
.inc
2165 << " (gid: " << gid
<< " addr: " << info
.addrs
2166 << " state: " << ceph_mds_state_name(info
.state
) << ")"
2167 << " since " << since_last
.count() << dendl
;
2168 if ((mon_down
|| new_quorum
) && since_last
< mds_beacon_mon_down_grace
) {
2169 /* The MDS may be sending beacons to a monitor not yet in quorum or
2170 * temporarily partitioned. Hold off on removal for a little longer...
2172 dout(10) << "deferring removal for mds_beacon_mon_down_grace during MON_DOWN" << dendl
;
2176 // If the OSDMap is writeable, we can blocklist things, so we can
2177 // try failing any laggy MDS daemons. Consider each one for failure.
2178 if (!info
.laggy()) {
2179 dout(1) << " marking " << gid
<< " " << info
.addrs
2180 << " mds." << info
.rank
<< "." << info
.inc
2181 << " " << ceph_mds_state_name(info
.state
)
2182 << " laggy" << dendl
;
2183 fsmap
.modify_daemon(info
.global_id
, [](auto& info
) {
2184 info
.laggy_since
= ceph_clock_now();
2188 if (osdmap_writeable
&& may_replace
) {
2189 to_remove
.push_back(gid
); // drop_mds may invalidate iterator
2196 for (const auto& gid
: to_remove
) {
2197 auto info
= fsmap
.get_info_gid(gid
);
2198 const mds_info_t
* rep_info
= nullptr;
2199 if (info
.rank
>= 0) {
2200 auto fscid
= fsmap
.fscid_from_gid(gid
);
2201 rep_info
= fsmap
.find_replacement_for({fscid
, info
.rank
});
2203 bool dropped
= drop_mds(fsmap
, gid
, rep_info
, propose_osdmap
);
2205 mon
.clog
->info() << "MDS " << info
.human_name()
2206 << " is removed because it is dead or otherwise unavailable.";
2211 if (osdmap_writeable
) {
2212 for (auto& [fscid
, fs
] : fsmap
.filesystems
) {
2213 if (!fs
->mds_map
.test_flag(CEPH_MDSMAP_NOT_JOINABLE
) &&
2214 fs
->mds_map
.is_resizeable()) {
2215 // Check if a rank or standby-replay should be replaced with a stronger
2216 // affinity standby. This looks at ranks and standby-replay:
2217 for (const auto& [gid
, info
] : fs
->mds_map
.get_mds_info()) {
2218 const auto join_fscid
= info
.join_fscid
;
2219 if (join_fscid
== fscid
)
2221 const auto rank
= info
.rank
;
2222 const auto state
= info
.state
;
2223 const mds_info_t
* rep_info
= nullptr;
2224 if (state
== MDSMap::STATE_STANDBY_REPLAY
) {
2225 rep_info
= fsmap
.get_available_standby(*fs
);
2226 } else if (state
== MDSMap::STATE_ACTIVE
) {
2227 rep_info
= fsmap
.find_replacement_for({fscid
, rank
});
2229 /* N.B. !is_degraded() */
2230 ceph_abort_msg("invalid state in MDSMap");
2235 bool better_affinity
= false;
2236 if (join_fscid
== FS_CLUSTER_ID_NONE
) {
2237 better_affinity
= (rep_info
->join_fscid
== fscid
);
2239 better_affinity
= (rep_info
->join_fscid
== fscid
) ||
2240 (rep_info
->join_fscid
== FS_CLUSTER_ID_NONE
);
2242 if (better_affinity
) {
2243 if (state
== MDSMap::STATE_STANDBY_REPLAY
) {
2244 mon
.clog
->info() << "Dropping low affinity standby-replay "
2245 << info
.human_name()
2246 << " in favor of higher affinity standby.";
2247 *propose_osdmap
|= fail_mds_gid(fsmap
, gid
);
2248 /* Now let maybe_promote_standby do the promotion. */
2250 mon
.clog
->info() << "Dropping low affinity active "
2251 << info
.human_name()
2252 << " in favor of higher affinity standby.";
2253 do_propose
|= drop_mds(fsmap
, gid
, rep_info
, propose_osdmap
);
2255 break; /* don't replace more than one per tick per fs */
// Use available standbys for filesystem fs: first to take over failed
// ranks, then as standby-replay followers.
//
// The NOT_JOINABLE flag of the filesystem is tested first (the body of that
// branch is not visible here). For each rank in the failed set a
// replacement is looked up with find_replacement_for and promoted into the
// rank. Afterwards, if the map is resizeable and allows standby-replay, an
// available standby is fetched and the in ranks are scanned for one that
// is_followable; the standby is assigned to follow it with
// assign_standby_replay. The scan stops once a pass makes no change.
//
// NOTE(review): return statements, null checks on the standby lookups, the
// enclosing loop of the standby-replay pass and the updates of do_propose
// and changed are among the lines missing from this listing; confirm
// against the complete file.
2264 bool MDSMonitor::maybe_promote_standby(FSMap
&fsmap
, Filesystem
& fs
)
2266 if (fs
.mds_map
.test_flag(CEPH_MDSMAP_NOT_JOINABLE
)) {
2270 bool do_propose
= false;
2272 // have a standby take over?
2273 set
<mds_rank_t
> failed
;
2274 fs
.mds_map
.get_failed_mds_set(failed
);
2275 for (const auto& rank
: failed
) {
2276 auto info
= fsmap
.find_replacement_for({fs
.fscid
, rank
});
2278 dout(1) << " taking over failed mds." << rank
<< " with " << info
->global_id
2279 << "/" << info
->name
<< " " << info
->addrs
<< dendl
;
2280 mon
.clog
->info() << "Standby " << info
->human_name()
2281 << " assigned to filesystem " << fs
.mds_map
.fs_name
2282 << " as rank " << rank
;
2284 fsmap
.promote(info
->global_id
, fs
, rank
);
2289 if (fs
.mds_map
.is_resizeable() && fs
.mds_map
.allows_standby_replay()) {
2290 // There were no failures to replace, so try using any available standbys
2291 // as standby-replay daemons. Don't do this when the cluster is degraded
2292 // as a standby-replay daemon may try to read a journal being migrated.
2294 auto info
= fsmap
.get_available_standby(fs
);
2296 dout(20) << "standby available mds." << info
->global_id
<< dendl
;
2297 bool changed
= false;
2298 for (const auto& rank
: fs
.mds_map
.in
) {
2299 dout(20) << "examining " << rank
<< dendl
;
2300 if (fs
.mds_map
.is_followable(rank
)) {
2301 dout(1) << " setting mds." << info
->global_id
2302 << " to follow mds rank " << rank
<< dendl
;
2303 fsmap
.assign_standby_replay(info
->global_id
, fs
.fscid
, rank
);
2309 if (!changed
) break;
// Periodic maintenance; does nothing unless this service is active and the
// monitor is the leader.
//
// Work performed on the writeable pending FSMap, in order:
//   1. while check_fsmap_struct_version is set and more than 30 seconds have
//      passed since last_fsmap_struct_flush, the first committed FSMap
//      version is fetched and inspected; an old or undecodable struct leads
//      to a flush proposal, a recent one clears the flag
//   2. pending.check_health()
//   3. check_health(), which may also request an OSD map proposal
//   4. maybe_resize_cluster() for every filesystem
//   5. maybe_promote_standby() for every filesystem
// If an OSD map proposal was requested it is forwarded to the OSD monitor
// with request_proposal. last_tick is refreshed at the end.
//
// NOTE(review): the decode of the fetched FSMap and the final proposal of
// the pending FSMap are among the lines missing from this listing; confirm
// against the complete file.
2316 void MDSMonitor::tick()
2318 if (!is_active() || !is_leader()) return;
2320 auto &pending
= get_pending_fsmap_writeable();
2322 bool do_propose
= false;
2323 bool propose_osdmap
= false;
2325 if (check_fsmap_struct_version
) {
2326 /* Allow time for trimming otherwise PaxosService::is_writeable will always
2330 auto now
= clock::now();
2331 auto elapsed
= now
- last_fsmap_struct_flush
;
2332 if (elapsed
> std::chrono::seconds(30)) {
2335 auto v
= get_first_committed();
2336 int err
= get_version(v
, bl
);
2338 derr
<< "could not get version " << v
<< dendl
;
2343 } catch (const ceph::buffer::malformed_input
& e
) {
2344 dout(5) << "flushing old fsmap struct because unable to decode FSMap: " << e
.what() << dendl
;
2346 /* N.B. FSMap::is_struct_old is also true for undecoded (failed to decode) FSMap */
2347 if (fsmap
.is_struct_old()) {
2348 dout(5) << "fsmap struct is too old; proposing to flush out old versions" << dendl
;
2350 last_fsmap_struct_flush
= now
;
2352 dout(20) << "struct is recent" << dendl
;
2353 check_fsmap_struct_version
= false;
2358 do_propose
|= pending
.check_health();
2360 /* Check health and affinity of ranks */
2361 do_propose
|= check_health(pending
, &propose_osdmap
);
2363 /* Resize the cluster according to max_mds. */
2364 for (auto& p
: pending
.filesystems
) {
2365 do_propose
|= maybe_resize_cluster(pending
, p
.second
->fscid
);
2368 /* Replace any failed ranks. */
2369 for (auto& p
: pending
.filesystems
) {
2370 do_propose
|= maybe_promote_standby(pending
, *p
.second
);
2373 if (propose_osdmap
) {
2374 request_proposal(mon
.osdmon());
2381 last_tick
= mono_clock::now();
2384 MDSMonitor::MDSMonitor(Monitor
&mn
, Paxos
&p
, string service_name
)
2385 : PaxosService(mn
, p
, service_name
)
2387 handlers
= FileSystemCommandHandler::load(&p
);
2390 void MDSMonitor::on_restart()
2392 // Clear out the leader-specific state.
2393 last_tick
= mono_clock::now();
2394 last_beacon
.clear();