1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
16 #include <boost/utility.hpp>
17 #include <boost/regex.hpp>
19 #include "MDSMonitor.h"
20 #include "FSCommands.h"
22 #include "MonitorDBStore.h"
23 #include "OSDMonitor.h"
24 #include "PGMonitor.h"
26 #include "common/strtol.h"
27 #include "common/perf_counters.h"
28 #include "common/config.h"
29 #include "common/cmdparse.h"
30 #include "messages/MMDSMap.h"
31 #include "messages/MFSMap.h"
32 #include "messages/MFSMapUser.h"
33 #include "messages/MMDSLoadTargets.h"
34 #include "messages/MMonCommand.h"
35 #include "messages/MGenericMessage.h"
37 #include "include/assert.h"
38 #include "include/str_list.h"
39 #include "include/stringify.h"
40 #include "mds/mdstypes.h"
43 #define dout_subsys ceph_subsys_mon
45 #define dout_prefix _prefix(_dout, mon, get_fsmap())
46 static ostream
& _prefix(std::ostream
*_dout
, Monitor
*mon
, const FSMap
& fsmap
) {
47 return *_dout
<< "mon." << mon
->name
<< "@" << mon
->rank
48 << "(" << mon
->get_state_name()
49 << ").mds e" << fsmap
.get_epoch() << " ";
52 static const string
MDS_METADATA_PREFIX("mds_metadata");
53 static const string
MDS_HEALTH_PREFIX("mds_health");
57 * Specialized implementation of cmd_getval to allow us to parse
58 * out strongly-typedef'd types
60 template<> bool cmd_getval(CephContext
*cct
, const cmdmap_t
& cmdmap
,
61 const std::string
& k
, mds_gid_t
&val
)
63 return cmd_getval(cct
, cmdmap
, k
, (int64_t&)val
);
66 template<> bool cmd_getval(CephContext
*cct
, const cmdmap_t
& cmdmap
,
67 const std::string
& k
, mds_rank_t
&val
)
69 return cmd_getval(cct
, cmdmap
, k
, (int64_t&)val
);
72 template<> bool cmd_getval(CephContext
*cct
, const cmdmap_t
& cmdmap
,
73 const std::string
& k
, MDSMap::DaemonState
&val
)
75 return cmd_getval(cct
, cmdmap
, k
, (int64_t&)val
);
// Emits the "print_map" banner through dout at debug level `dbl`.
// NOTE(review): `m` is not referenced in the lines visible here; the rest of
// the body appears to be missing from this listing - confirm against the
// full file before relying on this description.
80 void MDSMonitor::print_map(const FSMap
&m
, int dbl
)
82 dout(dbl
) << "print_map\n";
88 void MDSMonitor::create_initial()
90 dout(10) << "create_initial" << dendl
;
93 void MDSMonitor::get_store_prefixes(std::set
<string
>& s
)
95 s
.insert(service_name
);
96 s
.insert(MDS_METADATA_PREFIX
);
97 s
.insert(MDS_HEALTH_PREFIX
);
100 void MDSMonitor::update_from_paxos(bool *need_bootstrap
)
102 version_t version
= get_last_committed();
103 if (version
== get_fsmap().epoch
)
106 dout(10) << __func__
<< " version " << version
107 << ", my e " << get_fsmap().epoch
<< dendl
;
108 assert(version
> get_fsmap().epoch
);
115 int err
= get_version(version
, fsmap_bl
);
118 assert(fsmap_bl
.length() > 0);
119 dout(10) << __func__
<< " got " << version
<< dendl
;
120 PaxosFSMap::decode(fsmap_bl
);
123 dout(0) << "new map" << dendl
;
124 print_map(get_fsmap(), 0);
125 if (!g_conf
->mon_mds_skip_sanity
) {
126 get_fsmap().sanity();
133 void MDSMonitor::init()
135 (void)load_metadata(pending_metadata
);
138 void MDSMonitor::create_pending()
140 auto &fsmap
= PaxosFSMap::create_pending();
142 if (mon
->osdmon()->is_readable()) {
143 const auto &osdmap
= mon
->osdmon()->osdmap
;
144 fsmap
.sanitize([&osdmap
](int64_t pool
){return osdmap
.have_pg_pool(pool
);});
147 dout(10) << "create_pending e" << fsmap
.epoch
<< dendl
;
150 void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
152 auto &pending
= get_pending_fsmap_writeable();
153 auto &epoch
= pending
.epoch
;
155 dout(10) << "encode_pending e" << epoch
<< dendl
;
157 // print map iff 'debug mon = 30' or higher
158 print_map(get_pending_fsmap(), 30);
159 if (!g_conf
->mon_mds_skip_sanity
) {
163 // Set 'modified' on maps modified this epoch
164 for (auto &p
: pending
.filesystems
) {
165 if (p
.second
->mds_map
.epoch
== epoch
) {
166 p
.second
->mds_map
.modified
= ceph_clock_now();
171 assert(get_last_committed() + 1 == pending
.epoch
);
172 bufferlist pending_bl
;
173 pending
.encode(pending_bl
, mon
->get_quorum_con_features());
175 /* put everything in the transaction */
176 put_version(t
, pending
.epoch
, pending_bl
);
177 put_last_committed(t
, pending
.epoch
);
179 // Encode MDSHealth data
180 for (std::map
<uint64_t, MDSHealth
>::iterator i
= pending_daemon_health
.begin();
181 i
!= pending_daemon_health
.end(); ++i
) {
183 i
->second
.encode(bl
);
184 t
->put(MDS_HEALTH_PREFIX
, stringify(i
->first
), bl
);
187 for (std::set
<uint64_t>::iterator i
= pending_daemon_health_rm
.begin();
188 i
!= pending_daemon_health_rm
.end(); ++i
) {
189 t
->erase(MDS_HEALTH_PREFIX
, stringify(*i
));
191 pending_daemon_health_rm
.clear();
192 remove_from_metadata(pending
, t
);
195 health_check_map_t new_checks
;
196 const auto &info_map
= pending
.get_mds_info();
197 for (const auto &i
: info_map
) {
198 const auto &gid
= i
.first
;
199 const auto &info
= i
.second
;
200 if (pending_daemon_health_rm
.count(gid
)) {
204 auto p
= pending_daemon_health
.find(gid
);
205 if (p
!= pending_daemon_health
.end()) {
209 mon
->store
->get(MDS_HEALTH_PREFIX
, stringify(gid
), bl
);
211 derr
<< "Missing health data for MDS " << gid
<< dendl
;
214 bufferlist::iterator bl_i
= bl
.begin();
217 for (const auto &metric
: health
.metrics
) {
218 const int rank
= info
.rank
;
219 health_check_t
*check
= &new_checks
.get_or_add(
220 mds_metric_name(metric
.type
),
222 mds_metric_summary(metric
.type
));
224 ss
<< "mds" << info
.name
<< "(mds." << rank
<< "): " << metric
.message
;
226 for (auto &p
: metric
.metadata
) {
232 ss
<< p
.first
<< ": " << p
.second
;
235 check
->detail
.push_back(ss
.str());
238 pending
.get_health_checks(&new_checks
);
239 for (auto& p
: new_checks
.checks
) {
240 p
.second
.summary
= boost::regex_replace(
242 boost::regex("%num%"),
243 stringify(p
.second
.detail
.size()));
244 p
.second
.summary
= boost::regex_replace(
246 boost::regex("%plurals%"),
247 p
.second
.detail
.size() > 1 ? "s" : "");
248 p
.second
.summary
= boost::regex_replace(
250 boost::regex("%isorare%"),
251 p
.second
.detail
.size() > 1 ? "are" : "is");
252 p
.second
.summary
= boost::regex_replace(
254 boost::regex("%hasorhave%"),
255 p
.second
.detail
.size() > 1 ? "have" : "has");
257 encode_health(new_checks
, t
);
// Works out the version up to which old maps may be trimmed.
//
// Visible logic:
//  - if g_conf->mon_mds_force_trim_to is positive and below the last
//    committed version, it is taken as the floor and logged at level 10;
//  - otherwise-independent check: when more than mon_max_mdsmap_epochs
//    versions are committed and the floor is below (last - max), a further
//    adjustment is made.
// NOTE(review): the declaration of `floor`, the body of the final `if` and
// the return statement are not present in this listing - confirm against the
// full file.
260 version_t
MDSMonitor::get_trim_to()
263 if (g_conf
->mon_mds_force_trim_to
> 0 &&
264 g_conf
->mon_mds_force_trim_to
< (int)get_last_committed()) {
265 floor
= g_conf
->mon_mds_force_trim_to
;
266 dout(10) << __func__
<< " explicit mon_mds_force_trim_to = "
270 unsigned max
= g_conf
->mon_max_mdsmap_epochs
;
271 version_t last
= get_last_committed();
273 if (last
- get_first_committed() > max
&& floor
< last
- max
)
// Publishes MDS counts to the cluster logger.
//
// Walks every filesystem in the current FSMap, accumulating the number of
// up, in and failed MDS daemons reported by each MDSMap, then stores the
// totals in l_cluster_num_mds_up / l_cluster_num_mds_in /
// l_cluster_num_mds_failed and the FSMap epoch in l_cluster_mds_epoch.
// NOTE(review): the declarations of `up`, `in` and `failed` are not present
// in this listing - confirm their types and initial values in the full file.
278 void MDSMonitor::update_logger()
280 dout(10) << "update_logger" << dendl
;
282 const auto &fsmap
= get_fsmap();
287 for (const auto &i
: fsmap
.filesystems
) {
288 const MDSMap
&mds_map
= i
.second
->mds_map
;
290 up
+= mds_map
.get_num_up_mds();
291 in
+= mds_map
.get_num_in_mds();
292 failed
+= mds_map
.get_num_failed_mds();
294 mon
->cluster_logger
->set(l_cluster_num_mds_up
, up
);
295 mon
->cluster_logger
->set(l_cluster_num_mds_in
, in
);
296 mon
->cluster_logger
->set(l_cluster_num_mds_failed
, failed
);
297 mon
->cluster_logger
->set(l_cluster_mds_epoch
, fsmap
.get_epoch());
// Preprocess-stage dispatcher for incoming requests.
//
// Marks the op with this function's name, logs the message and its origin at
// level 10, then switches on the message type and forwards to
// preprocess_beacon, preprocess_command (MSG_MON_COMMAND) or
// preprocess_offload_targets (MSG_MDS_OFFLOAD_TARGETS), returning that
// handler's result.
// NOTE(review): the case label in front of preprocess_beacon and any default
// branch are not present in this listing - confirm against the full file.
300 bool MDSMonitor::preprocess_query(MonOpRequestRef op
)
302 op
->mark_mdsmon_event(__func__
);
303 PaxosServiceMessage
*m
= static_cast<PaxosServiceMessage
*>(op
->get_req());
304 dout(10) << "preprocess_query " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
306 switch (m
->get_type()) {
309 return preprocess_beacon(op
);
311 case MSG_MON_COMMAND
:
312 return preprocess_command(op
);
314 case MSG_MDS_OFFLOAD_TARGETS
:
315 return preprocess_offload_targets(op
);
// Records that a beacon was just received from the sending daemon.
//
// Takes the sender's global id from the message, logs the beacon at level 5,
// and stamps the daemon's last_beacon entry with the current monotonic time.
// NOTE(review): `seq` is read from the message but its use is not present in
// this listing; presumably last_beacon[gid] creates the entry when absent
// (map-like operator[]) - confirm both against the full file.
323 void MDSMonitor::_note_beacon(MMDSBeacon
*m
)
325 mds_gid_t gid
= mds_gid_t(m
->get_global_id());
326 version_t seq
= m
->get_seq();
328 dout(5) << "_note_beacon " << *m
<< " noting time" << dendl
;
329 auto &beacon
= last_beacon
[gid
];
330 beacon
.stamp
= mono_clock::now();
334 bool MDSMonitor::preprocess_beacon(MonOpRequestRef op
)
336 op
->mark_mdsmon_event(__func__
);
337 MMDSBeacon
*m
= static_cast<MMDSBeacon
*>(op
->get_req());
338 MDSMap::DaemonState state
= m
->get_state();
339 mds_gid_t gid
= m
->get_global_id();
340 version_t seq
= m
->get_seq();
341 MDSMap::mds_info_t info
;
342 epoch_t effective_epoch
= 0;
344 const auto &fsmap
= get_fsmap();
346 // check privileges, ignore if fails
347 MonSession
*session
= m
->get_session();
349 if (!session
->is_capable("mds", MON_CAP_X
)) {
350 dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
351 << session
->caps
<< dendl
;
355 if (m
->get_fsid() != mon
->monmap
->fsid
) {
356 dout(0) << "preprocess_beacon on fsid " << m
->get_fsid() << " != " << mon
->monmap
->fsid
<< dendl
;
360 dout(5) << "preprocess_beacon " << *m
361 << " from " << m
->get_orig_source_inst()
362 << " " << m
->get_compat()
365 // make sure the address has a port
366 if (m
->get_orig_source_addr().get_port() == 0) {
367 dout(1) << " ignoring boot message without a port" << dendl
;
372 if (!m
->get_compat().writeable(fsmap
.compat
)) {
373 dout(1) << " mds " << m
->get_source_inst() << " can't write to fsmap " << fsmap
.compat
<< dendl
;
381 // booted, but not in map?
382 if (!fsmap
.gid_exists(gid
)) {
383 if (state
!= MDSMap::STATE_BOOT
) {
384 dout(7) << "mds_beacon " << *m
<< " is not in fsmap (state "
385 << ceph_mds_state_name(state
) << ")" << dendl
;
387 /* We can't send an MDSMap this MDS was a part of because we no longer
388 * know which FS it was part of. Nor does this matter. Sending an empty
389 * MDSMap is sufficient for getting the MDS to respawn.
392 null_map
.epoch
= fsmap
.epoch
;
393 null_map
.compat
= fsmap
.compat
;
394 mon
->send_reply(op
, new MMDSMap(mon
->monmap
->fsid
, &null_map
));
397 return false; // not booted yet.
400 dout(10) << __func__
<< ": GID exists in map: " << gid
<< dendl
;
401 info
= fsmap
.get_info_gid(gid
);
404 if (info
.state_seq
> seq
) {
405 dout(7) << "mds_beacon " << *m
<< " has old seq, ignoring" << dendl
;
409 // Work out the latest epoch that this daemon should have seen
411 fs_cluster_id_t fscid
= fsmap
.mds_roles
.at(gid
);
412 if (fscid
== FS_CLUSTER_ID_NONE
) {
413 effective_epoch
= fsmap
.standby_epochs
.at(gid
);
415 effective_epoch
= fsmap
.get_filesystem(fscid
)->mds_map
.epoch
;
417 if (effective_epoch
!= m
->get_last_epoch_seen()) {
418 dout(10) << "mds_beacon " << *m
419 << " ignoring requested state, because mds hasn't seen latest map" << dendl
;
426 return false; // no longer laggy, need to update map.
428 if (state
== MDSMap::STATE_BOOT
) {
429 // ignore, already booted.
432 // is there a state change here?
433 if (info
.state
!= state
) {
434 // legal state change?
435 if ((info
.state
== MDSMap::STATE_STANDBY
||
436 info
.state
== MDSMap::STATE_STANDBY_REPLAY
) && state
> 0) {
437 dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info
.state
)
438 << " -> " << ceph_mds_state_name(state
) << ")" << dendl
;
442 if ((state
== MDSMap::STATE_STANDBY
|| state
== MDSMap::STATE_STANDBY_REPLAY
)
443 && info
.rank
!= MDS_RANK_NONE
)
445 dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
446 "held rank " << info
.rank
<< " while requesting state "
447 << ceph_mds_state_name(state
) << dendl
;
455 // Compare the known daemon health with m->get_health()
456 // and return false (i.e. require proposal) if they
457 // do not match, so that our stored copy gets updated.
458 if (!(pending_daemon_health
[gid
] == m
->get_health())) {
459 dout(10) << __func__
<< " health metrics for gid " << gid
<< " were updated" << dendl
;
465 // note time and reply
466 assert(effective_epoch
> 0);
469 new MMDSBeacon(mon
->monmap
->fsid
, m
->get_global_id(), m
->get_name(),
470 effective_epoch
, state
, seq
,
471 CEPH_FEATURES_SUPPORTED_DEFAULT
));
475 // I won't reply this beacon, drop it.
480 bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op
)
482 op
->mark_mdsmon_event(__func__
);
483 MMDSLoadTargets
*m
= static_cast<MMDSLoadTargets
*>(op
->get_req());
484 dout(10) << "preprocess_offload_targets " << *m
<< " from " << m
->get_orig_source() << dendl
;
486 const auto &fsmap
= get_fsmap();
488 // check privileges, ignore message if fails
489 MonSession
*session
= m
->get_session();
492 if (!session
->is_capable("mds", MON_CAP_X
)) {
493 dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
494 << session
->caps
<< dendl
;
498 if (fsmap
.gid_exists(m
->global_id
) &&
499 m
->targets
== fsmap
.get_info_gid(m
->global_id
).export_targets
)
// Update-stage dispatcher for incoming requests.
//
// Marks the op with this function's name, logs the message at level 7, then
// switches on the message type and forwards to prepare_beacon,
// prepare_command (MSG_MON_COMMAND) or prepare_offload_targets
// (MSG_MDS_OFFLOAD_TARGETS), returning that handler's result.
// NOTE(review): the case label in front of prepare_beacon and any default
// branch are not present in this listing - confirm against the full file.
510 bool MDSMonitor::prepare_update(MonOpRequestRef op
)
512 op
->mark_mdsmon_event(__func__
);
513 PaxosServiceMessage
*m
= static_cast<PaxosServiceMessage
*>(op
->get_req());
514 dout(7) << "prepare_update " << *m
<< dendl
;
516 switch (m
->get_type()) {
519 return prepare_beacon(op
);
521 case MSG_MON_COMMAND
:
522 return prepare_command(op
);
524 case MSG_MDS_OFFLOAD_TARGETS
:
525 return prepare_offload_targets(op
);
534 bool MDSMonitor::prepare_beacon(MonOpRequestRef op
)
536 op
->mark_mdsmon_event(__func__
);
537 MMDSBeacon
*m
= static_cast<MMDSBeacon
*>(op
->get_req());
538 // -- this is an update --
539 dout(12) << "prepare_beacon " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
540 entity_addr_t addr
= m
->get_orig_source_inst().addr
;
541 mds_gid_t gid
= m
->get_global_id();
542 MDSMap::DaemonState state
= m
->get_state();
543 version_t seq
= m
->get_seq();
545 auto &pending
= get_pending_fsmap_writeable();
547 dout(15) << __func__
<< " got health from gid " << gid
<< " with " << m
->get_health().metrics
.size() << " metrics." << dendl
;
549 // Calculate deltas of health metrics created and removed
550 // Do this by type rather than MDSHealthMetric equality, because messages can
551 // change a lot when they include e.g. a number of items.
552 const auto &old_health
= pending_daemon_health
[gid
].metrics
;
553 const auto &new_health
= m
->get_health().metrics
;
555 std::set
<mds_metric_t
> old_types
;
556 for (const auto &i
: old_health
) {
557 old_types
.insert(i
.type
);
560 std::set
<mds_metric_t
> new_types
;
561 for (const auto &i
: new_health
) {
562 new_types
.insert(i
.type
);
565 for (const auto &new_metric
: new_health
) {
566 if (old_types
.count(new_metric
.type
) == 0) {
567 dout(10) << "MDS health message (" << m
->get_orig_source_inst().name
568 << "): " << new_metric
.sev
<< " " << new_metric
.message
<< dendl
;
572 // Log the disappearance of health messages at INFO
573 for (const auto &old_metric
: old_health
) {
574 if (new_types
.count(old_metric
.type
) == 0) {
575 mon
->clog
->info() << "MDS health message cleared ("
576 << m
->get_orig_source_inst().name
<< "): " << old_metric
.message
;
581 pending_daemon_health
[gid
] = m
->get_health();
584 if (state
== MDSMap::STATE_BOOT
) {
585 // zap previous instance of this name?
586 if (g_conf
->mds_enforce_unique_name
) {
587 bool failed_mds
= false;
588 while (mds_gid_t existing
= pending
.find_mds_gid_by_name(m
->get_name())) {
589 if (!mon
->osdmon()->is_writeable()) {
590 mon
->osdmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
593 const MDSMap::mds_info_t
&existing_info
=
594 pending
.get_info_gid(existing
);
595 mon
->clog
->info() << existing_info
.human_name() << " restarted";
596 fail_mds_gid(pending
, existing
);
600 assert(mon
->osdmon()->is_writeable());
601 request_proposal(mon
->osdmon());
605 // Add this daemon to the map
606 if (pending
.mds_roles
.count(gid
) == 0) {
607 MDSMap::mds_info_t new_info
;
608 new_info
.global_id
= gid
;
609 new_info
.name
= m
->get_name();
610 new_info
.addr
= addr
;
611 new_info
.mds_features
= m
->get_mds_features();
612 new_info
.state
= MDSMap::STATE_STANDBY
;
613 new_info
.state_seq
= seq
;
614 new_info
.standby_for_rank
= m
->get_standby_for_rank();
615 new_info
.standby_for_name
= m
->get_standby_for_name();
616 new_info
.standby_for_fscid
= m
->get_standby_for_fscid();
617 new_info
.standby_replay
= m
->get_standby_replay();
618 pending
.insert(new_info
);
621 // Resolve standby_for_name to a rank
622 const MDSMap::mds_info_t
&info
= pending
.get_info_gid(gid
);
623 if (!info
.standby_for_name
.empty()) {
624 const MDSMap::mds_info_t
*leaderinfo
= pending
.find_by_name(
625 info
.standby_for_name
);
626 if (leaderinfo
&& (leaderinfo
->rank
>= 0)) {
627 const auto &fscid
= pending
.mds_roles
.at(leaderinfo
->global_id
);
629 pending
.modify_daemon(gid
, [fscid
, leaderinfo
](
630 MDSMap::mds_info_t
*info
) {
631 info
->standby_for_rank
= leaderinfo
->rank
;
632 info
->standby_for_fscid
= fscid
;
637 // initialize the beacon timer
638 auto &beacon
= last_beacon
[gid
];
639 beacon
.stamp
= mono_clock::now();
643 if (!pending
.compat
.writeable(m
->get_compat())) {
644 dout(10) << " fsmap " << pending
.compat
645 << " can't write to new mds' " << m
->get_compat()
646 << ", updating fsmap and killing old mds's"
648 pending
.update_compat(m
->get_compat());
651 update_metadata(m
->get_global_id(), m
->get_sys_info());
655 if (!pending
.gid_exists(gid
)) {
656 /* gid has been removed from pending, send null map */
657 dout(5) << "mds_beacon " << *m
<< " is not in fsmap (state "
658 << ceph_mds_state_name(state
) << ")" << dendl
;
660 /* We can't send an MDSMap this MDS was a part of because we no longer
661 * know which FS it was part of. Nor does this matter. Sending an empty
662 * MDSMap is sufficient for getting the MDS to respawn.
664 wait_for_finished_proposal(op
, new FunctionContext([op
, this](int r
){
666 const auto& fsmap
= get_fsmap();
668 null_map
.epoch
= fsmap
.epoch
;
669 null_map
.compat
= fsmap
.compat
;
670 mon
->send_reply(op
, new MMDSMap(mon
->monmap
->fsid
, &null_map
));
672 dispatch(op
); // try again
678 const MDSMap::mds_info_t
&info
= pending
.get_info_gid(gid
);
679 // Old MDS daemons don't mention that they're standby replay until
680 // after they've sent their boot beacon, so update this field.
681 if (info
.standby_replay
!= m
->get_standby_replay()) {
682 pending
.modify_daemon(info
.global_id
, [&m
](
683 MDSMap::mds_info_t
*i
)
685 i
->standby_replay
= m
->get_standby_replay();
689 if (info
.state
== MDSMap::STATE_STOPPING
&& state
!= MDSMap::STATE_STOPPED
) {
690 // we can't transition to any other states from STOPPING
691 dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
698 dout(1) << "prepare_beacon clearing laggy flag on " << addr
<< dendl
;
699 pending
.modify_daemon(info
.global_id
, [](MDSMap::mds_info_t
*info
)
706 dout(5) << "prepare_beacon mds." << info
.rank
707 << " " << ceph_mds_state_name(info
.state
)
708 << " -> " << ceph_mds_state_name(state
)
709 << " standby_for_rank=" << m
->get_standby_for_rank()
711 if (state
== MDSMap::STATE_STOPPED
) {
712 const auto fscid
= pending
.mds_roles
.at(gid
);
713 const auto &fs
= pending
.get_filesystem(fscid
);
715 mon
->clog
->info() << info
.human_name() << " finished "
716 << "deactivating rank " << info
.rank
<< " in filesystem "
717 << fs
->mds_map
.fs_name
<< " (now has "
718 << fs
->mds_map
.get_num_in_mds() - 1 << " ranks)";
720 auto erased
= pending
.stop(gid
);
721 erased
.push_back(gid
);
723 for (const auto &erased_gid
: erased
) {
724 last_beacon
.erase(erased_gid
);
725 if (pending_daemon_health
.count(erased_gid
)) {
726 pending_daemon_health
.erase(erased_gid
);
727 pending_daemon_health_rm
.insert(erased_gid
);
732 } else if (state
== MDSMap::STATE_DAMAGED
) {
733 if (!mon
->osdmon()->is_writeable()) {
734 dout(1) << __func__
<< ": DAMAGED from rank " << info
.rank
735 << " waiting for osdmon writeable to blacklist it" << dendl
;
736 mon
->osdmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
740 // Record this MDS rank as damaged, so that other daemons
741 // won't try to run it.
742 dout(0) << __func__
<< ": marking rank "
743 << info
.rank
<< " damaged" << dendl
;
745 utime_t until
= ceph_clock_now();
746 until
+= g_conf
->get_val
<double>("mon_mds_blacklist_interval");
747 const auto blacklist_epoch
= mon
->osdmon()->blacklist(info
.addr
, until
);
748 request_proposal(mon
->osdmon());
749 pending
.damaged(gid
, blacklist_epoch
);
750 last_beacon
.erase(gid
);
752 // Respond to MDS, so that it knows it can continue to shut down
755 mon
->monmap
->fsid
, m
->get_global_id(),
756 m
->get_name(), pending
.get_epoch(), state
, seq
,
757 CEPH_FEATURES_SUPPORTED_DEFAULT
));
758 } else if (state
== MDSMap::STATE_DNE
) {
759 if (!mon
->osdmon()->is_writeable()) {
760 dout(1) << __func__
<< ": DNE from rank " << info
.rank
761 << " waiting for osdmon writeable to blacklist it" << dendl
;
762 mon
->osdmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
766 fail_mds_gid(pending
, gid
);
767 assert(mon
->osdmon()->is_writeable());
768 request_proposal(mon
->osdmon());
770 // Respond to MDS, so that it knows it can continue to shut down
773 mon
->monmap
->fsid
, m
->get_global_id(),
774 m
->get_name(), pending
.get_epoch(), state
, seq
,
775 CEPH_FEATURES_SUPPORTED_DEFAULT
));
776 } else if (info
.state
== MDSMap::STATE_STANDBY
&& state
!= info
.state
) {
777 // Standby daemons should never modify their own
778 // state. Reject any attempts to do so.
779 derr
<< "standby " << gid
<< " attempted to change state to "
780 << ceph_mds_state_name(state
) << ", rejecting" << dendl
;
782 } else if (info
.state
!= MDSMap::STATE_STANDBY
&& state
!= info
.state
&&
783 !MDSMap::state_transition_valid(info
.state
, state
)) {
784 // Validate state transitions for daemons that hold a rank
785 derr
<< "daemon " << gid
<< " (rank " << info
.rank
<< ") "
786 << "reported invalid state transition "
787 << ceph_mds_state_name(info
.state
) << " -> "
788 << ceph_mds_state_name(state
) << dendl
;
791 if (info
.state
!= MDSMap::STATE_ACTIVE
&& state
== MDSMap::STATE_ACTIVE
) {
792 const auto &fscid
= pending
.mds_roles
.at(gid
);
793 const auto &fs
= pending
.get_filesystem(fscid
);
794 mon
->clog
->info() << info
.human_name() << " is now active in "
795 << "filesystem " << fs
->mds_map
.fs_name
<< " as rank "
799 // Made it through special cases and validations, record the
800 // daemon's reported state to the FSMap.
801 pending
.modify_daemon(gid
, [state
, seq
](MDSMap::mds_info_t
*info
) {
803 info
->state_seq
= seq
;
808 dout(5) << "prepare_beacon pending map now:" << dendl
;
811 wait_for_finished_proposal(op
, new FunctionContext([op
, this](int r
){
813 _updated(op
); // success
814 else if (r
== -ECANCELED
) {
817 dispatch(op
); // try again
// Applies the export targets reported by an MDS to the pending FSMap.
//
// If the sender's gid currently holds a rank in the pending map, its export
// targets are replaced with the ones carried by the message; the other
// visible branch only logs that the gid is not in the map.
// NOTE(review): the `else` joining the two branches and the return
// statement(s) are not present in this listing - confirm the return value
// and any reply handling against the full file.
824 bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op
)
826 auto &pending
= get_pending_fsmap_writeable();
828 op
->mark_mdsmon_event(__func__
);
829 MMDSLoadTargets
*m
= static_cast<MMDSLoadTargets
*>(op
->get_req());
830 mds_gid_t gid
= m
->global_id
;
831 if (pending
.gid_has_rank(gid
)) {
832 dout(10) << "prepare_offload_targets " << gid
<< " " << m
->targets
<< dendl
;
833 pending
.update_export_targets(gid
, m
->targets
);
835 dout(10) << "prepare_offload_targets " << gid
<< " not in map" << dendl
;
841 bool MDSMonitor::should_propose(double& delay
)
843 // delegate to PaxosService to assess whether we should propose
844 return PaxosService::should_propose(delay
);
847 void MDSMonitor::_updated(MonOpRequestRef op
)
849 const auto &fsmap
= get_fsmap();
850 op
->mark_mdsmon_event(__func__
);
851 MMDSBeacon
*m
= static_cast<MMDSBeacon
*>(op
->get_req());
852 dout(10) << "_updated " << m
->get_orig_source() << " " << *m
<< dendl
;
853 mon
->clog
->debug() << m
->get_orig_source_inst() << " "
854 << ceph_mds_state_name(m
->get_state());
856 if (m
->get_state() == MDSMap::STATE_STOPPED
) {
857 // send the map manually (they're out of the map, so they won't get it automatically)
859 null_map
.epoch
= fsmap
.epoch
;
860 null_map
.compat
= fsmap
.compat
;
861 mon
->send_reply(op
, new MMDSMap(mon
->monmap
->fsid
, &null_map
));
863 mon
->send_reply(op
, new MMDSBeacon(mon
->monmap
->fsid
,
869 CEPH_FEATURES_SUPPORTED_DEFAULT
));
// Hook run when this service becomes active. The visible statement writes
// the current FSMap to the cluster log at debug level.
// NOTE(review): other statements of this function are not present in this
// listing - confirm against the full file.
873 void MDSMonitor::on_active()
879 mon
->clog
->debug() << "fsmap " << get_fsmap();
883 void MDSMonitor::get_health(list
<pair
<health_status_t
, string
> >& summary
,
884 list
<pair
<health_status_t
, string
> > *detail
,
885 CephContext
* cct
) const
887 const auto &fsmap
= get_fsmap();
889 fsmap
.get_health(summary
, detail
);
891 // For each MDS GID...
892 const auto &info_map
= fsmap
.get_mds_info();
893 for (const auto &i
: info_map
) {
894 const auto &gid
= i
.first
;
895 const auto &info
= i
.second
;
899 mon
->store
->get(MDS_HEALTH_PREFIX
, stringify(gid
), bl
);
901 derr
<< "Missing health data for MDS " << gid
<< dendl
;
905 bufferlist::iterator bl_i
= bl
.begin();
908 for (const auto &metric
: health
.metrics
) {
909 const int rank
= info
.rank
;
910 std::ostringstream message
;
911 message
<< "mds" << rank
<< ": " << metric
.message
;
912 summary
.push_back(std::make_pair(metric
.sev
, message
.str()));
915 // There is no way for us to clearly associate detail entries with summary entries (#7192), so
916 // we duplicate the summary message in the detail string and tag the metadata on.
917 std::ostringstream detail_message
;
918 detail_message
<< message
.str();
919 if (metric
.metadata
.size()) {
920 detail_message
<< "(";
921 auto k
= metric
.metadata
.begin();
922 while (k
!= metric
.metadata
.end()) {
923 detail_message
<< k
->first
<< ": " << k
->second
;
924 if (boost::next(k
) != metric
.metadata
.end()) {
925 detail_message
<< ", ";
929 detail_message
<< ")";
931 detail
->push_back(std::make_pair(metric
.sev
, detail_message
.str()));
// Dumps service state into formatter `f`.
//
// Opens an object section named "fsmap" and emits the first and last
// committed versions as "mdsmap_first_committed" and
// "mdsmap_last_committed".
// NOTE(review): the statements between opening the section and the two
// dump_unsigned calls (and the matching section close) are not present in
// this listing - confirm against the full file.
937 void MDSMonitor::dump_info(Formatter
*f
)
939 f
->open_object_section("fsmap");
943 f
->dump_unsigned("mdsmap_first_committed", get_first_committed());
944 f
->dump_unsigned("mdsmap_last_committed", get_last_committed());
947 bool MDSMonitor::preprocess_command(MonOpRequestRef op
)
949 op
->mark_mdsmon_event(__func__
);
950 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
955 map
<string
, cmd_vartype
> cmdmap
;
956 const auto &fsmap
= get_fsmap();
958 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
959 // ss has reason for failure
960 string rs
= ss
.str();
961 mon
->reply_command(op
, -EINVAL
, rs
, rdata
, get_last_committed());
966 cmd_getval(g_ceph_context
, cmdmap
, "prefix", prefix
);
968 cmd_getval(g_ceph_context
, cmdmap
, "format", format
, string("plain"));
969 std::unique_ptr
<Formatter
> f(Formatter::create(format
));
971 MonSession
*session
= m
->get_session();
973 mon
->reply_command(op
, -EACCES
, "access denied", rdata
, get_last_committed());
977 if (prefix
== "mds stat") {
979 f
->open_object_section("mds_stat");
987 } else if (prefix
== "mds dump") {
991 const FSMap
*fsmapp
= &get_fsmap();
993 if (cmd_getval(g_ceph_context
, cmdmap
, "epoch", epocharg
)) {
996 int err
= get_version(epoch
, b
);
997 if (err
== -ENOENT
) {
1009 const MDSMap
*mdsmapp
= nullptr;
1011 blank
.epoch
= fsmapp
->epoch
;
1012 if (fsmapp
->legacy_client_fscid
!= FS_CLUSTER_ID_NONE
) {
1013 mdsmapp
= &fsmapp
->filesystems
.at(fsmapp
->legacy_client_fscid
)->mds_map
;
1018 f
->open_object_section("mdsmap");
1019 mdsmapp
->dump(f
.get());
1029 ss
<< "dumped fsmap epoch " << fsmapp
->get_epoch();
1030 } else if (prefix
== "fs dump") {
1034 const FSMap
*fsmapp
= &fsmap
;
1036 if (cmd_getval(g_ceph_context
, cmdmap
, "epoch", epocharg
)) {
1039 int err
= get_version(epoch
, b
);
1040 if (err
== -ENOENT
) {
1053 f
->open_object_section("fsmap");
1054 fsmapp
->dump(f
.get());
1064 ss
<< "dumped fsmap epoch " << fsmapp
->get_epoch();
1065 } else if (prefix
== "mds metadata") {
1067 f
.reset(Formatter::create("json-pretty"));
1070 bool all
= !cmd_getval(g_ceph_context
, cmdmap
, "who", who
);
1071 dout(1) << "all = " << all
<< dendl
;
1074 // Dump all MDSs' metadata
1075 const auto all_info
= fsmap
.get_mds_info();
1077 f
->open_array_section("mds_metadata");
1078 for(const auto &i
: all_info
) {
1079 const auto &info
= i
.second
;
1081 f
->open_object_section("mds");
1082 f
->dump_string("name", info
.name
);
1083 std::ostringstream get_err
;
1084 r
= dump_metadata(fsmap
, info
.name
, f
.get(), get_err
);
1085 if (r
== -EINVAL
|| r
== -ENOENT
) {
1086 // Drop error, list what metadata we do have
1087 dout(1) << get_err
.str() << dendl
;
1089 } else if (r
!= 0) {
1090 derr
<< "Unexpected error reading metadata: " << cpp_strerror(r
)
1092 ss
<< get_err
.str();
1100 // Dump a single daemon's metadata
1101 f
->open_object_section("mds_metadata");
1102 r
= dump_metadata(fsmap
, who
, f
.get(), ss
);
1106 } else if (prefix
== "mds versions") {
1108 f
.reset(Formatter::create("json-pretty"));
1109 count_metadata("ceph_version", f
.get());
1112 } else if (prefix
== "mds count-metadata") {
1114 f
.reset(Formatter::create("json-pretty"));
1116 cmd_getval(g_ceph_context
, cmdmap
, "property", field
);
1117 count_metadata(field
, f
.get());
1120 } else if (prefix
== "mds getmap") {
1124 if (cmd_getval(g_ceph_context
, cmdmap
, "epoch", epocharg
)) {
1126 int err
= get_version(e
, b
);
1127 if (err
== -ENOENT
) {
1134 mm
.encode(rdata
, m
->get_connection()->get_features());
1135 ss
<< "got fsmap epoch " << mm
.get_epoch();
1139 fsmap
.encode(rdata
, m
->get_connection()->get_features());
1140 ss
<< "got fsmap epoch " << fsmap
.get_epoch();
1143 } else if (prefix
== "mds compat show") {
1145 f
->open_object_section("mds_compat");
1146 fsmap
.compat
.dump(f
.get());
1153 } else if (prefix
== "fs get") {
1155 cmd_getval(g_ceph_context
, cmdmap
, "fs_name", fs_name
);
1156 const auto &fs
= fsmap
.get_filesystem(fs_name
);
1157 if (fs
== nullptr) {
1158 ss
<< "filesystem '" << fs_name
<< "' not found";
1162 f
->open_object_section("filesystem");
1172 } else if (prefix
== "fs ls") {
1174 f
->open_array_section("filesystems");
1175 for (const auto &p
: fsmap
.filesystems
) {
1176 const auto &fs
= p
.second
;
1177 f
->open_object_section("filesystem");
1179 const MDSMap
&mds_map
= fs
->mds_map
;
1180 f
->dump_string("name", mds_map
.fs_name
);
1181 /* Output both the names and IDs of pools, for use by
1182 * humans and machines respectively */
1183 f
->dump_string("metadata_pool", mon
->osdmon()->osdmap
.get_pool_name(
1184 mds_map
.metadata_pool
));
1185 f
->dump_int("metadata_pool_id", mds_map
.metadata_pool
);
1186 f
->open_array_section("data_pool_ids");
1187 for (const auto &id
: mds_map
.data_pools
) {
1188 f
->dump_int("data_pool_id", id
);
1192 f
->open_array_section("data_pools");
1193 for (const auto &id
: mds_map
.data_pools
) {
1194 const auto &name
= mon
->osdmon()->osdmap
.get_pool_name(id
);
1195 f
->dump_string("data_pool", name
);
1204 for (const auto &p
: fsmap
.filesystems
) {
1205 const auto &fs
= p
.second
;
1206 const MDSMap
&mds_map
= fs
->mds_map
;
1207 const string
&md_pool_name
= mon
->osdmon()->osdmap
.get_pool_name(
1208 mds_map
.metadata_pool
);
1210 ds
<< "name: " << mds_map
.fs_name
<< ", metadata pool: "
1211 << md_pool_name
<< ", data pools: [";
1212 for (const auto &id
: mds_map
.data_pools
) {
1213 const string
&pool_name
= mon
->osdmon()->osdmap
.get_pool_name(id
);
1214 ds
<< pool_name
<< " ";
1216 ds
<< "]" << std::endl
;
1219 if (fsmap
.filesystems
.empty()) {
1220 ds
<< "No filesystems enabled" << std::endl
;
1231 mon
->reply_command(op
, r
, rs
, rdata
, get_last_committed());
1237 bool MDSMonitor::fail_mds_gid(FSMap
&fsmap
, mds_gid_t gid
)
1239 const MDSMap::mds_info_t
&info
= fsmap
.get_info_gid(gid
);
1240 dout(1) << "fail_mds_gid " << gid
<< " mds." << info
.name
<< " role " << info
.rank
<< dendl
;
1242 epoch_t blacklist_epoch
= 0;
1243 if (info
.rank
>= 0 && info
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
1244 utime_t until
= ceph_clock_now();
1245 until
+= g_conf
->get_val
<double>("mon_mds_blacklist_interval");
1246 blacklist_epoch
= mon
->osdmon()->blacklist(info
.addr
, until
);
1249 fsmap
.erase(gid
, blacklist_epoch
);
1250 last_beacon
.erase(gid
);
1251 if (pending_daemon_health
.count(gid
)) {
1252 pending_daemon_health
.erase(gid
);
1253 pending_daemon_health_rm
.insert(gid
);
1256 return blacklist_epoch
!= 0;
1259 mds_gid_t
MDSMonitor::gid_from_arg(const FSMap
&fsmap
, const std::string
&arg
, std::ostream
&ss
)
1261 // Try parsing as a role
1263 std::ostringstream ignore_err
; // Don't spam 'ss' with parse_role errors
1264 int r
= fsmap
.parse_role(arg
, &role
, ignore_err
);
1266 // See if a GID is assigned to this role
1267 const auto &fs
= fsmap
.get_filesystem(role
.fscid
);
1268 assert(fs
!= nullptr); // parse_role ensures it exists
1269 if (fs
->mds_map
.is_up(role
.rank
)) {
1270 dout(10) << __func__
<< ": validated rank/GID " << role
1271 << " as a rank" << dendl
;
1272 return fs
->mds_map
.get_mds_info(role
.rank
).global_id
;
1276 // Try parsing as a gid
1278 unsigned long long maybe_gid
= strict_strtoll(arg
.c_str(), 10, &err
);
1280 // Not a role or a GID, try as a daemon name
1281 const MDSMap::mds_info_t
*mds_info
= fsmap
.find_by_name(arg
);
1283 ss
<< "MDS named '" << arg
1284 << "' does not exist, or is not up";
1285 return MDS_GID_NONE
;
1287 dout(10) << __func__
<< ": resolved MDS name '" << arg
1288 << "' to GID " << mds_info
->global_id
<< dendl
;
1289 return mds_info
->global_id
;
1291 // Not a role, but parses as a an integer, might be a GID
1292 dout(10) << __func__
<< ": treating MDS reference '" << arg
1293 << "' as an integer " << maybe_gid
<< dendl
;
1295 if (fsmap
.gid_exists(mds_gid_t(maybe_gid
))) {
1296 return mds_gid_t(maybe_gid
);
1300 dout(1) << __func__
<< ": rank/GID " << arg
1301 << " not a existent rank or GID" << dendl
;
1302 return MDS_GID_NONE
;
1305 int MDSMonitor::fail_mds(FSMap
&fsmap
, std::ostream
&ss
,
1306 const std::string
&arg
, MDSMap::mds_info_t
*failed_info
)
1308 assert(failed_info
!= nullptr);
1310 mds_gid_t gid
= gid_from_arg(fsmap
, arg
, ss
);
1311 if (gid
== MDS_GID_NONE
) {
1314 if (!mon
->osdmon()->is_writeable()) {
1318 // Take a copy of the info before removing the MDS from the map,
1319 // so that the caller knows which mds (if any) they ended up removing.
1320 *failed_info
= fsmap
.get_info_gid(gid
);
1322 fail_mds_gid(fsmap
, gid
);
1323 ss
<< "failed mds gid " << gid
;
1324 assert(mon
->osdmon()->is_writeable());
1325 request_proposal(mon
->osdmon());
1329 bool MDSMonitor::prepare_command(MonOpRequestRef op
)
1331 op
->mark_mdsmon_event(__func__
);
1332 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
1337 map
<string
, cmd_vartype
> cmdmap
;
1338 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
1339 string rs
= ss
.str();
1340 mon
->reply_command(op
, -EINVAL
, rs
, rdata
, get_last_committed());
1345 cmd_getval(g_ceph_context
, cmdmap
, "prefix", prefix
);
1347 /* Refuse access if message not associated with a valid session */
1348 MonSession
*session
= m
->get_session();
1350 mon
->reply_command(op
, -EACCES
, "access denied", rdata
, get_last_committed());
1354 auto &pending
= get_pending_fsmap_writeable();
1356 bool batched_propose
= false;
1357 for (const auto &h
: handlers
) {
1358 if (h
->can_handle(prefix
)) {
1359 batched_propose
= h
->batched_propose();
1360 if (batched_propose
) {
1363 r
= h
->handle(mon
, pending
, op
, cmdmap
, ss
);
1364 if (batched_propose
) {
1369 // message has been enqueued for retry; return.
1370 dout(4) << __func__
<< " enqueue for retry by prepare_command" << dendl
;
1374 // On successful updates, print the updated map
1377 // Successful or not, we're done: respond.
1383 r
= filesystem_command(pending
, op
, prefix
, cmdmap
, ss
);
1386 } else if (r
== -EAGAIN
) {
1387 // Do not reply, the message has been enqueued for retry
1388 dout(4) << __func__
<< " enqueue for retry by filesystem_command" << dendl
;
1390 } else if (r
!= -ENOSYS
) {
1394 // Only handle legacy commands if there is a filesystem configured
1395 if (pending
.legacy_client_fscid
== FS_CLUSTER_ID_NONE
) {
1396 if (pending
.filesystems
.size() == 0) {
1397 ss
<< "No filesystem configured: use `ceph fs new` to create a filesystem";
1399 ss
<< "No filesystem set for use with legacy commands";
1405 r
= legacy_filesystem_command(pending
, op
, prefix
, cmdmap
, ss
);
1407 if (r
== -ENOSYS
&& ss
.str().empty()) {
1408 ss
<< "unrecognized command";
1412 dout(4) << __func__
<< " done, r=" << r
<< dendl
;
1413 /* Compose response */
1418 // success.. delay reply
1419 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, r
, rs
,
1420 get_last_committed() + 1));
1421 if (batched_propose
) {
1422 force_immediate_propose();
1426 // reply immediately
1427 mon
->reply_command(op
, r
, rs
, rdata
, get_last_committed());
1432 int MDSMonitor::filesystem_command(
1435 std::string
const &prefix
,
1436 map
<string
, cmd_vartype
> &cmdmap
,
1437 std::stringstream
&ss
)
1439 dout(4) << __func__
<< " prefix='" << prefix
<< "'" << dendl
;
1440 op
->mark_mdsmon_event(__func__
);
1443 cmd_getval(g_ceph_context
, cmdmap
, "who", whostr
);
1445 if (prefix
== "mds stop" ||
1446 prefix
== "mds deactivate") {
1448 r
= fsmap
.parse_role(whostr
, &role
, ss
);
1452 const auto &fs
= fsmap
.get_filesystem(role
.fscid
);
1454 if (!fs
->mds_map
.is_active(role
.rank
)) {
1456 ss
<< "mds." << role
<< " not active ("
1457 << ceph_mds_state_name(fs
->mds_map
.get_state(role
.rank
)) << ")";
1458 } else if (fs
->mds_map
.get_root() == role
.rank
||
1459 fs
->mds_map
.get_tableserver() == role
.rank
) {
1461 ss
<< "can't tell the root (" << fs
->mds_map
.get_root()
1462 << ") or tableserver (" << fs
->mds_map
.get_tableserver()
1463 << ") to deactivate";
1464 } else if (role
.rank
!= fs
->mds_map
.get_last_in_mds()) {
1466 ss
<< "mds." << role
<< " doesn't have the max rank ("
1467 << fs
->mds_map
.get_last_in_mds() << ")";
1468 } else if (fs
->mds_map
.get_num_in_mds() <= size_t(fs
->mds_map
.get_max_mds())) {
1470 ss
<< "must decrease max_mds or else MDS will immediately reactivate";
1473 mds_gid_t gid
= fs
->mds_map
.up
.at(role
.rank
);
1474 ss
<< "telling mds." << role
<< " "
1475 << fsmap
.get_info_gid(gid
).addr
<< " to deactivate";
1477 fsmap
.modify_daemon(gid
, [](MDSMap::mds_info_t
*info
) {
1478 info
->state
= MDSMap::STATE_STOPPING
;
1481 } else if (prefix
== "mds set_state") {
1483 if (!cmd_getval(g_ceph_context
, cmdmap
, "gid", gid
)) {
1484 ss
<< "error parsing 'gid' value '"
1485 << cmd_vartype_stringify(cmdmap
["gid"]) << "'";
1488 MDSMap::DaemonState state
;
1489 if (!cmd_getval(g_ceph_context
, cmdmap
, "state", state
)) {
1490 ss
<< "error parsing 'state' string value '"
1491 << cmd_vartype_stringify(cmdmap
["state"]) << "'";
1494 if (fsmap
.gid_exists(gid
)) {
1495 fsmap
.modify_daemon(gid
, [state
](MDSMap::mds_info_t
*info
) {
1496 info
->state
= state
;
1498 ss
<< "set mds gid " << gid
<< " to state " << state
<< " "
1499 << ceph_mds_state_name(state
);
1502 } else if (prefix
== "mds fail") {
1504 cmd_getval(g_ceph_context
, cmdmap
, "who", who
);
1506 MDSMap::mds_info_t failed_info
;
1507 r
= fail_mds(fsmap
, ss
, who
, &failed_info
);
1508 if (r
< 0 && r
== -EAGAIN
) {
1509 mon
->osdmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
1510 return -EAGAIN
; // don't propose yet; wait for message to be retried
1511 } else if (r
== 0) {
1512 // Only log if we really did something (not when was already gone)
1513 if (failed_info
.global_id
!= MDS_GID_NONE
) {
1514 mon
->clog
->info() << failed_info
.human_name() << " marked failed by "
1515 << op
->get_session()->entity_name
;
1518 } else if (prefix
== "mds rm") {
1520 if (!cmd_getval(g_ceph_context
, cmdmap
, "gid", gid
)) {
1521 ss
<< "error parsing 'gid' value '"
1522 << cmd_vartype_stringify(cmdmap
["gid"]) << "'";
1525 if (!fsmap
.gid_exists(gid
)) {
1526 ss
<< "mds gid " << gid
<< " dne";
1529 const auto &info
= fsmap
.get_info_gid(gid
);
1530 MDSMap::DaemonState state
= info
.state
;
1532 ss
<< "cannot remove active mds." << info
.name
1533 << " rank " << info
.rank
;
1536 fsmap
.erase(gid
, {});
1537 ss
<< "removed mds gid " << gid
;
1541 } else if (prefix
== "mds rmfailed") {
1543 if (!cmd_getval(g_ceph_context
, cmdmap
, "confirm", confirm
) ||
1544 confirm
!= "--yes-i-really-mean-it") {
1545 ss
<< "WARNING: this can make your filesystem inaccessible! "
1546 "Add --yes-i-really-mean-it if you are sure you wish to continue.";
1550 std::string role_str
;
1551 cmd_getval(g_ceph_context
, cmdmap
, "who", role_str
);
1553 int r
= fsmap
.parse_role(role_str
, &role
, ss
);
1555 ss
<< "invalid role '" << role_str
<< "'";
1559 fsmap
.modify_filesystem(
1561 [role
](std::shared_ptr
<Filesystem
> fs
)
1563 fs
->mds_map
.failed
.erase(role
.rank
);
1566 ss
<< "removed failed mds." << role
;
1568 } else if (prefix
== "mds compat rm_compat") {
1570 if (!cmd_getval(g_ceph_context
, cmdmap
, "feature", f
)) {
1571 ss
<< "error parsing feature value '"
1572 << cmd_vartype_stringify(cmdmap
["feature"]) << "'";
1575 if (fsmap
.compat
.compat
.contains(f
)) {
1576 ss
<< "removing compat feature " << f
;
1577 CompatSet modified
= fsmap
.compat
;
1578 modified
.compat
.remove(f
);
1579 fsmap
.update_compat(modified
);
1581 ss
<< "compat feature " << f
<< " not present in " << fsmap
.compat
;
1584 } else if (prefix
== "mds compat rm_incompat") {
1586 if (!cmd_getval(g_ceph_context
, cmdmap
, "feature", f
)) {
1587 ss
<< "error parsing feature value '"
1588 << cmd_vartype_stringify(cmdmap
["feature"]) << "'";
1591 if (fsmap
.compat
.incompat
.contains(f
)) {
1592 ss
<< "removing incompat feature " << f
;
1593 CompatSet modified
= fsmap
.compat
;
1594 modified
.incompat
.remove(f
);
1595 fsmap
.update_compat(modified
);
1597 ss
<< "incompat feature " << f
<< " not present in " << fsmap
.compat
;
1600 } else if (prefix
== "mds repaired") {
1601 std::string role_str
;
1602 cmd_getval(g_ceph_context
, cmdmap
, "rank", role_str
);
1604 r
= fsmap
.parse_role(role_str
, &role
, ss
);
1609 bool modified
= fsmap
.undamaged(role
.fscid
, role
.rank
);
1611 dout(1) << "repaired: restoring rank " << role
<< dendl
;
1613 dout(1) << "repaired: no-op on rank " << role
<< dendl
;
1625 * Helper to legacy_filesystem_command
1627 void MDSMonitor::modify_legacy_filesystem(FSMap
&fsmap
,
1628 std::function
<void(std::shared_ptr
<Filesystem
> )> fn
)
1630 fsmap
.modify_filesystem(
1631 fsmap
.legacy_client_fscid
,
1639 * Handle a command that affects the filesystem (i.e. a filesystem
1640 * must exist for the command to act upon).
1642 * @retval 0 Command was successfully handled and has side effects
1643 * @retval -EAGAIN Messages has been requeued for retry
1644 * @retval -ENOSYS Unknown command
1645 * @retval < 0 An error has occurred; **ss** may have been set.
1647 int MDSMonitor::legacy_filesystem_command(
1650 std::string
const &prefix
,
1651 map
<string
, cmd_vartype
> &cmdmap
,
1652 std::stringstream
&ss
)
1654 dout(4) << __func__
<< " prefix='" << prefix
<< "'" << dendl
;
1655 op
->mark_mdsmon_event(__func__
);
1658 cmd_getval(g_ceph_context
, cmdmap
, "who", whostr
);
1660 assert (fsmap
.legacy_client_fscid
!= FS_CLUSTER_ID_NONE
);
1662 if (prefix
== "mds set_max_mds") {
1663 // NOTE: deprecated by "fs set max_mds"
1665 if (!cmd_getval(g_ceph_context
, cmdmap
, "maxmds", maxmds
) || maxmds
<= 0) {
1669 const MDSMap
& mdsmap
=
1670 fsmap
.filesystems
.at(fsmap
.legacy_client_fscid
)->mds_map
;
1672 if (!mdsmap
.allows_multimds() &&
1673 maxmds
> mdsmap
.get_max_mds() &&
1675 ss
<< "multi-MDS clusters are not enabled; set 'allow_multimds' to enable";
1679 if (maxmds
> MAX_MDS
) {
1680 ss
<< "may not have more than " << MAX_MDS
<< " MDS ranks";
1684 modify_legacy_filesystem(fsmap
,
1685 [maxmds
](std::shared_ptr
<Filesystem
> fs
)
1687 fs
->mds_map
.set_max_mds(maxmds
);
1691 ss
<< "max_mds = " << maxmds
;
1692 } else if (prefix
== "mds cluster_down") {
1693 // NOTE: deprecated by "fs set cluster_down"
1694 modify_legacy_filesystem(fsmap
,
1695 [](std::shared_ptr
<Filesystem
> fs
)
1697 fs
->mds_map
.set_flag(CEPH_MDSMAP_DOWN
);
1699 ss
<< "marked fsmap DOWN";
1701 } else if (prefix
== "mds cluster_up") {
1702 // NOTE: deprecated by "fs set cluster_up"
1703 modify_legacy_filesystem(fsmap
,
1704 [](std::shared_ptr
<Filesystem
> fs
)
1706 fs
->mds_map
.clear_flag(CEPH_MDSMAP_DOWN
);
1708 ss
<< "unmarked fsmap DOWN";
1718 void MDSMonitor::check_subs()
1720 std::list
<std::string
> types
;
1722 // Subscriptions may be to "mdsmap" (MDS and legacy clients),
1723 // "mdsmap.<namespace>", or to "fsmap" for the full state of all
1724 // filesystems. Build a list of all the types we service
1725 // subscriptions for.
1726 types
.push_back("fsmap");
1727 types
.push_back("fsmap.user");
1728 types
.push_back("mdsmap");
1729 for (const auto &p
: get_fsmap().filesystems
) {
1730 const auto &fscid
= p
.first
;
1731 std::ostringstream oss
;
1732 oss
<< "mdsmap." << fscid
;
1733 types
.push_back(oss
.str());
1736 for (const auto &type
: types
) {
1737 if (mon
->session_map
.subs
.count(type
) == 0)
1739 xlist
<Subscription
*>::iterator p
= mon
->session_map
.subs
[type
]->begin();
1741 Subscription
*sub
= *p
;
1749 void MDSMonitor::check_sub(Subscription
*sub
)
1751 dout(20) << __func__
<< ": " << sub
->type
<< dendl
;
1753 const auto &fsmap
= get_fsmap();
1755 if (sub
->type
== "fsmap") {
1756 if (sub
->next
<= fsmap
.get_epoch()) {
1757 sub
->session
->con
->send_message(new MFSMap(mon
->monmap
->fsid
, fsmap
));
1759 mon
->session_map
.remove_sub(sub
);
1761 sub
->next
= fsmap
.get_epoch() + 1;
1764 } else if (sub
->type
== "fsmap.user") {
1765 if (sub
->next
<= fsmap
.get_epoch()) {
1767 fsmap_u
.epoch
= fsmap
.get_epoch();
1768 fsmap_u
.legacy_client_fscid
= fsmap
.legacy_client_fscid
;
1769 for (const auto &p
: fsmap
.filesystems
) {
1770 FSMapUser::fs_info_t
& fs_info
= fsmap_u
.filesystems
[p
.second
->fscid
];
1771 fs_info
.cid
= p
.second
->fscid
;
1772 fs_info
.name
= p
.second
->mds_map
.fs_name
;
1774 sub
->session
->con
->send_message(new MFSMapUser(mon
->monmap
->fsid
, fsmap_u
));
1776 mon
->session_map
.remove_sub(sub
);
1778 sub
->next
= fsmap
.get_epoch() + 1;
1781 } else if (sub
->type
.compare(0, 6, "mdsmap") == 0) {
1782 if (sub
->next
> fsmap
.get_epoch()) {
1786 const bool is_mds
= sub
->session
->inst
.name
.is_mds();
1787 mds_gid_t mds_gid
= MDS_GID_NONE
;
1788 fs_cluster_id_t fscid
= FS_CLUSTER_ID_NONE
;
1790 // What (if any) namespace are you assigned to?
1791 auto mds_info
= fsmap
.get_mds_info();
1792 for (const auto &p
: mds_info
) {
1793 if (p
.second
.addr
== sub
->session
->inst
.addr
) {
1795 fscid
= fsmap
.mds_roles
.at(mds_gid
);
1799 // You're a client. Did you request a particular
1801 if (sub
->type
.find("mdsmap.") == 0) {
1802 auto namespace_id_str
= sub
->type
.substr(std::string("mdsmap.").size());
1803 dout(10) << __func__
<< ": namespace_id " << namespace_id_str
<< dendl
;
1805 fscid
= strict_strtoll(namespace_id_str
.c_str(), 10, &err
);
1807 // Client asked for a non-existent namespace, send them nothing
1808 dout(1) << "Invalid client subscription '" << sub
->type
1812 if (fsmap
.filesystems
.count(fscid
) == 0) {
1813 // Client asked for a non-existent namespace, send them nothing
1814 // TODO: something more graceful for when a client has a filesystem
1815 // mounted, and the fileysstem is deleted. Add a "shut down you fool"
1817 dout(1) << "Client subscribed to non-existent namespace '" <<
1818 fscid
<< "'" << dendl
;
1822 // Unqualified request for "mdsmap": give it the one marked
1823 // for use by legacy clients.
1824 if (fsmap
.legacy_client_fscid
!= FS_CLUSTER_ID_NONE
) {
1825 fscid
= fsmap
.legacy_client_fscid
;
1827 dout(1) << "Client subscribed for legacy filesystem but "
1828 "none is configured" << dendl
;
1833 dout(10) << __func__
<< ": is_mds=" << is_mds
<< ", fscid= " << fscid
<< dendl
;
1835 // Work out the effective latest epoch
1836 const MDSMap
*mds_map
= nullptr;
1838 null_map
.compat
= fsmap
.compat
;
1839 if (fscid
== FS_CLUSTER_ID_NONE
) {
1840 // For a client, we should have already dropped out
1843 auto it
= fsmap
.standby_daemons
.find(mds_gid
);
1844 if (it
!= fsmap
.standby_daemons
.end()) {
1845 // For an MDS, we need to feed it an MDSMap with its own state in
1846 null_map
.mds_info
[mds_gid
] = it
->second
;
1847 null_map
.epoch
= fsmap
.standby_epochs
.at(mds_gid
);
1849 null_map
.epoch
= fsmap
.epoch
;
1851 mds_map
= &null_map
;
1853 // Check the effective epoch
1854 mds_map
= &fsmap
.get_filesystem(fscid
)->mds_map
;
1857 assert(mds_map
!= nullptr);
1858 dout(10) << __func__
<< " selected MDS map epoch " <<
1859 mds_map
->epoch
<< " for namespace " << fscid
<< " for subscriber "
1860 << sub
->session
->inst
.name
<< " who wants epoch " << sub
->next
<< dendl
;
1862 if (sub
->next
> mds_map
->epoch
) {
1865 auto msg
= new MMDSMap(mon
->monmap
->fsid
, mds_map
);
1867 sub
->session
->con
->send_message(msg
);
1869 mon
->session_map
.remove_sub(sub
);
1871 sub
->next
= mds_map
->get_epoch() + 1;
1877 void MDSMonitor::update_metadata(mds_gid_t gid
,
1878 const map
<string
, string
>& metadata
)
1880 if (metadata
.empty()) {
1883 pending_metadata
[gid
] = metadata
;
1885 MonitorDBStore::TransactionRef t
= paxos
->get_pending_transaction();
1887 ::encode(pending_metadata
, bl
);
1888 t
->put(MDS_METADATA_PREFIX
, "last_metadata", bl
);
1889 paxos
->trigger_propose();
1892 void MDSMonitor::remove_from_metadata(const FSMap
&fsmap
, MonitorDBStore::TransactionRef t
)
1894 bool update
= false;
1895 for (auto it
= pending_metadata
.begin(); it
!= pending_metadata
.end(); ) {
1896 if (!fsmap
.gid_exists(it
->first
)) {
1897 it
= pending_metadata
.erase(it
);
1906 ::encode(pending_metadata
, bl
);
1907 t
->put(MDS_METADATA_PREFIX
, "last_metadata", bl
);
1910 int MDSMonitor::load_metadata(map
<mds_gid_t
, Metadata
>& m
)
1913 int r
= mon
->store
->get(MDS_METADATA_PREFIX
, "last_metadata", bl
);
1915 dout(1) << "Unable to load 'last_metadata'" << dendl
;
1919 bufferlist::iterator it
= bl
.begin();
1924 void MDSMonitor::count_metadata(const std::string
&field
, map
<string
,int> *out
)
1926 map
<mds_gid_t
,Metadata
> meta
;
1927 load_metadata(meta
);
1928 for (auto& p
: meta
) {
1929 auto q
= p
.second
.find(field
);
1930 if (q
== p
.second
.end()) {
1931 (*out
)["unknown"]++;
1933 (*out
)[q
->second
]++;
1938 void MDSMonitor::count_metadata(const std::string
&field
, Formatter
*f
)
1940 map
<string
,int> by_val
;
1941 count_metadata(field
, &by_val
);
1942 f
->open_object_section(field
.c_str());
1943 for (auto& p
: by_val
) {
1944 f
->dump_int(p
.first
.c_str(), p
.second
);
1949 int MDSMonitor::dump_metadata(const FSMap
& fsmap
, const std::string
&who
,
1950 Formatter
*f
, ostream
& err
)
1954 mds_gid_t gid
= gid_from_arg(fsmap
, who
, err
);
1955 if (gid
== MDS_GID_NONE
) {
1959 map
<mds_gid_t
, Metadata
> metadata
;
1960 if (int r
= load_metadata(metadata
)) {
1961 err
<< "Unable to load 'last_metadata'";
1965 if (!metadata
.count(gid
)) {
1968 const Metadata
& m
= metadata
[gid
];
1969 for (Metadata::const_iterator p
= m
.begin(); p
!= m
.end(); ++p
) {
1970 f
->dump_string(p
->first
.c_str(), p
->second
);
1975 int MDSMonitor::print_nodes(Formatter
*f
)
1979 const auto &fsmap
= get_fsmap();
1981 map
<mds_gid_t
, Metadata
> metadata
;
1982 if (int r
= load_metadata(metadata
)) {
1986 map
<string
, list
<int> > mdses
; // hostname => rank
1987 for (const auto &p
: metadata
) {
1988 const mds_gid_t
& gid
= p
.first
;
1989 const Metadata
& m
= p
.second
;
1990 Metadata::const_iterator hostname
= m
.find("hostname");
1991 if (hostname
== m
.end()) {
1992 // not likely though
1995 if (!fsmap
.gid_exists(gid
)) {
1996 dout(5) << __func__
<< ": GID " << gid
<< " not existent" << dendl
;
1999 const MDSMap::mds_info_t
& mds_info
= fsmap
.get_info_gid(gid
);
2000 // FIXME: include filesystem name with rank here
2001 mdses
[hostname
->second
].push_back(mds_info
.rank
);
2004 dump_services(f
, mdses
, "mds");
2009 * If a cluster is undersized (with respect to max_mds), then
2010 * attempt to find daemons to grow it.
2012 bool MDSMonitor::maybe_expand_cluster(FSMap
&fsmap
, fs_cluster_id_t fscid
)
2014 auto fs
= fsmap
.get_filesystem(fscid
);
2015 auto &mds_map
= fs
->mds_map
;
2017 if (fs
->mds_map
.test_flag(CEPH_MDSMAP_DOWN
)) {
2021 int in
= mds_map
.get_num_in_mds();
2022 int max
= mds_map
.get_max_mds();
2024 dout(20) << __func__
<< " in " << in
<< " max " << max
<< dendl
;
2027 mds_rank_t mds
= mds_rank_t(0);
2029 while (mds_map
.is_in(mds
)) {
2032 mds_gid_t newgid
= fsmap
.find_replacement_for({fscid
, mds
},
2033 name
, g_conf
->mon_force_standby_active
);
2034 if (newgid
== MDS_GID_NONE
) {
2038 const auto &new_info
= fsmap
.get_info_gid(newgid
);
2039 dout(1) << "assigned standby " << new_info
.addr
2040 << " as mds." << mds
<< dendl
;
2042 mon
->clog
->info() << new_info
.human_name() << " assigned to "
2043 "filesystem " << mds_map
.fs_name
<< " as rank "
2044 << mds
<< " (now has " << mds_map
.get_num_in_mds() + 1
2046 fsmap
.promote(newgid
, fs
, mds
);
2055 * If a daemon is laggy, and a suitable replacement
2056 * is available, fail this daemon (remove from map) and pass its
2057 * role to another daemon.
2059 void MDSMonitor::maybe_replace_gid(FSMap
&fsmap
, mds_gid_t gid
,
2060 const MDSMap::mds_info_t
& info
, bool *mds_propose
, bool *osd_propose
)
2062 assert(mds_propose
!= nullptr);
2063 assert(osd_propose
!= nullptr);
2065 const auto fscid
= fsmap
.mds_roles
.at(gid
);
2067 // We will only take decisive action (replacing/removing a daemon)
2068 // if we have some indicating that some other daemon(s) are successfully
2069 // getting beacons through recently.
2070 mono_time latest_beacon
= mono_clock::zero();
2071 for (const auto &p
: last_beacon
) {
2072 latest_beacon
= std::max(p
.second
.stamp
, latest_beacon
);
2074 mono_time now
= mono_clock::now();
2075 chrono::duration
<double> since
= now
-latest_beacon
;
2076 const bool may_replace
= since
.count() <
2077 std::max(g_conf
->mds_beacon_interval
, g_conf
->mds_beacon_grace
* 0.5);
2080 // and is there a non-laggy standby that can take over for us?
2082 if (info
.rank
>= 0 &&
2083 info
.state
!= MDSMap::STATE_STANDBY
&&
2084 info
.state
!= MDSMap::STATE_STANDBY_REPLAY
&&
2086 !fsmap
.get_filesystem(fscid
)->mds_map
.test_flag(CEPH_MDSMAP_DOWN
) &&
2087 (sgid
= fsmap
.find_replacement_for({fscid
, info
.rank
}, info
.name
,
2088 g_conf
->mon_force_standby_active
)) != MDS_GID_NONE
)
2091 MDSMap::mds_info_t si
= fsmap
.get_info_gid(sgid
);
2092 dout(1) << " replacing " << gid
<< " " << info
.addr
<< " mds."
2093 << info
.rank
<< "." << info
.inc
2094 << " " << ceph_mds_state_name(info
.state
)
2095 << " with " << sgid
<< "/" << si
.name
<< " " << si
.addr
<< dendl
;
2097 mon
->clog
->warn() << info
.human_name()
2098 << " is not responding, replacing it "
2099 << "as rank " << info
.rank
2100 << " with standby " << si
.human_name();
2102 // Remember what NS the old one was in
2103 const fs_cluster_id_t fscid
= fsmap
.mds_roles
.at(gid
);
2105 // Remove the old one
2106 *osd_propose
|= fail_mds_gid(fsmap
, gid
);
2108 // Promote the replacement
2109 auto fs
= fsmap
.filesystems
.at(fscid
);
2110 fsmap
.promote(sgid
, fs
, info
.rank
);
2112 *mds_propose
= true;
2113 } else if ((info
.state
== MDSMap::STATE_STANDBY_REPLAY
||
2114 info
.state
== MDSMap::STATE_STANDBY
) && may_replace
) {
2115 dout(1) << " failing and removing " << gid
<< " " << info
.addr
<< " mds." << info
.rank
2116 << "." << info
.inc
<< " " << ceph_mds_state_name(info
.state
)
2118 mon
->clog
->info() << "Standby " << info
.human_name() << " is not "
2119 "responding, dropping it";
2120 fail_mds_gid(fsmap
, gid
);
2121 *mds_propose
= true;
2122 } else if (!info
.laggy()) {
2123 dout(1) << " marking " << gid
<< " " << info
.addr
<< " mds." << info
.rank
<< "." << info
.inc
2124 << " " << ceph_mds_state_name(info
.state
)
2125 << " laggy" << dendl
;
2126 fsmap
.modify_daemon(info
.global_id
, [](MDSMap::mds_info_t
*info
) {
2127 info
->laggy_since
= ceph_clock_now();
2129 *mds_propose
= true;
2133 bool MDSMonitor::maybe_promote_standby(FSMap
&fsmap
, std::shared_ptr
<Filesystem
> &fs
)
2135 assert(!fs
->mds_map
.test_flag(CEPH_MDSMAP_DOWN
));
2137 bool do_propose
= false;
2139 // have a standby take over?
2140 set
<mds_rank_t
> failed
;
2141 fs
->mds_map
.get_failed_mds_set(failed
);
2142 if (!failed
.empty()) {
2143 set
<mds_rank_t
>::iterator p
= failed
.begin();
2144 while (p
!= failed
.end()) {
2145 mds_rank_t f
= *p
++;
2146 mds_gid_t sgid
= fsmap
.find_replacement_for({fs
->fscid
, f
}, {},
2147 g_conf
->mon_force_standby_active
);
2149 const MDSMap::mds_info_t si
= fsmap
.get_info_gid(sgid
);
2150 dout(1) << " taking over failed mds." << f
<< " with " << sgid
2151 << "/" << si
.name
<< " " << si
.addr
<< dendl
;
2152 mon
->clog
->info() << "Standby " << si
.human_name()
2153 << " assigned to filesystem " << fs
->mds_map
.fs_name
2154 << " as rank " << f
;
2156 fsmap
.promote(sgid
, fs
, f
);
2161 // There were no failures to replace, so try using any available standbys
2162 // as standby-replay daemons.
2164 // Take a copy of the standby GIDs so that we can iterate over
2165 // them while perhaps-modifying standby_daemons during the loop
2166 // (if we promote anyone they are removed from standby_daemons)
2167 std::vector
<mds_gid_t
> standby_gids
;
2168 for (const auto &j
: fsmap
.standby_daemons
) {
2169 standby_gids
.push_back(j
.first
);
2172 for (const auto &gid
: standby_gids
) {
2173 const auto &info
= fsmap
.standby_daemons
.at(gid
);
2174 assert(info
.state
== MDSMap::STATE_STANDBY
);
2176 if (!info
.standby_replay
) {
2181 * This mds is standby but has no rank assigned.
2182 * See if we can find it somebody to shadow
2184 dout(20) << "gid " << gid
<< " is standby and following nobody" << dendl
;
2186 // standby for someone specific?
2187 if (info
.standby_for_rank
>= 0) {
2188 // The mds_info_t may or may not tell us exactly which filesystem
2189 // the standby_for_rank refers to: lookup via legacy_client_fscid
2190 mds_role_t target_role
= {
2191 info
.standby_for_fscid
== FS_CLUSTER_ID_NONE
?
2192 fsmap
.legacy_client_fscid
: info
.standby_for_fscid
,
2193 info
.standby_for_rank
};
2195 // It is possible that the map contains a standby_for_fscid
2196 // that doesn't correspond to an existing filesystem, especially
2197 // if we loaded from a version with a bug (#17466)
2198 if (info
.standby_for_fscid
!= FS_CLUSTER_ID_NONE
2199 && !fsmap
.filesystem_exists(info
.standby_for_fscid
)) {
2200 derr
<< "gid " << gid
<< " has invalid standby_for_fscid "
2201 << info
.standby_for_fscid
<< dendl
;
2205 // If we managed to resolve a full target role
2206 if (target_role
.fscid
!= FS_CLUSTER_ID_NONE
) {
2207 const auto &fs
= fsmap
.get_filesystem(target_role
.fscid
);
2208 if (fs
->mds_map
.is_followable(target_role
.rank
)) {
2209 do_propose
|= try_standby_replay(fsmap
, info
, *fs
,
2210 fs
->mds_map
.get_info(target_role
.rank
));
2218 for (const auto &p
: fsmap
.filesystems
) {
2219 if (info
.standby_for_fscid
!= FS_CLUSTER_ID_NONE
&&
2220 info
.standby_for_fscid
!= p
.first
)
2223 bool assigned
= false;
2224 const auto &fs
= p
.second
;
2225 const MDSMap
&mds_map
= fs
->mds_map
;
2226 for (const auto &mds_i
: mds_map
.mds_info
) {
2227 const MDSMap::mds_info_t
&cand_info
= mds_i
.second
;
2228 if (cand_info
.rank
>= 0 && mds_map
.is_followable(cand_info
.rank
)) {
2229 if ((info
.standby_for_name
.length() && info
.standby_for_name
!= cand_info
.name
) ||
2230 info
.standby_for_rank
!= MDS_RANK_NONE
) {
2231 continue; // we're supposed to follow someone else
2234 if (try_standby_replay(fsmap
, info
, *fs
, cand_info
)) {
2251 void MDSMonitor::tick()
2253 // make sure mds's are still alive
2254 // ...if i am an active leader
2256 if (!is_active() || !is_leader()) return;
2258 auto &pending
= get_pending_fsmap_writeable();
2260 bool do_propose
= false;
2262 do_propose
|= pending
.check_health();
2264 // expand mds cluster (add new nodes to @in)?
2265 for (auto &p
: pending
.filesystems
) {
2266 do_propose
|= maybe_expand_cluster(pending
, p
.second
->fscid
);
2269 mono_time now
= mono_clock::now();
2270 if (last_tick
== decltype(last_tick
)::min()) {
2273 chrono::duration
<double> since_last
= now
-last_tick
;
2275 if (since_last
.count() >
2276 (g_conf
->mds_beacon_grace
- g_conf
->mds_beacon_interval
)) {
2277 // This case handles either local slowness (calls being delayed
2278 // for whatever reason) or cluster election slowness (a long gap
2279 // between calls while an election happened)
2280 dout(1) << __func__
<< ": resetting beacon timeouts due to mon delay "
2281 "(slow election?) of " << now
- last_tick
<< " seconds" << dendl
;
2282 for (auto &p
: last_beacon
) {
2283 p
.second
.stamp
= now
;
2289 // make sure last_beacon is fully populated
2290 for (auto &p
: pending
.mds_roles
) {
2291 auto &gid
= p
.first
;
2292 last_beacon
.emplace(std::piecewise_construct
,
2293 std::forward_as_tuple(gid
),
2294 std::forward_as_tuple(mono_clock::now(), 0));
2298 // check beacon timestamps
2299 bool propose_osdmap
= false;
2300 bool osdmap_writeable
= mon
->osdmon()->is_writeable();
2301 for (auto it
= last_beacon
.begin(); it
!= last_beacon
.end(); ) {
2302 mds_gid_t gid
= it
->first
;
2303 auto beacon_info
= it
->second
;
2304 chrono::duration
<double> since_last
= now
-beacon_info
.stamp
;
2306 if (!pending
.gid_exists(gid
)) {
2308 it
= last_beacon
.erase(it
);
2313 if (since_last
.count() >= g_conf
->mds_beacon_grace
) {
2314 auto &info
= pending
.get_info_gid(gid
);
2315 dout(1) << "no beacon from mds." << info
.rank
<< "." << info
.inc
2316 << " (gid: " << gid
<< " addr: " << info
.addr
2317 << " state: " << ceph_mds_state_name(info
.state
) << ")"
2318 << " since " << since_last
.count() << "s" << dendl
;
2319 // If the OSDMap is writeable, we can blacklist things, so we can
2320 // try failing any laggy MDS daemons. Consider each one for failure.
2321 if (osdmap_writeable
) {
2322 maybe_replace_gid(pending
, gid
, info
, &do_propose
, &propose_osdmap
);
2328 if (propose_osdmap
) {
2329 request_proposal(mon
->osdmon());
2332 for (auto &p
: pending
.filesystems
) {
2333 auto &fs
= p
.second
;
2334 if (!fs
->mds_map
.test_flag(CEPH_MDSMAP_DOWN
)) {
2335 do_propose
|= maybe_promote_standby(pending
, fs
);
2345 * finfo: the would-be follower
2346 * leader_fs: the Filesystem containing the would-be leader
2347 * ainfo: the would-be leader
2349 bool MDSMonitor::try_standby_replay(
2351 const MDSMap::mds_info_t
& finfo
,
2352 const Filesystem
&leader_fs
,
2353 const MDSMap::mds_info_t
& ainfo
)
2355 // someone else already following?
2356 if (leader_fs
.has_standby_replay(ainfo
.global_id
)) {
2357 dout(20) << " mds." << ainfo
.rank
<< " already has a follower" << dendl
;
2360 // Assign the new role to the standby
2361 dout(10) << " setting to follow mds rank " << ainfo
.rank
<< dendl
;
2362 fsmap
.assign_standby_replay(finfo
.global_id
, leader_fs
.fscid
, ainfo
.rank
);
2367 MDSMonitor::MDSMonitor(Monitor
*mn
, Paxos
*p
, string service_name
)
2368 : PaxosService(mn
, p
, service_name
)
2370 handlers
= FileSystemCommandHandler::load(p
);
2373 void MDSMonitor::on_restart()
2375 // Clear out the leader-specific state.
2376 last_tick
= mono_clock::now();
2377 last_beacon
.clear();