1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
16 #include <boost/utility.hpp>
18 #include "MDSMonitor.h"
19 #include "FSCommands.h"
21 #include "MonitorDBStore.h"
22 #include "OSDMonitor.h"
23 #include "PGMonitor.h"
25 #include "common/strtol.h"
26 #include "common/perf_counters.h"
27 #include "common/config.h"
28 #include "common/cmdparse.h"
29 #include "messages/MMDSMap.h"
30 #include "messages/MFSMap.h"
31 #include "messages/MFSMapUser.h"
32 #include "messages/MMDSLoadTargets.h"
33 #include "messages/MMonCommand.h"
34 #include "messages/MGenericMessage.h"
36 #include "include/assert.h"
37 #include "include/str_list.h"
38 #include "include/stringify.h"
39 #include "mds/mdstypes.h"
42 #define dout_subsys ceph_subsys_mon
44 #define dout_prefix _prefix(_dout, mon, fsmap)
45 static ostream
& _prefix(std::ostream
*_dout
, Monitor
*mon
, FSMap
const& fsmap
) {
46 return *_dout
<< "mon." << mon
->name
<< "@" << mon
->rank
47 << "(" << mon
->get_state_name()
48 << ").mds e" << fsmap
.get_epoch() << " ";
52 * Specialized implementation of cmd_getval to allow us to parse
53 * out strongly-typedef'd types
55 template<> bool cmd_getval(CephContext
*cct
, const cmdmap_t
& cmdmap
,
56 const std::string
& k
, mds_gid_t
&val
)
58 return cmd_getval(cct
, cmdmap
, k
, (int64_t&)val
);
61 template<> bool cmd_getval(CephContext
*cct
, const cmdmap_t
& cmdmap
,
62 const std::string
& k
, mds_rank_t
&val
)
64 return cmd_getval(cct
, cmdmap
, k
, (int64_t&)val
);
67 template<> bool cmd_getval(CephContext
*cct
, const cmdmap_t
& cmdmap
,
68 const std::string
& k
, MDSMap::DaemonState
&val
)
70 return cmd_getval(cct
, cmdmap
, k
, (int64_t&)val
);
// File-local key prefix string "mds_metadata".
// NOTE(review): presumably the store prefix for per-daemon MDS metadata;
// confirm against the code that reads and writes it.
static const std::string MDS_METADATA_PREFIX = "mds_metadata";
78 void MDSMonitor::print_map(FSMap
&m
, int dbl
)
80 dout(dbl
) << "print_map\n";
86 void MDSMonitor::create_initial()
88 dout(10) << "create_initial" << dendl
;
92 void MDSMonitor::update_from_paxos(bool *need_bootstrap
)
94 version_t version
= get_last_committed();
95 if (version
== fsmap
.epoch
)
98 dout(10) << __func__
<< " version " << version
99 << ", my e " << fsmap
.epoch
<< dendl
;
100 assert(version
> fsmap
.epoch
);
105 int err
= get_version(version
, fsmap_bl
);
108 assert(fsmap_bl
.length() > 0);
109 dout(10) << __func__
<< " got " << version
<< dendl
;
110 fsmap
.decode(fsmap_bl
);
113 dout(4) << "new map" << dendl
;
115 if (!g_conf
->mon_mds_skip_sanity
) {
123 void MDSMonitor::init()
125 (void)load_metadata(pending_metadata
);
128 void MDSMonitor::create_pending()
130 pending_fsmap
= fsmap
;
131 pending_fsmap
.epoch
++;
133 dout(10) << "create_pending e" << pending_fsmap
.epoch
<< dendl
;
136 void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
138 dout(10) << "encode_pending e" << pending_fsmap
.epoch
<< dendl
;
141 // print map iff 'debug mon = 30' or higher
142 print_map(pending_fsmap
, 30);
143 if (!g_conf
->mon_mds_skip_sanity
) {
144 pending_fsmap
.sanity();
147 // Set 'modified' on maps modified this epoch
148 for (auto &i
: fsmap
.filesystems
) {
149 if (i
.second
->mds_map
.epoch
== fsmap
.epoch
) {
150 i
.second
->mds_map
.modified
= ceph_clock_now();
155 assert(get_last_committed() + 1 == pending_fsmap
.epoch
);
157 pending_fsmap
.encode(fsmap_bl
, mon
->get_quorum_con_features());
159 /* put everything in the transaction */
160 put_version(t
, pending_fsmap
.epoch
, fsmap_bl
);
161 put_last_committed(t
, pending_fsmap
.epoch
);
163 // Encode MDSHealth data
164 for (std::map
<uint64_t, MDSHealth
>::iterator i
= pending_daemon_health
.begin();
165 i
!= pending_daemon_health
.end(); ++i
) {
167 i
->second
.encode(bl
);
168 t
->put(MDS_HEALTH_PREFIX
, stringify(i
->first
), bl
);
171 for (std::set
<uint64_t>::iterator i
= pending_daemon_health_rm
.begin();
172 i
!= pending_daemon_health_rm
.end(); ++i
) {
173 t
->erase(MDS_HEALTH_PREFIX
, stringify(*i
));
175 pending_daemon_health_rm
.clear();
176 remove_from_metadata(t
);
179 version_t
MDSMonitor::get_trim_to()
182 if (g_conf
->mon_mds_force_trim_to
> 0 &&
183 g_conf
->mon_mds_force_trim_to
< (int)get_last_committed()) {
184 floor
= g_conf
->mon_mds_force_trim_to
;
185 dout(10) << __func__
<< " explicit mon_mds_force_trim_to = "
189 unsigned max
= g_conf
->mon_max_mdsmap_epochs
;
190 version_t last
= get_last_committed();
192 if (last
- get_first_committed() > max
&& floor
< last
- max
)
197 void MDSMonitor::update_logger()
199 dout(10) << "update_logger" << dendl
;
204 for (const auto &i
: fsmap
.filesystems
) {
205 const MDSMap
&mds_map
= i
.second
->mds_map
;
207 up
+= mds_map
.get_num_up_mds();
208 in
+= mds_map
.get_num_in_mds();
209 failed
+= mds_map
.get_num_failed_mds();
211 mon
->cluster_logger
->set(l_cluster_num_mds_up
, up
);
212 mon
->cluster_logger
->set(l_cluster_num_mds_in
, in
);
213 mon
->cluster_logger
->set(l_cluster_num_mds_failed
, failed
);
214 mon
->cluster_logger
->set(l_cluster_mds_epoch
, fsmap
.get_epoch());
217 bool MDSMonitor::preprocess_query(MonOpRequestRef op
)
219 op
->mark_mdsmon_event(__func__
);
220 PaxosServiceMessage
*m
= static_cast<PaxosServiceMessage
*>(op
->get_req());
221 dout(10) << "preprocess_query " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
223 switch (m
->get_type()) {
226 return preprocess_beacon(op
);
228 case MSG_MON_COMMAND
:
229 return preprocess_command(op
);
231 case MSG_MDS_OFFLOAD_TARGETS
:
232 return preprocess_offload_targets(op
);
240 void MDSMonitor::_note_beacon(MMDSBeacon
*m
)
242 mds_gid_t gid
= mds_gid_t(m
->get_global_id());
243 version_t seq
= m
->get_seq();
245 dout(15) << "_note_beacon " << *m
<< " noting time" << dendl
;
246 last_beacon
[gid
].stamp
= ceph_clock_now();
247 last_beacon
[gid
].seq
= seq
;
250 bool MDSMonitor::preprocess_beacon(MonOpRequestRef op
)
252 op
->mark_mdsmon_event(__func__
);
253 MMDSBeacon
*m
= static_cast<MMDSBeacon
*>(op
->get_req());
254 MDSMap::DaemonState state
= m
->get_state();
255 mds_gid_t gid
= m
->get_global_id();
256 version_t seq
= m
->get_seq();
257 MDSMap::mds_info_t info
;
258 epoch_t effective_epoch
= 0;
260 // check privileges, ignore if fails
261 MonSession
*session
= m
->get_session();
263 if (!session
->is_capable("mds", MON_CAP_X
)) {
264 dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
265 << session
->caps
<< dendl
;
269 if (m
->get_fsid() != mon
->monmap
->fsid
) {
270 dout(0) << "preprocess_beacon on fsid " << m
->get_fsid() << " != " << mon
->monmap
->fsid
<< dendl
;
274 dout(12) << "preprocess_beacon " << *m
275 << " from " << m
->get_orig_source_inst()
276 << " " << m
->get_compat()
279 // make sure the address has a port
280 if (m
->get_orig_source_addr().get_port() == 0) {
281 dout(1) << " ignoring boot message without a port" << dendl
;
286 if (!m
->get_compat().writeable(fsmap
.compat
)) {
287 dout(1) << " mds " << m
->get_source_inst() << " can't write to fsmap " << fsmap
.compat
<< dendl
;
292 if (!mon
->is_leader())
295 // booted, but not in map?
296 if (!pending_fsmap
.gid_exists(gid
)) {
297 if (state
!= MDSMap::STATE_BOOT
) {
298 dout(7) << "mds_beacon " << *m
<< " is not in fsmap (state "
299 << ceph_mds_state_name(state
) << ")" << dendl
;
302 null_map
.epoch
= fsmap
.epoch
;
303 null_map
.compat
= fsmap
.compat
;
304 mon
->send_reply(op
, new MMDSMap(mon
->monmap
->fsid
, &null_map
));
307 return false; // not booted yet.
310 dout(10) << __func__
<< ": GID exists in map: " << gid
<< dendl
;
311 info
= pending_fsmap
.get_info_gid(gid
);
314 if (info
.state_seq
> seq
) {
315 dout(7) << "mds_beacon " << *m
<< " has old seq, ignoring" << dendl
;
319 // Work out the latest epoch that this daemon should have seen
321 fs_cluster_id_t fscid
= pending_fsmap
.mds_roles
.at(gid
);
322 if (fscid
== FS_CLUSTER_ID_NONE
) {
323 effective_epoch
= pending_fsmap
.standby_epochs
.at(gid
);
325 effective_epoch
= pending_fsmap
.get_filesystem(fscid
)->mds_map
.epoch
;
327 if (effective_epoch
!= m
->get_last_epoch_seen()) {
328 dout(10) << "mds_beacon " << *m
329 << " ignoring requested state, because mds hasn't seen latest map" << dendl
;
336 return false; // no longer laggy, need to update map.
338 if (state
== MDSMap::STATE_BOOT
) {
339 // ignore, already booted.
342 // is there a state change here?
343 if (info
.state
!= state
) {
344 // legal state change?
345 if ((info
.state
== MDSMap::STATE_STANDBY
||
346 info
.state
== MDSMap::STATE_STANDBY_REPLAY
) && state
> 0) {
347 dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info
.state
)
348 << " -> " << ceph_mds_state_name(state
) << ")" << dendl
;
352 if ((state
== MDSMap::STATE_STANDBY
|| state
== MDSMap::STATE_STANDBY_REPLAY
)
353 && info
.rank
!= MDS_RANK_NONE
)
355 dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
356 "held rank " << info
.rank
<< " while requesting state "
357 << ceph_mds_state_name(state
) << dendl
;
365 // Comparing known daemon health with m->get_health()
366 // and return false (i.e. require proposal) if they
367 // do not match, to update our stored
368 if (!(pending_daemon_health
[gid
] == m
->get_health())) {
369 dout(20) << __func__
<< " health metrics for gid " << gid
<< " were updated" << dendl
;
375 // note time and reply
376 assert(effective_epoch
> 0);
379 new MMDSBeacon(mon
->monmap
->fsid
, m
->get_global_id(), m
->get_name(),
380 effective_epoch
, state
, seq
,
381 CEPH_FEATURES_SUPPORTED_DEFAULT
));
385 // I won't reply this beacon, drop it.
390 bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op
)
392 op
->mark_mdsmon_event(__func__
);
393 MMDSLoadTargets
*m
= static_cast<MMDSLoadTargets
*>(op
->get_req());
394 dout(10) << "preprocess_offload_targets " << *m
<< " from " << m
->get_orig_source() << dendl
;
396 // check privileges, ignore message if fails
397 MonSession
*session
= m
->get_session();
400 if (!session
->is_capable("mds", MON_CAP_X
)) {
401 dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
402 << session
->caps
<< dendl
;
406 if (fsmap
.gid_exists(m
->global_id
) &&
407 m
->targets
== fsmap
.get_info_gid(m
->global_id
).export_targets
)
417 bool MDSMonitor::prepare_update(MonOpRequestRef op
)
419 op
->mark_mdsmon_event(__func__
);
420 PaxosServiceMessage
*m
= static_cast<PaxosServiceMessage
*>(op
->get_req());
421 dout(7) << "prepare_update " << *m
<< dendl
;
423 switch (m
->get_type()) {
426 return prepare_beacon(op
);
428 case MSG_MON_COMMAND
:
429 return prepare_command(op
);
431 case MSG_MDS_OFFLOAD_TARGETS
:
432 return prepare_offload_targets(op
);
441 bool MDSMonitor::prepare_beacon(MonOpRequestRef op
)
443 op
->mark_mdsmon_event(__func__
);
444 MMDSBeacon
*m
= static_cast<MMDSBeacon
*>(op
->get_req());
445 // -- this is an update --
446 dout(12) << "prepare_beacon " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
447 entity_addr_t addr
= m
->get_orig_source_inst().addr
;
448 mds_gid_t gid
= m
->get_global_id();
449 MDSMap::DaemonState state
= m
->get_state();
450 version_t seq
= m
->get_seq();
452 dout(20) << __func__
<< " got health from gid " << gid
<< " with " << m
->get_health().metrics
.size() << " metrics." << dendl
;
454 // Calculate deltas of health metrics created and removed
455 // Do this by type rather than MDSHealthMetric equality, because messages can
456 // change a lot when they include e.g. a number of items.
457 const auto &old_health
= pending_daemon_health
[gid
].metrics
;
458 const auto &new_health
= m
->get_health().metrics
;
460 std::set
<mds_metric_t
> old_types
;
461 for (const auto &i
: old_health
) {
462 old_types
.insert(i
.type
);
465 std::set
<mds_metric_t
> new_types
;
466 for (const auto &i
: new_health
) {
467 new_types
.insert(i
.type
);
470 for (const auto &new_metric
: new_health
) {
471 if (old_types
.count(new_metric
.type
) == 0) {
472 std::stringstream msg
;
473 msg
<< "MDS health message (" << m
->get_orig_source_inst().name
<< "): "
474 << new_metric
.message
;
475 if (new_metric
.sev
== HEALTH_ERR
) {
476 mon
->clog
->error() << msg
.str();
477 } else if (new_metric
.sev
== HEALTH_WARN
) {
478 mon
->clog
->warn() << msg
.str();
480 mon
->clog
->info() << msg
.str();
485 // Log the disappearance of health messages at INFO
486 for (const auto &old_metric
: old_health
) {
487 if (new_types
.count(old_metric
.type
) == 0) {
488 mon
->clog
->info() << "MDS health message cleared ("
489 << m
->get_orig_source_inst().name
<< "): " << old_metric
.message
;
494 pending_daemon_health
[gid
] = m
->get_health();
497 if (state
== MDSMap::STATE_BOOT
) {
498 // zap previous instance of this name?
499 if (g_conf
->mds_enforce_unique_name
) {
500 bool failed_mds
= false;
501 while (mds_gid_t existing
= pending_fsmap
.find_mds_gid_by_name(m
->get_name())) {
502 if (!mon
->osdmon()->is_writeable()) {
503 mon
->osdmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
506 mon
->clog
->info() << "MDS daemon '" << m
->get_name() << "' restarted";
507 fail_mds_gid(existing
);
511 assert(mon
->osdmon()->is_writeable());
512 request_proposal(mon
->osdmon());
516 // Add this daemon to the map
517 if (pending_fsmap
.mds_roles
.count(gid
) == 0) {
518 MDSMap::mds_info_t new_info
;
519 new_info
.global_id
= gid
;
520 new_info
.name
= m
->get_name();
521 new_info
.addr
= addr
;
522 new_info
.mds_features
= m
->get_mds_features();
523 new_info
.state
= MDSMap::STATE_STANDBY
;
524 new_info
.state_seq
= seq
;
525 new_info
.standby_for_rank
= m
->get_standby_for_rank();
526 new_info
.standby_for_name
= m
->get_standby_for_name();
527 new_info
.standby_for_fscid
= m
->get_standby_for_fscid();
528 new_info
.standby_replay
= m
->get_standby_replay();
529 pending_fsmap
.insert(new_info
);
532 // Resolve standby_for_name to a rank
533 const MDSMap::mds_info_t
&info
= pending_fsmap
.get_info_gid(gid
);
534 if (!info
.standby_for_name
.empty()) {
535 const MDSMap::mds_info_t
*leaderinfo
= fsmap
.find_by_name(
536 info
.standby_for_name
);
537 if (leaderinfo
&& (leaderinfo
->rank
>= 0)) {
538 auto fscid
= pending_fsmap
.mds_roles
.at(leaderinfo
->global_id
);
539 auto fs
= pending_fsmap
.get_filesystem(fscid
);
540 bool followable
= fs
->mds_map
.is_followable(leaderinfo
->rank
);
542 pending_fsmap
.modify_daemon(gid
, [fscid
, leaderinfo
, followable
](
543 MDSMap::mds_info_t
*info
) {
544 info
->standby_for_rank
= leaderinfo
->rank
;
545 info
->standby_for_fscid
= fscid
;
550 // initialize the beacon timer
551 last_beacon
[gid
].stamp
= ceph_clock_now();
552 last_beacon
[gid
].seq
= seq
;
555 if (!pending_fsmap
.compat
.writeable(m
->get_compat())) {
556 dout(10) << " fsmap " << pending_fsmap
.compat
557 << " can't write to new mds' " << m
->get_compat()
558 << ", updating fsmap and killing old mds's"
560 pending_fsmap
.update_compat(m
->get_compat());
563 update_metadata(m
->get_global_id(), m
->get_sys_info());
566 const MDSMap::mds_info_t
&info
= pending_fsmap
.get_info_gid(gid
);
567 // Old MDS daemons don't mention that they're standby replay until
568 // after they've sent their boot beacon, so update this field.
569 if (info
.standby_replay
!= m
->get_standby_replay()) {
570 pending_fsmap
.modify_daemon(info
.global_id
, [&m
](
571 MDSMap::mds_info_t
*i
)
573 i
->standby_replay
= m
->get_standby_replay();
577 if (info
.state
== MDSMap::STATE_STOPPING
&& state
!= MDSMap::STATE_STOPPED
) {
578 // we can't transition to any other states from STOPPING
579 dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
586 dout(10) << "prepare_beacon clearing laggy flag on " << addr
<< dendl
;
587 pending_fsmap
.modify_daemon(info
.global_id
, [](MDSMap::mds_info_t
*info
)
594 dout(10) << "prepare_beacon mds." << info
.rank
595 << " " << ceph_mds_state_name(info
.state
)
596 << " -> " << ceph_mds_state_name(state
)
597 << " standby_for_rank=" << m
->get_standby_for_rank()
599 if (state
== MDSMap::STATE_STOPPED
) {
600 auto erased
= pending_fsmap
.stop(gid
);
601 erased
.push_back(gid
);
603 for (const auto &erased_gid
: erased
) {
604 last_beacon
.erase(erased_gid
);
605 if (pending_daemon_health
.count(erased_gid
)) {
606 pending_daemon_health
.erase(erased_gid
);
607 pending_daemon_health_rm
.insert(erased_gid
);
610 } else if (state
== MDSMap::STATE_DAMAGED
) {
611 if (!mon
->osdmon()->is_writeable()) {
612 dout(4) << __func__
<< ": DAMAGED from rank " << info
.rank
613 << " waiting for osdmon writeable to blacklist it" << dendl
;
614 mon
->osdmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
618 // Record this MDS rank as damaged, so that other daemons
619 // won't try to run it.
620 dout(4) << __func__
<< ": marking rank "
621 << info
.rank
<< " damaged" << dendl
;
623 utime_t until
= ceph_clock_now();
624 until
+= g_conf
->mds_blacklist_interval
;
625 const auto blacklist_epoch
= mon
->osdmon()->blacklist(info
.addr
, until
);
626 request_proposal(mon
->osdmon());
627 pending_fsmap
.damaged(gid
, blacklist_epoch
);
628 last_beacon
.erase(gid
);
630 // Respond to MDS, so that it knows it can continue to shut down
633 mon
->monmap
->fsid
, m
->get_global_id(),
634 m
->get_name(), fsmap
.get_epoch(), state
, seq
,
635 CEPH_FEATURES_SUPPORTED_DEFAULT
));
636 } else if (state
== MDSMap::STATE_DNE
) {
637 if (!mon
->osdmon()->is_writeable()) {
638 dout(4) << __func__
<< ": DNE from rank " << info
.rank
639 << " waiting for osdmon writeable to blacklist it" << dendl
;
640 mon
->osdmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
645 assert(mon
->osdmon()->is_writeable());
646 request_proposal(mon
->osdmon());
648 // Respond to MDS, so that it knows it can continue to shut down
651 mon
->monmap
->fsid
, m
->get_global_id(),
652 m
->get_name(), fsmap
.get_epoch(), state
, seq
,
653 CEPH_FEATURES_SUPPORTED_DEFAULT
));
654 } else if (info
.state
== MDSMap::STATE_STANDBY
&& state
!= info
.state
) {
655 // Standby daemons should never modify their own
656 // state. Reject any attempts to do so.
657 derr
<< "standby " << gid
<< " attempted to change state to "
658 << ceph_mds_state_name(state
) << ", rejecting" << dendl
;
660 } else if (info
.state
!= MDSMap::STATE_STANDBY
&& state
!= info
.state
&&
661 !MDSMap::state_transition_valid(info
.state
, state
)) {
662 // Validate state transitions for daemons that hold a rank
663 derr
<< "daemon " << gid
<< " (rank " << info
.rank
<< ") "
664 << "reported invalid state transition "
665 << ceph_mds_state_name(info
.state
) << " -> "
666 << ceph_mds_state_name(state
) << dendl
;
669 // Made it through special cases and validations, record the
670 // daemon's reported state to the FSMap.
671 pending_fsmap
.modify_daemon(gid
, [state
, seq
](MDSMap::mds_info_t
*info
) {
673 info
->state_seq
= seq
;
678 dout(7) << "prepare_beacon pending map now:" << dendl
;
679 print_map(pending_fsmap
);
681 wait_for_finished_proposal(op
, new FunctionContext([op
, this](int r
){
683 _updated(op
); // success
684 else if (r
== -ECANCELED
) {
687 dispatch(op
); // try again
694 bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op
)
696 op
->mark_mdsmon_event(__func__
);
697 MMDSLoadTargets
*m
= static_cast<MMDSLoadTargets
*>(op
->get_req());
698 mds_gid_t gid
= m
->global_id
;
699 if (pending_fsmap
.gid_has_rank(gid
)) {
700 dout(10) << "prepare_offload_targets " << gid
<< " " << m
->targets
<< dendl
;
701 pending_fsmap
.update_export_targets(gid
, m
->targets
);
703 dout(10) << "prepare_offload_targets " << gid
<< " not in map" << dendl
;
708 bool MDSMonitor::should_propose(double& delay
)
710 // delegate to PaxosService to assess whether we should propose
711 return PaxosService::should_propose(delay
);
714 void MDSMonitor::_updated(MonOpRequestRef op
)
716 op
->mark_mdsmon_event(__func__
);
717 MMDSBeacon
*m
= static_cast<MMDSBeacon
*>(op
->get_req());
718 dout(10) << "_updated " << m
->get_orig_source() << " " << *m
<< dendl
;
719 mon
->clog
->info() << m
->get_orig_source_inst() << " "
720 << ceph_mds_state_name(m
->get_state());
722 if (m
->get_state() == MDSMap::STATE_STOPPED
) {
723 // send the map manually (they're out of the map, so they won't get it automatic)
725 null_map
.epoch
= fsmap
.epoch
;
726 null_map
.compat
= fsmap
.compat
;
727 mon
->send_reply(op
, new MMDSMap(mon
->monmap
->fsid
, &null_map
));
729 mon
->send_reply(op
, new MMDSBeacon(mon
->monmap
->fsid
,
735 CEPH_FEATURES_SUPPORTED_DEFAULT
));
739 void MDSMonitor::on_active()
744 if (mon
->is_leader())
745 mon
->clog
->info() << "fsmap " << fsmap
;
748 void MDSMonitor::get_health(list
<pair
<health_status_t
, string
> >& summary
,
749 list
<pair
<health_status_t
, string
> > *detail
,
750 CephContext
* cct
) const
752 fsmap
.get_health(summary
, detail
);
754 // For each MDS GID...
755 const auto info_map
= fsmap
.get_mds_info();
756 for (const auto &i
: info_map
) {
757 const auto &gid
= i
.first
;
758 const auto &info
= i
.second
;
762 mon
->store
->get(MDS_HEALTH_PREFIX
, stringify(gid
), bl
);
764 derr
<< "Missing health data for MDS " << gid
<< dendl
;
768 bufferlist::iterator bl_i
= bl
.begin();
771 for (const auto &metric
: health
.metrics
) {
772 int const rank
= info
.rank
;
773 std::ostringstream message
;
774 message
<< "mds" << rank
<< ": " << metric
.message
;
775 summary
.push_back(std::make_pair(metric
.sev
, message
.str()));
778 // There is no way for us to clearly associate detail entries with summary entries (#7192), so
779 // we duplicate the summary message in the detail string and tag the metadata on.
780 std::ostringstream detail_message
;
781 detail_message
<< message
.str();
782 if (metric
.metadata
.size()) {
783 detail_message
<< "(";
784 auto k
= metric
.metadata
.begin();
785 while (k
!= metric
.metadata
.end()) {
786 detail_message
<< k
->first
<< ": " << k
->second
;
787 if (boost::next(k
) != metric
.metadata
.end()) {
788 detail_message
<< ", ";
792 detail_message
<< ")";
794 detail
->push_back(std::make_pair(metric
.sev
, detail_message
.str()));
800 void MDSMonitor::dump_info(Formatter
*f
)
802 f
->open_object_section("fsmap");
806 f
->dump_unsigned("mdsmap_first_committed", get_first_committed());
807 f
->dump_unsigned("mdsmap_last_committed", get_last_committed());
810 bool MDSMonitor::preprocess_command(MonOpRequestRef op
)
812 op
->mark_mdsmon_event(__func__
);
813 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
818 map
<string
, cmd_vartype
> cmdmap
;
819 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
820 // ss has reason for failure
821 string rs
= ss
.str();
822 mon
->reply_command(op
, -EINVAL
, rs
, rdata
, get_last_committed());
827 cmd_getval(g_ceph_context
, cmdmap
, "prefix", prefix
);
829 cmd_getval(g_ceph_context
, cmdmap
, "format", format
, string("plain"));
830 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
832 MonSession
*session
= m
->get_session();
834 mon
->reply_command(op
, -EACCES
, "access denied", rdata
, get_last_committed());
838 if (prefix
== "mds stat") {
840 f
->open_object_section("mds_stat");
848 } else if (prefix
== "mds dump") {
853 if (cmd_getval(g_ceph_context
, cmdmap
, "epoch", epocharg
)) {
856 int err
= get_version(epoch
, b
);
857 if (err
== -ENOENT
) {
869 const MDSMap
*mdsmap
= nullptr;
871 blank
.epoch
= fsmap
.epoch
;
872 if (fsmap
.legacy_client_fscid
!= FS_CLUSTER_ID_NONE
) {
873 mdsmap
= &(fsmap
.filesystems
[fsmap
.legacy_client_fscid
]->mds_map
);
878 f
->open_object_section("mdsmap");
879 mdsmap
->dump(f
.get());
889 ss
<< "dumped fsmap epoch " << p
->get_epoch();
895 } else if (prefix
== "fs dump") {
900 if (cmd_getval(g_ceph_context
, cmdmap
, "epoch", epocharg
)) {
903 int err
= get_version(epoch
, b
);
904 if (err
== -ENOENT
) {
917 f
->open_object_section("fsmap");
928 ss
<< "dumped fsmap epoch " << p
->get_epoch();
933 } else if (prefix
== "mds metadata") {
935 f
.reset(Formatter::create("json-pretty"));
938 bool all
= !cmd_getval(g_ceph_context
, cmdmap
, "who", who
);
939 dout(1) << "all = " << all
<< dendl
;
942 // Dump all MDSs' metadata
943 const auto all_info
= fsmap
.get_mds_info();
945 f
->open_array_section("mds_metadata");
946 for(const auto &i
: all_info
) {
947 const auto &info
= i
.second
;
949 f
->open_object_section("mds");
950 f
->dump_string("name", info
.name
);
951 std::ostringstream get_err
;
952 r
= dump_metadata(info
.name
, f
.get(), get_err
);
953 if (r
== -EINVAL
|| r
== -ENOENT
) {
954 // Drop error, list what metadata we do have
955 dout(1) << get_err
.str() << dendl
;
958 derr
<< "Unexpected error reading metadata: " << cpp_strerror(r
)
967 // Dump a single daemon's metadata
968 f
->open_object_section("mds_metadata");
969 r
= dump_metadata(who
, f
.get(), ss
);
973 } else if (prefix
== "mds versions") {
975 f
.reset(Formatter::create("json-pretty"));
976 count_metadata("ceph_version", f
.get());
979 } else if (prefix
== "mds count-metadata") {
981 f
.reset(Formatter::create("json-pretty"));
983 cmd_getval(g_ceph_context
, cmdmap
, "property", field
);
984 count_metadata(field
, f
.get());
987 } else if (prefix
== "mds getmap") {
991 if (cmd_getval(g_ceph_context
, cmdmap
, "epoch", epocharg
)) {
993 int err
= get_version(e
, b
);
994 if (err
== -ENOENT
) {
1001 mm
.encode(rdata
, m
->get_connection()->get_features());
1002 ss
<< "got fsmap epoch " << mm
.get_epoch();
1006 fsmap
.encode(rdata
, m
->get_connection()->get_features());
1007 ss
<< "got fsmap epoch " << fsmap
.get_epoch();
1010 } else if (prefix
== "mds compat show") {
1012 f
->open_object_section("mds_compat");
1013 fsmap
.compat
.dump(f
.get());
1020 } else if (prefix
== "fs get") {
1022 cmd_getval(g_ceph_context
, cmdmap
, "fs_name", fs_name
);
1023 auto fs
= fsmap
.get_filesystem(fs_name
);
1024 if (fs
== nullptr) {
1025 ss
<< "filesystem '" << fs_name
<< "' not found";
1029 f
->open_object_section("filesystem");
1039 } else if (prefix
== "fs ls") {
1041 f
->open_array_section("filesystems");
1043 for (const auto i
: fsmap
.filesystems
) {
1044 const auto fs
= i
.second
;
1045 f
->open_object_section("filesystem");
1047 const MDSMap
&mds_map
= fs
->mds_map
;
1048 f
->dump_string("name", mds_map
.fs_name
);
1049 /* Output both the names and IDs of pools, for use by
1050 * humans and machines respectively */
1051 f
->dump_string("metadata_pool", mon
->osdmon()->osdmap
.get_pool_name(
1052 mds_map
.metadata_pool
));
1053 f
->dump_int("metadata_pool_id", mds_map
.metadata_pool
);
1054 f
->open_array_section("data_pool_ids");
1056 for (auto dpi
= mds_map
.data_pools
.begin();
1057 dpi
!= mds_map
.data_pools
.end(); ++dpi
) {
1058 f
->dump_int("data_pool_id", *dpi
);
1063 f
->open_array_section("data_pools");
1065 for (auto dpi
= mds_map
.data_pools
.begin();
1066 dpi
!= mds_map
.data_pools
.end(); ++dpi
) {
1067 const auto &name
= mon
->osdmon()->osdmap
.get_pool_name(
1069 f
->dump_string("data_pool", name
);
1081 for (const auto i
: fsmap
.filesystems
) {
1082 const auto fs
= i
.second
;
1083 const MDSMap
&mds_map
= fs
->mds_map
;
1084 const string
&md_pool_name
= mon
->osdmon()->osdmap
.get_pool_name(
1085 mds_map
.metadata_pool
);
1087 ds
<< "name: " << mds_map
.fs_name
<< ", metadata pool: "
1088 << md_pool_name
<< ", data pools: [";
1089 for (auto dpi
: mds_map
.data_pools
) {
1090 const string
&pool_name
= mon
->osdmon()->osdmap
.get_pool_name(dpi
);
1091 ds
<< pool_name
<< " ";
1093 ds
<< "]" << std::endl
;
1096 if (fsmap
.filesystems
.empty()) {
1097 ds
<< "No filesystems enabled" << std::endl
;
1107 mon
->reply_command(op
, r
, rs
, rdata
, get_last_committed());
1113 bool MDSMonitor::fail_mds_gid(mds_gid_t gid
)
1115 const MDSMap::mds_info_t info
= pending_fsmap
.get_info_gid(gid
);
1116 dout(10) << "fail_mds_gid " << gid
<< " mds." << info
.name
<< " role " << info
.rank
<< dendl
;
1118 epoch_t blacklist_epoch
= 0;
1119 if (info
.rank
>= 0 && info
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
1120 utime_t until
= ceph_clock_now();
1121 until
+= g_conf
->mds_blacklist_interval
;
1122 blacklist_epoch
= mon
->osdmon()->blacklist(info
.addr
, until
);
1125 pending_fsmap
.erase(gid
, blacklist_epoch
);
1126 last_beacon
.erase(gid
);
1127 if (pending_daemon_health
.count(gid
)) {
1128 pending_daemon_health
.erase(gid
);
1129 pending_daemon_health_rm
.insert(gid
);
1132 return blacklist_epoch
!= 0;
1135 mds_gid_t
MDSMonitor::gid_from_arg(const std::string
& arg
, std::ostream
&ss
)
1137 const FSMap
*relevant_fsmap
= mon
->is_leader() ? &pending_fsmap
: &fsmap
;
1139 // Try parsing as a role
1141 std::ostringstream ignore_err
; // Don't spam 'ss' with parse_role errors
1142 int r
= parse_role(arg
, &role
, ignore_err
);
1144 // See if a GID is assigned to this role
1145 auto fs
= relevant_fsmap
->get_filesystem(role
.fscid
);
1146 assert(fs
!= nullptr); // parse_role ensures it exists
1147 if (fs
->mds_map
.is_up(role
.rank
)) {
1148 dout(10) << __func__
<< ": validated rank/GID " << role
1149 << " as a rank" << dendl
;
1150 return fs
->mds_map
.get_mds_info(role
.rank
).global_id
;
1154 // Try parsing as a gid
1156 unsigned long long maybe_gid
= strict_strtoll(arg
.c_str(), 10, &err
);
1158 // Not a role or a GID, try as a daemon name
1159 const MDSMap::mds_info_t
*mds_info
= relevant_fsmap
->find_by_name(arg
);
1161 ss
<< "MDS named '" << arg
1162 << "' does not exist, or is not up";
1163 return MDS_GID_NONE
;
1165 dout(10) << __func__
<< ": resolved MDS name '" << arg
1166 << "' to GID " << mds_info
->global_id
<< dendl
;
1167 return mds_info
->global_id
;
1169 // Not a role, but parses as an integer, might be a GID
1170 dout(10) << __func__
<< ": treating MDS reference '" << arg
1171 << "' as an integer " << maybe_gid
<< dendl
;
1173 if (relevant_fsmap
->gid_exists(mds_gid_t(maybe_gid
))) {
1174 return mds_gid_t(maybe_gid
);
1178 dout(1) << __func__
<< ": rank/GID " << arg
1179 << " not a existent rank or GID" << dendl
;
1180 return MDS_GID_NONE
;
1183 int MDSMonitor::fail_mds(std::ostream
&ss
, const std::string
&arg
)
1185 mds_gid_t gid
= gid_from_arg(arg
, ss
);
1186 if (gid
== MDS_GID_NONE
) {
1189 if (!mon
->osdmon()->is_writeable()) {
1193 ss
<< "failed mds gid " << gid
;
1194 assert(mon
->osdmon()->is_writeable());
1195 request_proposal(mon
->osdmon());
1199 bool MDSMonitor::prepare_command(MonOpRequestRef op
)
1201 op
->mark_mdsmon_event(__func__
);
1202 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
1207 map
<string
, cmd_vartype
> cmdmap
;
1208 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
1209 string rs
= ss
.str();
1210 mon
->reply_command(op
, -EINVAL
, rs
, rdata
, get_last_committed());
1215 cmd_getval(g_ceph_context
, cmdmap
, "prefix", prefix
);
1217 /* Refuse access if message not associated with a valid session */
1218 MonSession
*session
= m
->get_session();
1220 mon
->reply_command(op
, -EACCES
, "access denied", rdata
, get_last_committed());
1224 for (auto h
: handlers
) {
1225 if (h
->can_handle(prefix
)) {
1226 r
= h
->handle(mon
, pending_fsmap
, op
, cmdmap
, ss
);
1228 // message has been enqueued for retry; return.
1229 dout(4) << __func__
<< " enqueue for retry by prepare_command" << dendl
;
1233 // On successful updates, print the updated map
1234 print_map(pending_fsmap
);
1236 // Successful or not, we're done: respond.
1242 r
= filesystem_command(op
, prefix
, cmdmap
, ss
);
1245 } else if (r
== -EAGAIN
) {
1246 // Do not reply, the message has been enqueued for retry
1247 dout(4) << __func__
<< " enqueue for retry by filesystem_command" << dendl
;
1249 } else if (r
!= -ENOSYS
) {
1253 // Only handle legacy commands if there is a filesystem configured
1254 if (pending_fsmap
.legacy_client_fscid
== FS_CLUSTER_ID_NONE
) {
1255 if (pending_fsmap
.filesystems
.size() == 0) {
1256 ss
<< "No filesystem configured: use `ceph fs new` to create a filesystem";
1258 ss
<< "No filesystem set for use with legacy commands";
1264 r
= legacy_filesystem_command(op
, prefix
, cmdmap
, ss
);
1266 if (r
== -ENOSYS
&& ss
.str().empty()) {
1267 ss
<< "unrecognized command";
1271 dout(4) << __func__
<< " done, r=" << r
<< dendl
;
1272 /* Compose response */
1277 // success.. delay reply
1278 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, r
, rs
,
1279 get_last_committed() + 1));
1282 // reply immediately
1283 mon
->reply_command(op
, r
, rs
, rdata
, get_last_committed());
1290 * Given one of the following forms:
1295 * Parse into a mds_role_t. The rank-only form is only valid
1296 * if legacy_client_ns is set.
1298 int MDSMonitor::parse_role(
1299 const std::string
&role_str
,
1303 const FSMap
*relevant_fsmap
= &fsmap
;
1304 if (mon
->is_leader()) {
1305 relevant_fsmap
= &pending_fsmap
;
1307 return relevant_fsmap
->parse_role(role_str
, role
, ss
);
1310 int MDSMonitor::filesystem_command(
1312 std::string
const &prefix
,
1313 map
<string
, cmd_vartype
> &cmdmap
,
1314 std::stringstream
&ss
)
1316 dout(4) << __func__
<< " prefix='" << prefix
<< "'" << dendl
;
1317 op
->mark_mdsmon_event(__func__
);
1320 cmd_getval(g_ceph_context
, cmdmap
, "who", whostr
);
1322 if (prefix
== "mds stop" ||
1323 prefix
== "mds deactivate") {
1326 r
= parse_role(whostr
, &role
, ss
);
1330 auto fs
= pending_fsmap
.get_filesystem(role
.fscid
);
1332 if (!fs
->mds_map
.is_active(role
.rank
)) {
1334 ss
<< "mds." << role
<< " not active ("
1335 << ceph_mds_state_name(fs
->mds_map
.get_state(role
.rank
)) << ")";
1336 } else if (fs
->mds_map
.get_root() == role
.rank
||
1337 fs
->mds_map
.get_tableserver() == role
.rank
) {
1339 ss
<< "can't tell the root (" << fs
->mds_map
.get_root()
1340 << ") or tableserver (" << fs
->mds_map
.get_tableserver()
1341 << ") to deactivate";
1342 } else if (role
.rank
!= fs
->mds_map
.get_last_in_mds()) {
1344 ss
<< "mds." << role
<< " doesn't have the max rank ("
1345 << fs
->mds_map
.get_last_in_mds() << ")";
1346 } else if (fs
->mds_map
.get_num_in_mds() <= size_t(fs
->mds_map
.get_max_mds())) {
1348 ss
<< "must decrease max_mds or else MDS will immediately reactivate";
1351 mds_gid_t gid
= fs
->mds_map
.up
.at(role
.rank
);
1352 ss
<< "telling mds." << role
<< " "
1353 << pending_fsmap
.get_info_gid(gid
).addr
<< " to deactivate";
1355 pending_fsmap
.modify_daemon(gid
, [](MDSMap::mds_info_t
*info
) {
1356 info
->state
= MDSMap::STATE_STOPPING
;
1359 } else if (prefix
== "mds set_state") {
1361 if (!cmd_getval(g_ceph_context
, cmdmap
, "gid", gid
)) {
1362 ss
<< "error parsing 'gid' value '"
1363 << cmd_vartype_stringify(cmdmap
["gid"]) << "'";
1366 MDSMap::DaemonState state
;
1367 if (!cmd_getval(g_ceph_context
, cmdmap
, "state", state
)) {
1368 ss
<< "error parsing 'state' string value '"
1369 << cmd_vartype_stringify(cmdmap
["state"]) << "'";
1372 if (pending_fsmap
.gid_exists(gid
)) {
1373 pending_fsmap
.modify_daemon(gid
, [state
](MDSMap::mds_info_t
*info
) {
1374 info
->state
= state
;
1376 ss
<< "set mds gid " << gid
<< " to state " << state
<< " "
1377 << ceph_mds_state_name(state
);
1380 } else if (prefix
== "mds fail") {
1382 cmd_getval(g_ceph_context
, cmdmap
, "who", who
);
1383 r
= fail_mds(ss
, who
);
1384 if (r
< 0 && r
== -EAGAIN
) {
1385 mon
->osdmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
1386 return -EAGAIN
; // don't propose yet; wait for message to be retried
1388 } else if (prefix
== "mds rm") {
1390 if (!cmd_getval(g_ceph_context
, cmdmap
, "gid", gid
)) {
1391 ss
<< "error parsing 'gid' value '"
1392 << cmd_vartype_stringify(cmdmap
["gid"]) << "'";
1395 if (!pending_fsmap
.gid_exists(gid
)) {
1396 ss
<< "mds gid " << gid
<< " dne";
1399 MDSMap::DaemonState state
= pending_fsmap
.get_info_gid(gid
).state
;
1401 ss
<< "cannot remove active mds." << pending_fsmap
.get_info_gid(gid
).name
1402 << " rank " << pending_fsmap
.get_info_gid(gid
).rank
;
1405 pending_fsmap
.erase(gid
, {});
1406 ss
<< "removed mds gid " << gid
;
1410 } else if (prefix
== "mds rmfailed") {
1412 if (!cmd_getval(g_ceph_context
, cmdmap
, "confirm", confirm
) ||
1413 confirm
!= "--yes-i-really-mean-it") {
1414 ss
<< "WARNING: this can make your filesystem inaccessible! "
1415 "Add --yes-i-really-mean-it if you are sure you wish to continue.";
1419 std::string role_str
;
1420 cmd_getval(g_ceph_context
, cmdmap
, "who", role_str
);
1422 int r
= parse_role(role_str
, &role
, ss
);
1424 ss
<< "invalid role '" << role_str
<< "'";
1428 pending_fsmap
.modify_filesystem(
1430 [role
](std::shared_ptr
<Filesystem
> fs
)
1432 fs
->mds_map
.failed
.erase(role
.rank
);
1435 ss
<< "removed failed mds." << role
;
1437 } else if (prefix
== "mds compat rm_compat") {
1439 if (!cmd_getval(g_ceph_context
, cmdmap
, "feature", f
)) {
1440 ss
<< "error parsing feature value '"
1441 << cmd_vartype_stringify(cmdmap
["feature"]) << "'";
1444 if (pending_fsmap
.compat
.compat
.contains(f
)) {
1445 ss
<< "removing compat feature " << f
;
1446 CompatSet modified
= pending_fsmap
.compat
;
1447 modified
.compat
.remove(f
);
1448 pending_fsmap
.update_compat(modified
);
1450 ss
<< "compat feature " << f
<< " not present in " << pending_fsmap
.compat
;
1453 } else if (prefix
== "mds compat rm_incompat") {
1455 if (!cmd_getval(g_ceph_context
, cmdmap
, "feature", f
)) {
1456 ss
<< "error parsing feature value '"
1457 << cmd_vartype_stringify(cmdmap
["feature"]) << "'";
1460 if (pending_fsmap
.compat
.incompat
.contains(f
)) {
1461 ss
<< "removing incompat feature " << f
;
1462 CompatSet modified
= pending_fsmap
.compat
;
1463 modified
.incompat
.remove(f
);
1464 pending_fsmap
.update_compat(modified
);
1466 ss
<< "incompat feature " << f
<< " not present in " << pending_fsmap
.compat
;
1469 } else if (prefix
== "mds repaired") {
1470 std::string role_str
;
1471 cmd_getval(g_ceph_context
, cmdmap
, "rank", role_str
);
1473 r
= parse_role(role_str
, &role
, ss
);
1478 bool modified
= pending_fsmap
.undamaged(role
.fscid
, role
.rank
);
1480 dout(4) << "repaired: restoring rank " << role
<< dendl
;
1482 dout(4) << "repaired: no-op on rank " << role
<< dendl
;
1494 * Helper to legacy_filesystem_command
1496 void MDSMonitor::modify_legacy_filesystem(
1497 std::function
<void(std::shared_ptr
<Filesystem
> )> fn
)
1499 pending_fsmap
.modify_filesystem(
1500 pending_fsmap
.legacy_client_fscid
,
1508 * Handle a command that affects the filesystem (i.e. a filesystem
1509 * must exist for the command to act upon).
1511 * @retval 0 Command was successfully handled and has side effects
1512 * @retval -EAGAIN Messages has been requeued for retry
1513 * @retval -ENOSYS Unknown command
1514 * @retval < 0 An error has occurred; **ss** may have been set.
1516 int MDSMonitor::legacy_filesystem_command(
1518 std::string
const &prefix
,
1519 map
<string
, cmd_vartype
> &cmdmap
,
1520 std::stringstream
&ss
)
1522 dout(4) << __func__
<< " prefix='" << prefix
<< "'" << dendl
;
1523 op
->mark_mdsmon_event(__func__
);
1526 cmd_getval(g_ceph_context
, cmdmap
, "who", whostr
);
1528 assert (pending_fsmap
.legacy_client_fscid
!= FS_CLUSTER_ID_NONE
);
1530 if (prefix
== "mds set_max_mds") {
1531 // NOTE: deprecated by "fs set max_mds"
1533 if (!cmd_getval(g_ceph_context
, cmdmap
, "maxmds", maxmds
) || maxmds
<= 0) {
1537 const MDSMap
& mdsmap
=
1538 pending_fsmap
.filesystems
.at(pending_fsmap
.legacy_client_fscid
)->mds_map
;
1540 if (!mdsmap
.allows_multimds() &&
1541 maxmds
> mdsmap
.get_max_mds() &&
1543 ss
<< "multi-MDS clusters are not enabled; set 'allow_multimds' to enable";
1547 if (maxmds
> MAX_MDS
) {
1548 ss
<< "may not have more than " << MAX_MDS
<< " MDS ranks";
1552 modify_legacy_filesystem(
1553 [maxmds
](std::shared_ptr
<Filesystem
> fs
)
1555 fs
->mds_map
.set_max_mds(maxmds
);
1559 ss
<< "max_mds = " << maxmds
;
1560 } else if (prefix
== "mds cluster_down") {
1561 // NOTE: deprecated by "fs set cluster_down"
1562 modify_legacy_filesystem(
1563 [](std::shared_ptr
<Filesystem
> fs
)
1565 fs
->mds_map
.set_flag(CEPH_MDSMAP_DOWN
);
1567 ss
<< "marked fsmap DOWN";
1569 } else if (prefix
== "mds cluster_up") {
1570 // NOTE: deprecated by "fs set cluster_up"
1571 modify_legacy_filesystem(
1572 [](std::shared_ptr
<Filesystem
> fs
)
1574 fs
->mds_map
.clear_flag(CEPH_MDSMAP_DOWN
);
1576 ss
<< "unmarked fsmap DOWN";
1586 void MDSMonitor::check_subs()
1588 std::list
<std::string
> types
;
1590 // Subscriptions may be to "mdsmap" (MDS and legacy clients),
1591 // "mdsmap.<namespace>", or to "fsmap" for the full state of all
1592 // filesystems. Build a list of all the types we service
1593 // subscriptions for.
1594 types
.push_back("fsmap");
1595 types
.push_back("fsmap.user");
1596 types
.push_back("mdsmap");
1597 for (const auto &i
: fsmap
.filesystems
) {
1598 auto fscid
= i
.first
;
1599 std::ostringstream oss
;
1600 oss
<< "mdsmap." << fscid
;
1601 types
.push_back(oss
.str());
1604 for (const auto &type
: types
) {
1605 if (mon
->session_map
.subs
.count(type
) == 0)
1607 xlist
<Subscription
*>::iterator p
= mon
->session_map
.subs
[type
]->begin();
1609 Subscription
*sub
= *p
;
1617 void MDSMonitor::check_sub(Subscription
*sub
)
1619 dout(20) << __func__
<< ": " << sub
->type
<< dendl
;
1621 if (sub
->type
== "fsmap") {
1622 if (sub
->next
<= fsmap
.get_epoch()) {
1623 sub
->session
->con
->send_message(new MFSMap(mon
->monmap
->fsid
, fsmap
));
1625 mon
->session_map
.remove_sub(sub
);
1627 sub
->next
= fsmap
.get_epoch() + 1;
1630 } else if (sub
->type
== "fsmap.user") {
1631 if (sub
->next
<= fsmap
.get_epoch()) {
1633 fsmap_u
.epoch
= fsmap
.get_epoch();
1634 fsmap_u
.legacy_client_fscid
= fsmap
.legacy_client_fscid
;
1635 for (auto p
= fsmap
.filesystems
.begin();
1636 p
!= fsmap
.filesystems
.end();
1638 FSMapUser::fs_info_t
& fs_info
= fsmap_u
.filesystems
[p
->first
];
1639 fs_info
.cid
= p
->first
;
1640 fs_info
.name
= p
->second
->mds_map
.fs_name
;
1642 sub
->session
->con
->send_message(new MFSMapUser(mon
->monmap
->fsid
, fsmap_u
));
1644 mon
->session_map
.remove_sub(sub
);
1646 sub
->next
= fsmap
.get_epoch() + 1;
1649 } else if (sub
->type
.compare(0, 6, "mdsmap") == 0) {
1650 if (sub
->next
> fsmap
.get_epoch()) {
1654 const bool is_mds
= sub
->session
->inst
.name
.is_mds();
1655 mds_gid_t mds_gid
= MDS_GID_NONE
;
1656 fs_cluster_id_t fscid
= FS_CLUSTER_ID_NONE
;
1658 // What (if any) namespace are you assigned to?
1659 auto mds_info
= fsmap
.get_mds_info();
1660 for (const auto &i
: mds_info
) {
1661 if (i
.second
.addr
== sub
->session
->inst
.addr
) {
1663 fscid
= fsmap
.mds_roles
.at(mds_gid
);
1667 // You're a client. Did you request a particular
1669 if (sub
->type
.find("mdsmap.") == 0) {
1670 auto namespace_id_str
= sub
->type
.substr(std::string("mdsmap.").size());
1671 dout(10) << __func__
<< ": namespace_id " << namespace_id_str
<< dendl
;
1673 fscid
= strict_strtoll(namespace_id_str
.c_str(), 10, &err
);
1675 // Client asked for a non-existent namespace, send them nothing
1676 dout(1) << "Invalid client subscription '" << sub
->type
1680 if (fsmap
.filesystems
.count(fscid
) == 0) {
1681 // Client asked for a non-existent namespace, send them nothing
1682 // TODO: something more graceful for when a client has a filesystem
1683 // mounted, and the fileysstem is deleted. Add a "shut down you fool"
1685 dout(1) << "Client subscribed to non-existent namespace '" <<
1686 fscid
<< "'" << dendl
;
1690 // Unqualified request for "mdsmap": give it the one marked
1691 // for use by legacy clients.
1692 if (fsmap
.legacy_client_fscid
!= FS_CLUSTER_ID_NONE
) {
1693 fscid
= fsmap
.legacy_client_fscid
;
1695 dout(1) << "Client subscribed for legacy filesystem but "
1696 "none is configured" << dendl
;
1701 dout(10) << __func__
<< ": is_mds=" << is_mds
<< ", fscid= " << fscid
<< dendl
;
1703 // Work out the effective latest epoch
1704 MDSMap
*mds_map
= nullptr;
1706 null_map
.compat
= fsmap
.compat
;
1707 if (fscid
== FS_CLUSTER_ID_NONE
) {
1708 // For a client, we should have already dropped out
1711 if (fsmap
.standby_daemons
.count(mds_gid
)) {
1712 // For an MDS, we need to feed it an MDSMap with its own state in
1713 null_map
.mds_info
[mds_gid
] = fsmap
.standby_daemons
[mds_gid
];
1714 null_map
.epoch
= fsmap
.standby_epochs
[mds_gid
];
1716 null_map
.epoch
= fsmap
.epoch
;
1718 mds_map
= &null_map
;
1720 // Check the effective epoch
1721 mds_map
= &(fsmap
.filesystems
.at(fscid
)->mds_map
);
1724 assert(mds_map
!= nullptr);
1725 dout(10) << __func__
<< " selected MDS map epoch " <<
1726 mds_map
->epoch
<< " for namespace " << fscid
<< " for subscriber "
1727 << sub
->session
->inst
.name
<< " who wants epoch " << sub
->next
<< dendl
;
1729 if (sub
->next
> mds_map
->epoch
) {
1732 auto msg
= new MMDSMap(mon
->monmap
->fsid
, mds_map
);
1734 sub
->session
->con
->send_message(msg
);
1736 mon
->session_map
.remove_sub(sub
);
1738 sub
->next
= mds_map
->get_epoch() + 1;
1744 void MDSMonitor::update_metadata(mds_gid_t gid
,
1745 const map
<string
, string
>& metadata
)
1747 if (metadata
.empty()) {
1750 pending_metadata
[gid
] = metadata
;
1752 MonitorDBStore::TransactionRef t
= paxos
->get_pending_transaction();
1754 ::encode(pending_metadata
, bl
);
1755 t
->put(MDS_METADATA_PREFIX
, "last_metadata", bl
);
1756 paxos
->trigger_propose();
1759 void MDSMonitor::remove_from_metadata(MonitorDBStore::TransactionRef t
)
1761 bool update
= false;
1762 for (map
<mds_gid_t
, Metadata
>::iterator i
= pending_metadata
.begin();
1763 i
!= pending_metadata
.end(); ) {
1764 if (!pending_fsmap
.gid_exists(i
->first
)) {
1765 pending_metadata
.erase(i
++);
1774 ::encode(pending_metadata
, bl
);
1775 t
->put(MDS_METADATA_PREFIX
, "last_metadata", bl
);
1778 int MDSMonitor::load_metadata(map
<mds_gid_t
, Metadata
>& m
)
1781 int r
= mon
->store
->get(MDS_METADATA_PREFIX
, "last_metadata", bl
);
1783 dout(1) << "Unable to load 'last_metadata'" << dendl
;
1787 bufferlist::iterator it
= bl
.begin();
1792 void MDSMonitor::count_metadata(const string
& field
, Formatter
*f
)
1794 map
<string
,int> by_val
;
1795 map
<mds_gid_t
,Metadata
> meta
;
1796 load_metadata(meta
);
1797 for (auto& p
: meta
) {
1798 auto q
= p
.second
.find(field
);
1799 if (q
== p
.second
.end()) {
1800 by_val
["unknown"]++;
1802 by_val
[q
->second
]++;
1805 f
->open_object_section(field
.c_str());
1806 for (auto& p
: by_val
) {
1807 f
->dump_int(p
.first
.c_str(), p
.second
);
1812 int MDSMonitor::dump_metadata(const std::string
&who
, Formatter
*f
, ostream
& err
)
1816 mds_gid_t gid
= gid_from_arg(who
, err
);
1817 if (gid
== MDS_GID_NONE
) {
1821 map
<mds_gid_t
, Metadata
> metadata
;
1822 if (int r
= load_metadata(metadata
)) {
1823 err
<< "Unable to load 'last_metadata'";
1827 if (!metadata
.count(gid
)) {
1830 const Metadata
& m
= metadata
[gid
];
1831 for (Metadata::const_iterator p
= m
.begin(); p
!= m
.end(); ++p
) {
1832 f
->dump_string(p
->first
.c_str(), p
->second
);
1837 int MDSMonitor::print_nodes(Formatter
*f
)
1841 map
<mds_gid_t
, Metadata
> metadata
;
1842 if (int r
= load_metadata(metadata
)) {
1846 map
<string
, list
<int> > mdses
; // hostname => rank
1847 for (map
<mds_gid_t
, Metadata
>::iterator it
= metadata
.begin();
1848 it
!= metadata
.end(); ++it
) {
1849 const Metadata
& m
= it
->second
;
1850 Metadata::const_iterator hostname
= m
.find("hostname");
1851 if (hostname
== m
.end()) {
1852 // not likely though
1855 const mds_gid_t gid
= it
->first
;
1856 if (!fsmap
.gid_exists(gid
)) {
1857 dout(5) << __func__
<< ": GID " << gid
<< " not existent" << dendl
;
1860 const MDSMap::mds_info_t
& mds_info
= fsmap
.get_info_gid(gid
);
1861 // FIXME: include filesystem name with rank here
1862 mdses
[hostname
->second
].push_back(mds_info
.rank
);
1865 dump_services(f
, mdses
, "mds");
1870 * If a cluster is undersized (with respect to max_mds), then
1871 * attempt to find daemons to grow it.
1873 bool MDSMonitor::maybe_expand_cluster(std::shared_ptr
<Filesystem
> fs
)
1875 bool do_propose
= false;
1877 if (fs
->mds_map
.test_flag(CEPH_MDSMAP_DOWN
)) {
1881 while (fs
->mds_map
.get_num_in_mds() < size_t(fs
->mds_map
.get_max_mds()) &&
1882 !fs
->mds_map
.is_degraded()) {
1883 mds_rank_t mds
= mds_rank_t(0);
1885 while (fs
->mds_map
.is_in(mds
)) {
1888 mds_gid_t newgid
= pending_fsmap
.find_replacement_for({fs
->fscid
, mds
},
1889 name
, g_conf
->mon_force_standby_active
);
1890 if (newgid
== MDS_GID_NONE
) {
1894 dout(1) << "adding standby " << pending_fsmap
.get_info_gid(newgid
).addr
1895 << " as mds." << mds
<< dendl
;
1896 pending_fsmap
.promote(newgid
, fs
, mds
);
1905 * If a daemon is laggy, and a suitable replacement
1906 * is available, fail this daemon (remove from map) and pass its
1907 * role to another daemon.
1909 void MDSMonitor::maybe_replace_gid(mds_gid_t gid
,
1910 const beacon_info_t
&beacon
,
1911 bool *mds_propose
, bool *osd_propose
)
1913 assert(mds_propose
!= nullptr);
1914 assert(osd_propose
!= nullptr);
1916 const MDSMap::mds_info_t info
= pending_fsmap
.get_info_gid(gid
);
1917 const auto fscid
= pending_fsmap
.mds_roles
.at(gid
);
1919 dout(10) << "no beacon from " << gid
<< " " << info
.addr
<< " mds."
1920 << info
.rank
<< "." << info
.inc
1921 << " " << ceph_mds_state_name(info
.state
)
1922 << " since " << beacon
.stamp
<< dendl
;
1924 // We will only take decisive action (replacing/removing a daemon)
1925 // if we have some indicating that some other daemon(s) are successfully
1926 // getting beacons through recently.
1927 utime_t latest_beacon
;
1928 for (const auto & i
: last_beacon
) {
1929 latest_beacon
= MAX(i
.second
.stamp
, latest_beacon
);
1931 const bool may_replace
= latest_beacon
>
1933 MAX(g_conf
->mds_beacon_interval
, g_conf
->mds_beacon_grace
* 0.5));
1936 // and is there a non-laggy standby that can take over for us?
1938 if (info
.rank
>= 0 &&
1939 info
.state
!= MDSMap::STATE_STANDBY
&&
1940 info
.state
!= MDSMap::STATE_STANDBY_REPLAY
&&
1942 !pending_fsmap
.get_filesystem(fscid
)->mds_map
.test_flag(CEPH_MDSMAP_DOWN
) &&
1943 (sgid
= pending_fsmap
.find_replacement_for({fscid
, info
.rank
}, info
.name
,
1944 g_conf
->mon_force_standby_active
)) != MDS_GID_NONE
)
1947 MDSMap::mds_info_t si
= pending_fsmap
.get_info_gid(sgid
);
1948 dout(10) << " replacing " << gid
<< " " << info
.addr
<< " mds."
1949 << info
.rank
<< "." << info
.inc
1950 << " " << ceph_mds_state_name(info
.state
)
1951 << " with " << sgid
<< "/" << si
.name
<< " " << si
.addr
<< dendl
;
1953 mon
->clog
->warn() << "MDS daemon '" << info
.name
<< "'"
1954 << " is not responding, replacing it "
1955 << "as rank " << info
.rank
1956 << " with standby '" << si
.name
<< "'";
1958 // Remember what NS the old one was in
1959 const fs_cluster_id_t fscid
= pending_fsmap
.mds_roles
.at(gid
);
1961 // Remove the old one
1962 *osd_propose
|= fail_mds_gid(gid
);
1964 // Promote the replacement
1965 auto fs
= pending_fsmap
.filesystems
.at(fscid
);
1966 pending_fsmap
.promote(sgid
, fs
, info
.rank
);
1968 *mds_propose
= true;
1969 } else if ((info
.state
== MDSMap::STATE_STANDBY_REPLAY
||
1970 info
.state
== MDSMap::STATE_STANDBY
) && may_replace
) {
1971 dout(10) << " failing and removing " << gid
<< " " << info
.addr
<< " mds." << info
.rank
1972 << "." << info
.inc
<< " " << ceph_mds_state_name(info
.state
)
1974 mon
->clog
->info() << "MDS standby '" << info
.name
1975 << "' is not responding, removing it from the set of "
1978 *mds_propose
= true;
1979 } else if (!info
.laggy()) {
1980 dout(10) << " marking " << gid
<< " " << info
.addr
<< " mds." << info
.rank
<< "." << info
.inc
1981 << " " << ceph_mds_state_name(info
.state
)
1982 << " laggy" << dendl
;
1983 pending_fsmap
.modify_daemon(info
.global_id
, [](MDSMap::mds_info_t
*info
) {
1984 info
->laggy_since
= ceph_clock_now();
1986 *mds_propose
= true;
1990 bool MDSMonitor::maybe_promote_standby(std::shared_ptr
<Filesystem
> fs
)
1992 assert(!fs
->mds_map
.test_flag(CEPH_MDSMAP_DOWN
));
1994 bool do_propose
= false;
1996 // have a standby take over?
1997 set
<mds_rank_t
> failed
;
1998 fs
->mds_map
.get_failed_mds_set(failed
);
1999 if (!failed
.empty()) {
2000 set
<mds_rank_t
>::iterator p
= failed
.begin();
2001 while (p
!= failed
.end()) {
2002 mds_rank_t f
= *p
++;
2003 mds_gid_t sgid
= pending_fsmap
.find_replacement_for({fs
->fscid
, f
}, {},
2004 g_conf
->mon_force_standby_active
);
2006 const MDSMap::mds_info_t si
= pending_fsmap
.get_info_gid(sgid
);
2007 dout(0) << " taking over failed mds." << f
<< " with " << sgid
2008 << "/" << si
.name
<< " " << si
.addr
<< dendl
;
2009 pending_fsmap
.promote(sgid
, fs
, f
);
2014 // There were no failures to replace, so try using any available standbys
2015 // as standby-replay daemons.
2017 // Take a copy of the standby GIDs so that we can iterate over
2018 // them while perhaps-modifying standby_daemons during the loop
2019 // (if we promote anyone they are removed from standby_daemons)
2020 std::vector
<mds_gid_t
> standby_gids
;
2021 for (const auto &j
: pending_fsmap
.standby_daemons
) {
2022 standby_gids
.push_back(j
.first
);
2025 for (const auto &gid
: standby_gids
) {
2026 const auto &info
= pending_fsmap
.standby_daemons
.at(gid
);
2027 assert(info
.state
== MDSMap::STATE_STANDBY
);
2029 if (!info
.standby_replay
) {
2034 * This mds is standby but has no rank assigned.
2035 * See if we can find it somebody to shadow
2037 dout(20) << "gid " << gid
<< " is standby and following nobody" << dendl
;
2039 // standby for someone specific?
2040 if (info
.standby_for_rank
>= 0) {
2041 // The mds_info_t may or may not tell us exactly which filesystem
2042 // the standby_for_rank refers to: lookup via legacy_client_fscid
2043 mds_role_t target_role
= {
2044 info
.standby_for_fscid
== FS_CLUSTER_ID_NONE
?
2045 pending_fsmap
.legacy_client_fscid
: info
.standby_for_fscid
,
2046 info
.standby_for_rank
};
2048 // It is possible that the map contains a standby_for_fscid
2049 // that doesn't correspond to an existing filesystem, especially
2050 // if we loaded from a version with a bug (#17466)
2051 if (info
.standby_for_fscid
!= FS_CLUSTER_ID_NONE
2052 && !pending_fsmap
.filesystem_exists(info
.standby_for_fscid
)) {
2053 derr
<< "gid " << gid
<< " has invalid standby_for_fscid "
2054 << info
.standby_for_fscid
<< dendl
;
2058 // If we managed to resolve a full target role
2059 if (target_role
.fscid
!= FS_CLUSTER_ID_NONE
) {
2060 auto fs
= pending_fsmap
.get_filesystem(target_role
.fscid
);
2061 if (fs
->mds_map
.is_followable(target_role
.rank
)) {
2062 do_propose
|= try_standby_replay(
2065 fs
->mds_map
.get_info(target_role
.rank
));
2073 for (auto fs_i
: pending_fsmap
.filesystems
) {
2074 const MDSMap
&mds_map
= fs_i
.second
->mds_map
;
2075 for (auto mds_i
: mds_map
.mds_info
) {
2076 MDSMap::mds_info_t
&cand_info
= mds_i
.second
;
2077 if (cand_info
.rank
>= 0 && mds_map
.is_followable(cand_info
.rank
)) {
2078 if ((info
.standby_for_name
.length() && info
.standby_for_name
!= cand_info
.name
) ||
2079 info
.standby_for_rank
!= MDS_RANK_NONE
) {
2080 continue; // we're supposed to follow someone else
2083 if (try_standby_replay(info
, *(fs_i
.second
), cand_info
)) {
2097 void MDSMonitor::tick()
2099 // make sure mds's are still alive
2100 // ...if i am an active leader
2101 if (!is_active()) return;
2103 dout(10) << fsmap
<< dendl
;
2105 bool do_propose
= false;
2107 if (!mon
->is_leader()) return;
2109 do_propose
|= pending_fsmap
.check_health();
2111 // expand mds cluster (add new nodes to @in)?
2112 for (auto i
: pending_fsmap
.filesystems
) {
2113 do_propose
|= maybe_expand_cluster(i
.second
);
2116 const auto now
= ceph_clock_now();
2117 if (last_tick
.is_zero()) {
2121 if (now
- last_tick
> (g_conf
->mds_beacon_grace
- g_conf
->mds_beacon_interval
)) {
2122 // This case handles either local slowness (calls being delayed
2123 // for whatever reason) or cluster election slowness (a long gap
2124 // between calls while an election happened)
2125 dout(4) << __func__
<< ": resetting beacon timeouts due to mon delay "
2126 "(slow election?) of " << now
- last_tick
<< " seconds" << dendl
;
2127 for (auto &i
: last_beacon
) {
2128 i
.second
.stamp
= now
;
2134 // check beacon timestamps
2135 utime_t cutoff
= now
;
2136 cutoff
-= g_conf
->mds_beacon_grace
;
2138 // make sure last_beacon is fully populated
2139 for (const auto &p
: pending_fsmap
.mds_roles
) {
2140 auto &gid
= p
.first
;
2141 if (last_beacon
.count(gid
) == 0) {
2142 last_beacon
[gid
].stamp
= now
;
2143 last_beacon
[gid
].seq
= 0;
2147 // If the OSDMap is writeable, we can blacklist things, so we can
2148 // try failing any laggy MDS daemons. Consider each one for failure.
2149 if (mon
->osdmon()->is_writeable()) {
2150 bool propose_osdmap
= false;
2152 map
<mds_gid_t
, beacon_info_t
>::iterator p
= last_beacon
.begin();
2153 while (p
!= last_beacon
.end()) {
2154 mds_gid_t gid
= p
->first
;
2155 auto beacon_info
= p
->second
;
2158 if (!pending_fsmap
.gid_exists(gid
)) {
2160 last_beacon
.erase(gid
);
2164 if (beacon_info
.stamp
< cutoff
) {
2165 maybe_replace_gid(gid
, beacon_info
, &do_propose
, &propose_osdmap
);
2169 if (propose_osdmap
) {
2170 request_proposal(mon
->osdmon());
2174 for (auto i
: pending_fsmap
.filesystems
) {
2176 if (!fs
->mds_map
.test_flag(CEPH_MDSMAP_DOWN
)) {
2177 do_propose
|= maybe_promote_standby(fs
);
2187 * finfo: the would-be follower
2188 * leader_fs: the Filesystem containing the would-be leader
2189 * ainfo: the would-be leader
2191 bool MDSMonitor::try_standby_replay(
2192 const MDSMap::mds_info_t
& finfo
,
2193 const Filesystem
&leader_fs
,
2194 const MDSMap::mds_info_t
& ainfo
)
2196 // someone else already following?
2197 if (leader_fs
.has_standby_replay(ainfo
.global_id
)) {
2198 dout(20) << " mds." << ainfo
.rank
<< " already has a follower" << dendl
;
2201 // Assign the new role to the standby
2202 dout(10) << " setting to follow mds rank " << ainfo
.rank
<< dendl
;
2203 pending_fsmap
.assign_standby_replay(finfo
.global_id
, leader_fs
.fscid
, ainfo
.rank
);
2208 MDSMonitor::MDSMonitor(Monitor
*mn
, Paxos
*p
, string service_name
)
2209 : PaxosService(mn
, p
, service_name
)
2211 handlers
= FileSystemCommandHandler::load();
2214 void MDSMonitor::on_restart()
2216 // Clear out the leader-specific state.
2217 last_tick
= utime_t();
2218 last_beacon
.clear();