1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
17 #include <boost/utility.hpp>
19 #include "MDSMonitor.h"
20 #include "FSCommands.h"
22 #include "MonitorDBStore.h"
23 #include "OSDMonitor.h"
25 #include "common/strtol.h"
26 #include "common/perf_counters.h"
27 #include "common/config.h"
28 #include "common/cmdparse.h"
29 #include "messages/MMDSMap.h"
30 #include "messages/MFSMap.h"
31 #include "messages/MFSMapUser.h"
32 #include "messages/MMDSLoadTargets.h"
33 #include "messages/MMonCommand.h"
34 #include "messages/MGenericMessage.h"
36 #include "include/ceph_assert.h"
37 #include "include/str_list.h"
38 #include "include/stringify.h"
39 #include "mds/mdstypes.h"
42 #define dout_subsys ceph_subsys_mon
44 #define dout_prefix _prefix(_dout, mon, get_fsmap())
45 static ostream
& _prefix(std::ostream
*_dout
, Monitor
*mon
, const FSMap
& fsmap
) {
46 return *_dout
<< "mon." << mon
->name
<< "@" << mon
->rank
47 << "(" << mon
->get_state_name()
48 << ").mds e" << fsmap
.get_epoch() << " ";
51 static const string
MDS_METADATA_PREFIX("mds_metadata");
52 static const string
MDS_HEALTH_PREFIX("mds_health");
56 * Specialized implementation of cmd_getval to allow us to parse
57 * out strongly-typedef'd types
59 template<> bool cmd_getval(CephContext
*cct
, const cmdmap_t
& cmdmap
,
60 const std::string
& k
, mds_gid_t
&val
)
62 return cmd_getval(cct
, cmdmap
, k
, (int64_t&)val
);
65 template<> bool cmd_getval(CephContext
*cct
, const cmdmap_t
& cmdmap
,
66 const std::string
& k
, mds_rank_t
&val
)
68 return cmd_getval(cct
, cmdmap
, k
, (int64_t&)val
);
71 template<> bool cmd_getval(CephContext
*cct
, const cmdmap_t
& cmdmap
,
72 const std::string
& k
, MDSMap::DaemonState
&val
)
74 return cmd_getval(cct
, cmdmap
, k
, (int64_t&)val
);
// Debug helper: opens a log entry at debug level dblV beginning with the text
// "print_map\n".
// NOTE(review): dblV is not declared in the visible lines; callers write
// print_map<0>(...) and print_map<30>(...), so it is presumably an int
// template parameter -- confirm against the full file.
// NOTE(review): the rest of the body (whatever prints `m` and terminates the
// log entry) is missing from this excerpt.
80 void MDSMonitor::print_map(const FSMap
& m
)
82 dout(dblV
) << "print_map\n";
88 void MDSMonitor::create_initial()
90 dout(10) << "create_initial" << dendl
;
93 void MDSMonitor::get_store_prefixes(std::set
<string
>& s
) const
95 s
.insert(service_name
);
96 s
.insert(MDS_METADATA_PREFIX
);
97 s
.insert(MDS_HEALTH_PREFIX
);
100 void MDSMonitor::update_from_paxos(bool *need_bootstrap
)
102 version_t version
= get_last_committed();
103 if (version
== get_fsmap().epoch
)
106 dout(10) << __func__
<< " version " << version
107 << ", my e " << get_fsmap().epoch
<< dendl
;
108 ceph_assert(version
> get_fsmap().epoch
);
115 int err
= get_version(version
, fsmap_bl
);
116 ceph_assert(err
== 0);
118 ceph_assert(fsmap_bl
.length() > 0);
119 dout(10) << __func__
<< " got " << version
<< dendl
;
120 PaxosFSMap::decode(fsmap_bl
);
123 dout(0) << "new map" << dendl
;
124 print_map
<0>(get_fsmap());
125 if (!g_conf()->mon_mds_skip_sanity
) {
126 get_fsmap().sanity();
132 void MDSMonitor::init()
134 (void)load_metadata(pending_metadata
);
137 void MDSMonitor::create_pending()
139 auto &fsmap
= PaxosFSMap::create_pending();
141 if (mon
->osdmon()->is_readable()) {
142 const auto &osdmap
= mon
->osdmon()->osdmap
;
143 fsmap
.sanitize([&osdmap
](int64_t pool
){return osdmap
.have_pg_pool(pool
);});
146 dout(10) << "create_pending e" << fsmap
.epoch
<< dendl
;
149 void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
151 auto &pending
= get_pending_fsmap_writeable();
152 auto &epoch
= pending
.epoch
;
154 dout(10) << "encode_pending e" << epoch
<< dendl
;
156 // print map iff 'debug mon = 30' or higher
157 print_map
<30>(pending
);
158 if (!g_conf()->mon_mds_skip_sanity
) {
162 // Set 'modified' on maps modified this epoch
163 for (auto &p
: pending
.filesystems
) {
164 if (p
.second
->mds_map
.epoch
== epoch
) {
165 p
.second
->mds_map
.modified
= ceph_clock_now();
170 ceph_assert(get_last_committed() + 1 == pending
.epoch
);
171 bufferlist pending_bl
;
172 pending
.encode(pending_bl
, mon
->get_quorum_con_features());
174 /* put everything in the transaction */
175 put_version(t
, pending
.epoch
, pending_bl
);
176 put_last_committed(t
, pending
.epoch
);
178 // Encode MDSHealth data
179 for (std::map
<uint64_t, MDSHealth
>::iterator i
= pending_daemon_health
.begin();
180 i
!= pending_daemon_health
.end(); ++i
) {
182 i
->second
.encode(bl
);
183 t
->put(MDS_HEALTH_PREFIX
, stringify(i
->first
), bl
);
186 for (std::set
<uint64_t>::iterator i
= pending_daemon_health_rm
.begin();
187 i
!= pending_daemon_health_rm
.end(); ++i
) {
188 t
->erase(MDS_HEALTH_PREFIX
, stringify(*i
));
190 pending_daemon_health_rm
.clear();
191 remove_from_metadata(pending
, t
);
194 health_check_map_t new_checks
;
195 const auto &info_map
= pending
.get_mds_info();
196 for (const auto &i
: info_map
) {
197 const auto &gid
= i
.first
;
198 const auto &info
= i
.second
;
199 if (pending_daemon_health_rm
.count(gid
)) {
203 auto p
= pending_daemon_health
.find(gid
);
204 if (p
!= pending_daemon_health
.end()) {
208 mon
->store
->get(MDS_HEALTH_PREFIX
, stringify(gid
), bl
);
210 derr
<< "Missing health data for MDS " << gid
<< dendl
;
213 auto bl_i
= bl
.cbegin();
216 for (const auto &metric
: health
.metrics
) {
217 const int rank
= info
.rank
;
218 health_check_t
*check
= &new_checks
.get_or_add(
219 mds_metric_name(metric
.type
),
221 mds_metric_summary(metric
.type
));
223 ss
<< "mds" << info
.name
<< "(mds." << rank
<< "): " << metric
.message
;
225 for (auto &p
: metric
.metadata
) {
231 ss
<< p
.first
<< ": " << p
.second
;
234 check
->detail
.push_back(ss
.str());
237 pending
.get_health_checks(&new_checks
);
238 for (auto& p
: new_checks
.checks
) {
239 p
.second
.summary
= std::regex_replace(
242 stringify(p
.second
.detail
.size()));
243 p
.second
.summary
= std::regex_replace(
245 std::regex("%plurals%"),
246 p
.second
.detail
.size() > 1 ? "s" : "");
247 p
.second
.summary
= std::regex_replace(
249 std::regex("%isorare%"),
250 p
.second
.detail
.size() > 1 ? "are" : "is");
251 p
.second
.summary
= std::regex_replace(
253 std::regex("%hasorhave%"),
254 p
.second
.detail
.size() > 1 ? "have" : "has");
256 encode_health(new_checks
, t
);
259 version_t
MDSMonitor::get_trim_to() const
262 if (g_conf()->mon_mds_force_trim_to
> 0 &&
263 g_conf()->mon_mds_force_trim_to
< (int)get_last_committed()) {
264 floor
= g_conf()->mon_mds_force_trim_to
;
265 dout(10) << __func__
<< " explicit mon_mds_force_trim_to = "
269 unsigned max
= g_conf()->mon_max_mdsmap_epochs
;
270 version_t last
= get_last_committed();
272 if (last
- get_first_committed() > max
&& floor
< last
- max
)
277 bool MDSMonitor::preprocess_query(MonOpRequestRef op
)
279 op
->mark_mdsmon_event(__func__
);
280 PaxosServiceMessage
*m
= static_cast<PaxosServiceMessage
*>(op
->get_req());
281 dout(10) << "preprocess_query " << *m
<< " from " << m
->get_orig_source()
282 << " " << m
->get_orig_source_addrs() << dendl
;
284 switch (m
->get_type()) {
287 return preprocess_beacon(op
);
289 case MSG_MON_COMMAND
:
291 return preprocess_command(op
);
292 } catch (const bad_cmd_get
& e
) {
294 mon
->reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
298 case MSG_MDS_OFFLOAD_TARGETS
:
299 return preprocess_offload_targets(op
);
// Records when a beacon from an MDS daemon was received.
// The sender's global id is used as the key into last_beacon (operator[]
// creates the entry if it does not exist yet) and the entry's stamp is set
// to the current mono_clock time.
// NOTE(review): `seq` is read from the message but not used in the lines
// visible here; the statement that consumes it appears to be missing from
// this excerpt -- confirm against the full file.
307 void MDSMonitor::_note_beacon(MMDSBeacon
*m
)
309 mds_gid_t gid
= mds_gid_t(m
->get_global_id());
310 version_t seq
= m
->get_seq();
312 dout(5) << "_note_beacon " << *m
<< " noting time" << dendl
;
// operator[] inserts a default entry for gid when none exists.
313 auto &beacon
= last_beacon
[gid
];
314 beacon
.stamp
= mono_clock::now();
318 bool MDSMonitor::preprocess_beacon(MonOpRequestRef op
)
320 op
->mark_mdsmon_event(__func__
);
321 MMDSBeacon
*m
= static_cast<MMDSBeacon
*>(op
->get_req());
322 MDSMap::DaemonState state
= m
->get_state();
323 mds_gid_t gid
= m
->get_global_id();
324 version_t seq
= m
->get_seq();
325 MDSMap::mds_info_t info
;
326 epoch_t effective_epoch
= 0;
328 const auto &fsmap
= get_fsmap();
330 // check privileges, ignore if fails
331 MonSession
*session
= op
->get_session();
334 if (!session
->is_capable("mds", MON_CAP_X
)) {
335 dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
336 << session
->caps
<< dendl
;
340 if (m
->get_fsid() != mon
->monmap
->fsid
) {
341 dout(0) << "preprocess_beacon on fsid " << m
->get_fsid() << " != " << mon
->monmap
->fsid
<< dendl
;
345 dout(5) << "preprocess_beacon " << *m
346 << " from " << m
->get_orig_source()
347 << " " << m
->get_orig_source_addrs()
348 << " " << m
->get_compat()
351 // make sure the address has a port
352 if (m
->get_orig_source_addr().get_port() == 0) {
353 dout(1) << " ignoring boot message without a port" << dendl
;
358 if (!m
->get_compat().writeable(fsmap
.compat
)) {
359 dout(1) << " mds " << m
->get_orig_source()
360 << " " << m
->get_orig_source_addrs()
361 << " can't write to fsmap " << fsmap
.compat
<< dendl
;
369 // booted, but not in map?
370 if (!fsmap
.gid_exists(gid
)) {
371 if (state
!= MDSMap::STATE_BOOT
) {
372 dout(7) << "mds_beacon " << *m
<< " is not in fsmap (state "
373 << ceph_mds_state_name(state
) << ")" << dendl
;
375 /* We can't send an MDSMap this MDS was a part of because we no longer
376 * know which FS it was part of. Nor does this matter. Sending an empty
377 * MDSMap is sufficient for getting the MDS to respawn.
380 null_map
.epoch
= fsmap
.epoch
;
381 null_map
.compat
= fsmap
.compat
;
382 auto m
= MMDSMap::create(mon
->monmap
->fsid
, null_map
);
383 mon
->send_reply(op
, m
.detach());
386 return false; // not booted yet.
389 dout(10) << __func__
<< ": GID exists in map: " << gid
<< dendl
;
390 info
= fsmap
.get_info_gid(gid
);
393 if (info
.state_seq
> seq
) {
394 dout(7) << "mds_beacon " << *m
<< " has old seq, ignoring" << dendl
;
398 // Work out the latest epoch that this daemon should have seen
400 fs_cluster_id_t fscid
= fsmap
.mds_roles
.at(gid
);
401 if (fscid
== FS_CLUSTER_ID_NONE
) {
402 effective_epoch
= fsmap
.standby_epochs
.at(gid
);
404 effective_epoch
= fsmap
.get_filesystem(fscid
)->mds_map
.epoch
;
406 if (effective_epoch
!= m
->get_last_epoch_seen()) {
407 dout(10) << "mds_beacon " << *m
408 << " ignoring requested state, because mds hasn't seen latest map" << dendl
;
415 return false; // no longer laggy, need to update map.
417 if (state
== MDSMap::STATE_BOOT
) {
418 // ignore, already booted.
421 // is there a state change here?
422 if (info
.state
!= state
) {
423 // legal state change?
424 if ((info
.state
== MDSMap::STATE_STANDBY
||
425 info
.state
== MDSMap::STATE_STANDBY_REPLAY
) && state
> 0) {
426 dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info
.state
)
427 << " -> " << ceph_mds_state_name(state
) << ")" << dendl
;
431 if ((state
== MDSMap::STATE_STANDBY
|| state
== MDSMap::STATE_STANDBY_REPLAY
)
432 && info
.rank
!= MDS_RANK_NONE
)
434 dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
435 "held rank " << info
.rank
<< " while requesting state "
436 << ceph_mds_state_name(state
) << dendl
;
444 // Comparing known daemon health with m->get_health()
445 // and return false (i.e. require proposal) if they
446 // do not match, to update our stored
447 if (!(pending_daemon_health
[gid
] == m
->get_health())) {
448 dout(10) << __func__
<< " health metrics for gid " << gid
<< " were updated" << dendl
;
454 // note time and reply
455 ceph_assert(effective_epoch
> 0);
458 auto beacon
= MMDSBeacon::create(mon
->monmap
->fsid
,
459 m
->get_global_id(), m
->get_name(), effective_epoch
,
460 state
, seq
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
461 mon
->send_reply(op
, beacon
.detach());
466 // I won't reply this beacon, drop it.
471 bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op
)
473 op
->mark_mdsmon_event(__func__
);
474 MMDSLoadTargets
*m
= static_cast<MMDSLoadTargets
*>(op
->get_req());
475 dout(10) << "preprocess_offload_targets " << *m
<< " from " << m
->get_orig_source() << dendl
;
477 const auto &fsmap
= get_fsmap();
479 // check privileges, ignore message if fails
480 MonSession
*session
= op
->get_session();
483 if (!session
->is_capable("mds", MON_CAP_X
)) {
484 dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
485 << session
->caps
<< dendl
;
489 if (fsmap
.gid_exists(m
->global_id
) &&
490 m
->targets
== fsmap
.get_info_gid(m
->global_id
).export_targets
)
501 bool MDSMonitor::prepare_update(MonOpRequestRef op
)
503 op
->mark_mdsmon_event(__func__
);
504 PaxosServiceMessage
*m
= static_cast<PaxosServiceMessage
*>(op
->get_req());
505 dout(7) << "prepare_update " << *m
<< dendl
;
507 switch (m
->get_type()) {
510 return prepare_beacon(op
);
512 case MSG_MON_COMMAND
:
514 return prepare_command(op
);
515 } catch (const bad_cmd_get
& e
) {
517 mon
->reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
521 case MSG_MDS_OFFLOAD_TARGETS
:
522 return prepare_offload_targets(op
);
531 bool MDSMonitor::prepare_beacon(MonOpRequestRef op
)
533 op
->mark_mdsmon_event(__func__
);
534 MMDSBeacon
*m
= static_cast<MMDSBeacon
*>(op
->get_req());
535 // -- this is an update --
536 dout(12) << "prepare_beacon " << *m
<< " from " << m
->get_orig_source()
537 << " " << m
->get_orig_source_addrs() << dendl
;
538 entity_addrvec_t addrs
= m
->get_orig_source_addrs();
539 mds_gid_t gid
= m
->get_global_id();
540 MDSMap::DaemonState state
= m
->get_state();
541 version_t seq
= m
->get_seq();
543 auto &pending
= get_pending_fsmap_writeable();
545 dout(15) << __func__
<< " got health from gid " << gid
<< " with " << m
->get_health().metrics
.size() << " metrics." << dendl
;
547 // Calculate deltas of health metrics created and removed
548 // Do this by type rather than MDSHealthMetric equality, because messages can
549 // change a lot when they include e.g. a number of items.
550 const auto &old_health
= pending_daemon_health
[gid
].metrics
;
551 const auto &new_health
= m
->get_health().metrics
;
553 std::set
<mds_metric_t
> old_types
;
554 for (const auto &i
: old_health
) {
555 old_types
.insert(i
.type
);
558 std::set
<mds_metric_t
> new_types
;
559 for (const auto &i
: new_health
) {
560 new_types
.insert(i
.type
);
563 for (const auto &new_metric
: new_health
) {
564 if (old_types
.count(new_metric
.type
) == 0) {
565 dout(10) << "MDS health message (" << m
->get_orig_source()
566 << "): " << new_metric
.sev
<< " " << new_metric
.message
<< dendl
;
570 // Log the disappearance of health messages at INFO
571 for (const auto &old_metric
: old_health
) {
572 if (new_types
.count(old_metric
.type
) == 0) {
573 mon
->clog
->info() << "MDS health message cleared ("
574 << m
->get_orig_source() << "): " << old_metric
.message
;
579 pending_daemon_health
[gid
] = m
->get_health();
582 if (state
== MDSMap::STATE_BOOT
) {
583 // zap previous instance of this name?
584 if (g_conf()->mds_enforce_unique_name
) {
585 bool failed_mds
= false;
586 while (mds_gid_t existing
= pending
.find_mds_gid_by_name(m
->get_name())) {
587 if (!mon
->osdmon()->is_writeable()) {
588 mon
->osdmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
591 const MDSMap::mds_info_t
&existing_info
=
592 pending
.get_info_gid(existing
);
593 mon
->clog
->info() << existing_info
.human_name() << " restarted";
594 fail_mds_gid(pending
, existing
);
598 ceph_assert(mon
->osdmon()->is_writeable());
599 request_proposal(mon
->osdmon());
603 // Add this daemon to the map
604 if (pending
.mds_roles
.count(gid
) == 0) {
605 MDSMap::mds_info_t new_info
;
606 new_info
.global_id
= gid
;
607 new_info
.name
= m
->get_name();
608 new_info
.addrs
= addrs
;
609 new_info
.mds_features
= m
->get_mds_features();
610 new_info
.state
= MDSMap::STATE_STANDBY
;
611 new_info
.state_seq
= seq
;
612 pending
.insert(new_info
);
615 // initialize the beacon timer
616 auto &beacon
= last_beacon
[gid
];
617 beacon
.stamp
= mono_clock::now();
621 if (!pending
.compat
.writeable(m
->get_compat())) {
622 dout(10) << " fsmap " << pending
.compat
623 << " can't write to new mds' " << m
->get_compat()
624 << ", updating fsmap and killing old mds's"
626 pending
.update_compat(m
->get_compat());
629 update_metadata(m
->get_global_id(), m
->get_sys_info());
633 if (!pending
.gid_exists(gid
)) {
634 /* gid has been removed from pending, send null map */
635 dout(5) << "mds_beacon " << *m
<< " is not in fsmap (state "
636 << ceph_mds_state_name(state
) << ")" << dendl
;
638 /* We can't send an MDSMap this MDS was a part of because we no longer
639 * know which FS it was part of. Nor does this matter. Sending an empty
640 * MDSMap is sufficient for getting the MDS to respawn.
642 wait_for_finished_proposal(op
, new FunctionContext([op
, this](int r
){
644 const auto& fsmap
= get_fsmap();
646 null_map
.epoch
= fsmap
.epoch
;
647 null_map
.compat
= fsmap
.compat
;
648 auto m
= MMDSMap::create(mon
->monmap
->fsid
, null_map
);
649 mon
->send_reply(op
, m
.detach());
651 dispatch(op
); // try again
657 const auto& info
= pending
.get_info_gid(gid
);
658 if (info
.state
== MDSMap::STATE_STOPPING
&&
659 state
!= MDSMap::STATE_STOPPING
&&
660 state
!= MDSMap::STATE_STOPPED
) {
661 // we can't transition to any other states from STOPPING
662 dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
669 dout(1) << "prepare_beacon clearing laggy flag on " << addrs
<< dendl
;
670 pending
.modify_daemon(info
.global_id
, [](auto& info
)
677 dout(5) << "prepare_beacon mds." << info
.rank
678 << " " << ceph_mds_state_name(info
.state
)
679 << " -> " << ceph_mds_state_name(state
)
681 if (state
== MDSMap::STATE_STOPPED
) {
682 const auto fscid
= pending
.mds_roles
.at(gid
);
683 const auto &fs
= pending
.get_filesystem(fscid
);
685 mon
->clog
->info() << info
.human_name() << " finished "
686 << "stopping rank " << info
.rank
<< " in filesystem "
687 << fs
->mds_map
.fs_name
<< " (now has "
688 << fs
->mds_map
.get_num_in_mds() - 1 << " ranks)";
690 auto erased
= pending
.stop(gid
);
691 erased
.push_back(gid
);
693 for (const auto &erased_gid
: erased
) {
694 last_beacon
.erase(erased_gid
);
695 if (pending_daemon_health
.count(erased_gid
)) {
696 pending_daemon_health
.erase(erased_gid
);
697 pending_daemon_health_rm
.insert(erased_gid
);
702 } else if (state
== MDSMap::STATE_DAMAGED
) {
703 if (!mon
->osdmon()->is_writeable()) {
704 dout(1) << __func__
<< ": DAMAGED from rank " << info
.rank
705 << " waiting for osdmon writeable to blacklist it" << dendl
;
706 mon
->osdmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
710 // Record this MDS rank as damaged, so that other daemons
711 // won't try to run it.
712 dout(0) << __func__
<< ": marking rank "
713 << info
.rank
<< " damaged" << dendl
;
715 utime_t until
= ceph_clock_now();
716 until
+= g_conf().get_val
<double>("mon_mds_blacklist_interval");
717 const auto blacklist_epoch
= mon
->osdmon()->blacklist(info
.addrs
, until
);
718 request_proposal(mon
->osdmon());
719 pending
.damaged(gid
, blacklist_epoch
);
720 last_beacon
.erase(gid
);
722 // Respond to MDS, so that it knows it can continue to shut down
723 auto beacon
= MMDSBeacon::create(
724 mon
->monmap
->fsid
, m
->get_global_id(),
725 m
->get_name(), pending
.get_epoch(), state
, seq
,
726 CEPH_FEATURES_SUPPORTED_DEFAULT
);
727 mon
->send_reply(op
, beacon
.detach());
728 } else if (state
== MDSMap::STATE_DNE
) {
729 if (!mon
->osdmon()->is_writeable()) {
730 dout(1) << __func__
<< ": DNE from rank " << info
.rank
731 << " waiting for osdmon writeable to blacklist it" << dendl
;
732 mon
->osdmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
736 fail_mds_gid(pending
, gid
);
737 ceph_assert(mon
->osdmon()->is_writeable());
738 request_proposal(mon
->osdmon());
740 // Respond to MDS, so that it knows it can continue to shut down
741 auto beacon
= MMDSBeacon::create(mon
->monmap
->fsid
,
742 m
->get_global_id(), m
->get_name(), pending
.get_epoch(), state
, seq
,
743 CEPH_FEATURES_SUPPORTED_DEFAULT
);
744 mon
->send_reply(op
, beacon
.detach());
745 } else if (info
.state
== MDSMap::STATE_STANDBY
&& state
!= info
.state
) {
746 // Standby daemons should never modify their own
747 // state. Reject any attempts to do so.
748 derr
<< "standby " << gid
<< " attempted to change state to "
749 << ceph_mds_state_name(state
) << ", rejecting" << dendl
;
751 } else if (info
.state
!= MDSMap::STATE_STANDBY
&& state
!= info
.state
&&
752 !MDSMap::state_transition_valid(info
.state
, state
)) {
753 // Validate state transitions for daemons that hold a rank
754 derr
<< "daemon " << gid
<< " (rank " << info
.rank
<< ") "
755 << "reported invalid state transition "
756 << ceph_mds_state_name(info
.state
) << " -> "
757 << ceph_mds_state_name(state
) << dendl
;
760 if (info
.state
!= MDSMap::STATE_ACTIVE
&& state
== MDSMap::STATE_ACTIVE
) {
761 const auto &fscid
= pending
.mds_roles
.at(gid
);
762 const auto &fs
= pending
.get_filesystem(fscid
);
763 mon
->clog
->info() << info
.human_name() << " is now active in "
764 << "filesystem " << fs
->mds_map
.fs_name
<< " as rank "
768 // Made it through special cases and validations, record the
769 // daemon's reported state to the FSMap.
770 pending
.modify_daemon(gid
, [state
, seq
](auto& info
) {
772 info
.state_seq
= seq
;
777 dout(5) << "prepare_beacon pending map now:" << dendl
;
780 wait_for_finished_proposal(op
, new FunctionContext([op
, this](int r
){
782 _updated(op
); // success
783 else if (r
== -ECANCELED
) {
786 dispatch(op
); // try again
793 bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op
)
795 auto &pending
= get_pending_fsmap_writeable();
797 op
->mark_mdsmon_event(__func__
);
798 MMDSLoadTargets
*m
= static_cast<MMDSLoadTargets
*>(op
->get_req());
799 mds_gid_t gid
= m
->global_id
;
800 if (pending
.gid_has_rank(gid
)) {
801 dout(10) << "prepare_offload_targets " << gid
<< " " << m
->targets
<< dendl
;
802 pending
.update_export_targets(gid
, m
->targets
);
804 dout(10) << "prepare_offload_targets " << gid
<< " not in map" << dendl
;
810 bool MDSMonitor::should_propose(double& delay
)
812 // delegate to PaxosService to assess whether we should propose
813 return PaxosService::should_propose(delay
);
816 void MDSMonitor::_updated(MonOpRequestRef op
)
818 const auto &fsmap
= get_fsmap();
819 op
->mark_mdsmon_event(__func__
);
820 MMDSBeacon
*m
= static_cast<MMDSBeacon
*>(op
->get_req());
821 dout(10) << "_updated " << m
->get_orig_source() << " " << *m
<< dendl
;
822 mon
->clog
->debug() << m
->get_orig_source() << " "
823 << m
->get_orig_source_addrs() << " "
824 << ceph_mds_state_name(m
->get_state());
826 if (m
->get_state() == MDSMap::STATE_STOPPED
) {
827 // send the map manually (they're out of the map, so they won't get it automatic)
829 null_map
.epoch
= fsmap
.epoch
;
830 null_map
.compat
= fsmap
.compat
;
831 auto m
= MMDSMap::create(mon
->monmap
->fsid
, null_map
);
832 mon
->send_reply(op
, m
.detach());
834 auto beacon
= MMDSBeacon::create(mon
->monmap
->fsid
,
835 m
->get_global_id(), m
->get_name(), fsmap
.get_epoch(),
836 m
->get_state(), m
->get_seq(), CEPH_FEATURES_SUPPORTED_DEFAULT
);
837 mon
->send_reply(op
, beacon
.detach());
841 void MDSMonitor::on_active()
846 mon
->clog
->debug() << "fsmap " << get_fsmap();
850 void MDSMonitor::dump_info(Formatter
*f
)
852 f
->open_object_section("fsmap");
856 f
->dump_unsigned("mdsmap_first_committed", get_first_committed());
857 f
->dump_unsigned("mdsmap_last_committed", get_last_committed());
860 bool MDSMonitor::preprocess_command(MonOpRequestRef op
)
862 op
->mark_mdsmon_event(__func__
);
863 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
868 const auto &fsmap
= get_fsmap();
871 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
872 // ss has reason for failure
873 string rs
= ss
.str();
874 mon
->reply_command(op
, -EINVAL
, rs
, rdata
, get_last_committed());
879 cmd_getval(g_ceph_context
, cmdmap
, "prefix", prefix
);
881 cmd_getval(g_ceph_context
, cmdmap
, "format", format
, string("plain"));
882 std::unique_ptr
<Formatter
> f(Formatter::create(format
));
884 MonSession
*session
= op
->get_session();
886 mon
->reply_command(op
, -EACCES
, "access denied", rdata
, get_last_committed());
890 if (prefix
== "mds stat") {
892 f
->open_object_section("mds_stat");
900 } else if (prefix
== "mds ok-to-stop") {
902 if (!cmd_getval(g_ceph_context
, cmdmap
, "ids", ids
)) {
904 ss
<< "must specify mds id";
907 if (fsmap
.is_any_degraded()) {
908 ss
<< "one or more filesystems is currently degraded";
912 set
<mds_gid_t
> stopping
;
913 for (auto& id
: ids
) {
915 mds_gid_t gid
= gid_from_arg(fsmap
, id
, ess
);
916 if (gid
== MDS_GID_NONE
) {
917 // the mds doesn't exist, but no file systems are unhappy, so losing it
918 // can't have any effect.
921 stopping
.insert(gid
);
923 set
<mds_gid_t
> active
;
924 set
<mds_gid_t
> standby
;
925 for (auto gid
: stopping
) {
926 if (fsmap
.gid_has_rank(gid
)) {
927 // ignore standby-replay daemons (at this level)
928 if (!fsmap
.is_standby_replay(gid
)) {
929 auto standby
= fsmap
.get_standby_replay(gid
);
930 if (standby
== MDS_GID_NONE
||
931 stopping
.count(standby
)) {
932 // no standby-replay, or we're also stopping the standby-replay
938 // net loss of a standby
942 if (fsmap
.get_num_standby() - standby
.size() < active
.size()) {
944 ss
<< "insufficent standby MDS daemons to stop active gids "
946 << " and/or standby gids " << stringify(standby
);;
950 ss
<< "should be safe to stop " << ids
;
951 } else if (prefix
== "fs dump") {
955 const FSMap
*fsmapp
= &fsmap
;
957 if (cmd_getval(g_ceph_context
, cmdmap
, "epoch", epocharg
)) {
960 int err
= get_version(epoch
, b
);
961 if (err
== -ENOENT
) {
965 ceph_assert(err
== 0);
966 ceph_assert(b
.length());
974 f
->open_object_section("fsmap");
975 fsmapp
->dump(f
.get());
985 ss
<< "dumped fsmap epoch " << fsmapp
->get_epoch();
986 } else if (prefix
== "mds metadata") {
988 f
.reset(Formatter::create("json-pretty"));
991 bool all
= !cmd_getval(g_ceph_context
, cmdmap
, "who", who
);
992 dout(1) << "all = " << all
<< dendl
;
995 // Dump all MDSs' metadata
996 const auto all_info
= fsmap
.get_mds_info();
998 f
->open_array_section("mds_metadata");
999 for(const auto &i
: all_info
) {
1000 const auto &info
= i
.second
;
1002 f
->open_object_section("mds");
1003 f
->dump_string("name", info
.name
);
1004 std::ostringstream get_err
;
1005 r
= dump_metadata(fsmap
, info
.name
, f
.get(), get_err
);
1006 if (r
== -EINVAL
|| r
== -ENOENT
) {
1007 // Drop error, list what metadata we do have
1008 dout(1) << get_err
.str() << dendl
;
1010 } else if (r
!= 0) {
1011 derr
<< "Unexpected error reading metadata: " << cpp_strerror(r
)
1013 ss
<< get_err
.str();
1021 // Dump a single daemon's metadata
1022 f
->open_object_section("mds_metadata");
1023 r
= dump_metadata(fsmap
, who
, f
.get(), ss
);
1027 } else if (prefix
== "mds versions") {
1029 f
.reset(Formatter::create("json-pretty"));
1030 count_metadata("ceph_version", f
.get());
1033 } else if (prefix
== "mds count-metadata") {
1035 f
.reset(Formatter::create("json-pretty"));
1037 cmd_getval(g_ceph_context
, cmdmap
, "property", field
);
1038 count_metadata(field
, f
.get());
1041 } else if (prefix
== "mds compat show") {
1043 f
->open_object_section("mds_compat");
1044 fsmap
.compat
.dump(f
.get());
1051 } else if (prefix
== "fs get") {
1053 cmd_getval(g_ceph_context
, cmdmap
, "fs_name", fs_name
);
1054 const auto &fs
= fsmap
.get_filesystem(fs_name
);
1055 if (fs
== nullptr) {
1056 ss
<< "filesystem '" << fs_name
<< "' not found";
1060 f
->open_object_section("filesystem");
1070 } else if (prefix
== "fs ls") {
1072 f
->open_array_section("filesystems");
1073 for (const auto &p
: fsmap
.filesystems
) {
1074 const auto &fs
= p
.second
;
1075 f
->open_object_section("filesystem");
1077 const MDSMap
&mds_map
= fs
->mds_map
;
1078 f
->dump_string("name", mds_map
.fs_name
);
1079 /* Output both the names and IDs of pools, for use by
1080 * humans and machines respectively */
1081 f
->dump_string("metadata_pool", mon
->osdmon()->osdmap
.get_pool_name(
1082 mds_map
.metadata_pool
));
1083 f
->dump_int("metadata_pool_id", mds_map
.metadata_pool
);
1084 f
->open_array_section("data_pool_ids");
1085 for (const auto &id
: mds_map
.data_pools
) {
1086 f
->dump_int("data_pool_id", id
);
1090 f
->open_array_section("data_pools");
1091 for (const auto &id
: mds_map
.data_pools
) {
1092 const auto &name
= mon
->osdmon()->osdmap
.get_pool_name(id
);
1093 f
->dump_string("data_pool", name
);
1102 for (const auto &p
: fsmap
.filesystems
) {
1103 const auto &fs
= p
.second
;
1104 const MDSMap
&mds_map
= fs
->mds_map
;
1105 const string
&md_pool_name
= mon
->osdmon()->osdmap
.get_pool_name(
1106 mds_map
.metadata_pool
);
1108 ds
<< "name: " << mds_map
.fs_name
<< ", metadata pool: "
1109 << md_pool_name
<< ", data pools: [";
1110 for (const auto &id
: mds_map
.data_pools
) {
1111 const string
&pool_name
= mon
->osdmon()->osdmap
.get_pool_name(id
);
1112 ds
<< pool_name
<< " ";
1114 ds
<< "]" << std::endl
;
1117 if (fsmap
.filesystems
.empty()) {
1118 ds
<< "No filesystems enabled" << std::endl
;
1129 mon
->reply_command(op
, r
, rs
, rdata
, get_last_committed());
1135 bool MDSMonitor::fail_mds_gid(FSMap
&fsmap
, mds_gid_t gid
)
1137 const MDSMap::mds_info_t
&info
= fsmap
.get_info_gid(gid
);
1138 dout(1) << "fail_mds_gid " << gid
<< " mds." << info
.name
<< " role " << info
.rank
<< dendl
;
1140 ceph_assert(mon
->osdmon()->is_writeable());
1142 epoch_t blacklist_epoch
= 0;
1143 if (info
.rank
>= 0 && info
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
1144 utime_t until
= ceph_clock_now();
1145 until
+= g_conf().get_val
<double>("mon_mds_blacklist_interval");
1146 blacklist_epoch
= mon
->osdmon()->blacklist(info
.addrs
, until
);
1149 fsmap
.erase(gid
, blacklist_epoch
);
1150 last_beacon
.erase(gid
);
1151 if (pending_daemon_health
.count(gid
)) {
1152 pending_daemon_health
.erase(gid
);
1153 pending_daemon_health_rm
.insert(gid
);
1156 return blacklist_epoch
!= 0;
1159 mds_gid_t
MDSMonitor::gid_from_arg(const FSMap
&fsmap
, const std::string
&arg
, std::ostream
&ss
)
1161 // Try parsing as a role
1163 std::ostringstream ignore_err
; // Don't spam 'ss' with parse_role errors
1164 int r
= fsmap
.parse_role(arg
, &role
, ignore_err
);
1166 // See if a GID is assigned to this role
1167 const auto &fs
= fsmap
.get_filesystem(role
.fscid
);
1168 ceph_assert(fs
!= nullptr); // parse_role ensures it exists
1169 if (fs
->mds_map
.is_up(role
.rank
)) {
1170 dout(10) << __func__
<< ": validated rank/GID " << role
1171 << " as a rank" << dendl
;
1172 return fs
->mds_map
.get_mds_info(role
.rank
).global_id
;
1176 // Try parsing as a gid
1178 unsigned long long maybe_gid
= strict_strtoll(arg
.c_str(), 10, &err
);
1180 // Not a role or a GID, try as a daemon name
1181 const MDSMap::mds_info_t
*mds_info
= fsmap
.find_by_name(arg
);
1183 ss
<< "MDS named '" << arg
1184 << "' does not exist, or is not up";
1185 return MDS_GID_NONE
;
1187 dout(10) << __func__
<< ": resolved MDS name '" << arg
1188 << "' to GID " << mds_info
->global_id
<< dendl
;
1189 return mds_info
->global_id
;
1191 // Not a role, but parses as an integer, might be a GID
1192 dout(10) << __func__
<< ": treating MDS reference '" << arg
1193 << "' as an integer " << maybe_gid
<< dendl
;
1195 if (fsmap
.gid_exists(mds_gid_t(maybe_gid
))) {
1196 return mds_gid_t(maybe_gid
);
1200 dout(1) << __func__
<< ": rank/GID " << arg
1201 << " not a existent rank or GID" << dendl
;
1202 return MDS_GID_NONE
;
1205 int MDSMonitor::fail_mds(FSMap
&fsmap
, std::ostream
&ss
,
1206 const std::string
&arg
, MDSMap::mds_info_t
*failed_info
)
1208 ceph_assert(failed_info
!= nullptr);
1210 mds_gid_t gid
= gid_from_arg(fsmap
, arg
, ss
);
1211 if (gid
== MDS_GID_NONE
) {
1214 if (!mon
->osdmon()->is_writeable()) {
1218 // Take a copy of the info before removing the MDS from the map,
1219 // so that the caller knows which mds (if any) they ended up removing.
1220 *failed_info
= fsmap
.get_info_gid(gid
);
1222 fail_mds_gid(fsmap
, gid
);
1223 ss
<< "failed mds gid " << gid
;
1224 ceph_assert(mon
->osdmon()->is_writeable());
1225 request_proposal(mon
->osdmon());
1229 bool MDSMonitor::prepare_command(MonOpRequestRef op
)
1231 op
->mark_mdsmon_event(__func__
);
1232 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
1238 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
1239 string rs
= ss
.str();
1240 mon
->reply_command(op
, -EINVAL
, rs
, rdata
, get_last_committed());
1245 cmd_getval(g_ceph_context
, cmdmap
, "prefix", prefix
);
1247 /* Refuse access if message not associated with a valid session */
1248 MonSession
*session
= op
->get_session();
1250 mon
->reply_command(op
, -EACCES
, "access denied", rdata
, get_last_committed());
1254 auto &pending
= get_pending_fsmap_writeable();
1256 bool batched_propose
= false;
1257 for (const auto &h
: handlers
) {
1258 if (h
->can_handle(prefix
)) {
1259 batched_propose
= h
->batched_propose();
1260 if (batched_propose
) {
1263 r
= h
->handle(mon
, pending
, op
, cmdmap
, ss
);
1264 if (batched_propose
) {
1269 // message has been enqueued for retry; return.
1270 dout(4) << __func__
<< " enqueue for retry by prepare_command" << dendl
;
1274 // On successful updates, print the updated map
1277 // Successful or not, we're done: respond.
1283 r
= filesystem_command(pending
, op
, prefix
, cmdmap
, ss
);
1286 } else if (r
== -EAGAIN
) {
1287 // Do not reply, the message has been enqueued for retry
1288 dout(4) << __func__
<< " enqueue for retry by filesystem_command" << dendl
;
1290 } else if (r
!= -ENOSYS
) {
1294 if (r
== -ENOSYS
&& ss
.str().empty()) {
1295 ss
<< "unrecognized command";
1299 dout(4) << __func__
<< " done, r=" << r
<< dendl
;
1300 /* Compose response */
1305 // success.. delay reply
1306 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, r
, rs
,
1307 get_last_committed() + 1));
1308 if (batched_propose
) {
1309 force_immediate_propose();
1313 // reply immediately
1314 mon
->reply_command(op
, r
, rs
, rdata
, get_last_committed());
// Dispatch the built-in "mds ..." commands by `prefix`, applying changes to
// `fsmap` and writing a human-readable result or error into `ss`.
//
// Prefixes handled in the visible code:
//   "mds set_state", "mds fail", "mds rm", "mds rmfailed",
//   "mds compat rm_compat", "mds compat rm_incompat", "mds repaired",
//   "mds freeze".
//
// Returns an int status. The only return visible here is -EAGAIN on the
// "mds fail" path, after the op has been queued behind the OSD monitor.
//
// NOTE(review): this copy has lost lines (the first two parameters, local
// declarations, braces, most returns); the other return codes and the
// guard conditions on some branches cannot be confirmed from here.
1319 int MDSMonitor::filesystem_command(
1322 std::string
const &prefix
,
1323 const cmdmap_t
& cmdmap
,
1324 std::stringstream
&ss
)
1326 dout(4) << __func__
<< " prefix='" << prefix
<< "'" << dendl
;
1327 op
->mark_mdsmon_event(__func__
);
1330 cmd_getval(g_ceph_context
, cmdmap
, "role", whostr
);
// --- "mds set_state": set the state of the daemon identified by gid.
1332 if (prefix
== "mds set_state") {
// NOTE(review): cmdmap.at("gid") is evaluated while building the error
// text; if cmd_getval() can fail because the key is absent, at() would
// throw — confirm against cmd_getval's contract.
1334 if (!cmd_getval(g_ceph_context
, cmdmap
, "gid", gid
)) {
1335 ss
<< "error parsing 'gid' value '"
1336 << cmd_vartype_stringify(cmdmap
.at("gid")) << "'";
1339 MDSMap::DaemonState state
;
1340 if (!cmd_getval(g_ceph_context
, cmdmap
, "state", state
)) {
1341 ss
<< "error parsing 'state' string value '"
1342 << cmd_vartype_stringify(cmdmap
.at("state")) << "'";
1345 if (fsmap
.gid_exists(gid
)) {
1346 fsmap
.modify_daemon(gid
, [state
](auto& info
) {
1349 ss
<< "set mds gid " << gid
<< " to state " << state
<< " "
1350 << ceph_mds_state_name(state
);
// --- "mds fail": fail the daemon named by "role_or_gid".
1353 } else if (prefix
== "mds fail") {
1355 cmd_getval(g_ceph_context
, cmdmap
, "role_or_gid", who
);
1357 MDSMap::mds_info_t failed_info
;
1358 r
= fail_mds(fsmap
, ss
, who
, &failed_info
);
// -EAGAIN: wait for the OSD monitor to become writeable, then retry the op.
1359 if (r
< 0 && r
== -EAGAIN
) {
1360 mon
->osdmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
1361 return -EAGAIN
; // don't propose yet; wait for message to be retried
1362 } else if (r
== 0) {
1363 // Only log if we really did something (not when was already gone)
1364 if (failed_info
.global_id
!= MDS_GID_NONE
) {
1365 mon
->clog
->info() << failed_info
.human_name() << " marked failed by "
1366 << op
->get_session()->entity_name
;
// --- "mds rm": erase a daemon entry by gid; the visible messages show
// that a nonexistent gid and an active mds are both rejected.
1369 } else if (prefix
== "mds rm") {
1371 if (!cmd_getval(g_ceph_context
, cmdmap
, "gid", gid
)) {
1372 ss
<< "error parsing 'gid' value '"
1373 << cmd_vartype_stringify(cmdmap
.at("gid")) << "'";
1376 if (!fsmap
.gid_exists(gid
)) {
1377 ss
<< "mds gid " << gid
<< " does not exist";
1380 const auto &info
= fsmap
.get_info_gid(gid
);
1381 MDSMap::DaemonState state
= info
.state
;
1383 ss
<< "cannot remove active mds." << info
.name
1384 << " rank " << info
.rank
;
1387 fsmap
.erase(gid
, {});
1388 ss
<< "removed mds gid " << gid
;
// --- "mds rmfailed": erase a rank from a filesystem's failed set. The
// warning text below asks for --yes-i-really-mean-it.
1392 } else if (prefix
== "mds rmfailed") {
1393 bool confirm
= false;
1394 cmd_getval(g_ceph_context
, cmdmap
, "yes_i_really_mean_it", confirm
);
1396 ss
<< "WARNING: this can make your filesystem inaccessible! "
1397 "Add --yes-i-really-mean-it if you are sure you wish to continue.";
1401 std::string role_str
;
1402 cmd_getval(g_ceph_context
, cmdmap
, "role", role_str
);
1404 int r
= fsmap
.parse_role(role_str
, &role
, ss
);
1406 ss
<< "invalid role '" << role_str
<< "'";
1410 fsmap
.modify_filesystem(
1412 [role
](std::shared_ptr
<Filesystem
> fs
)
1414 fs
->mds_map
.failed
.erase(role
.rank
);
1417 ss
<< "removed failed mds." << role
;
// --- "mds compat rm_compat": remove feature f from the compat set.
1419 } else if (prefix
== "mds compat rm_compat") {
1421 if (!cmd_getval(g_ceph_context
, cmdmap
, "feature", f
)) {
1422 ss
<< "error parsing feature value '"
1423 << cmd_vartype_stringify(cmdmap
.at("feature")) << "'";
1426 if (fsmap
.compat
.compat
.contains(f
)) {
1427 ss
<< "removing compat feature " << f
;
// Edit a copy of the CompatSet and install it through update_compat().
1428 CompatSet modified
= fsmap
.compat
;
1429 modified
.compat
.remove(f
);
1430 fsmap
.update_compat(modified
);
1432 ss
<< "compat feature " << f
<< " not present in " << fsmap
.compat
;
// --- "mds compat rm_incompat": same as above, for the incompat set.
1435 } else if (prefix
== "mds compat rm_incompat") {
1437 if (!cmd_getval(g_ceph_context
, cmdmap
, "feature", f
)) {
1438 ss
<< "error parsing feature value '"
1439 << cmd_vartype_stringify(cmdmap
.at("feature")) << "'";
1442 if (fsmap
.compat
.incompat
.contains(f
)) {
1443 ss
<< "removing incompat feature " << f
;
1444 CompatSet modified
= fsmap
.compat
;
1445 modified
.incompat
.remove(f
);
1446 fsmap
.update_compat(modified
);
1448 ss
<< "incompat feature " << f
<< " not present in " << fsmap
.compat
;
// --- "mds repaired": clear the damaged mark on a rank via undamaged().
1451 } else if (prefix
== "mds repaired") {
1452 std::string role_str
;
1453 cmd_getval(g_ceph_context
, cmdmap
, "role", role_str
);
1455 r
= fsmap
.parse_role(role_str
, &role
, ss
);
1460 bool modified
= fsmap
.undamaged(role
.fscid
, role
.rank
);
1462 ss
<< "repaired: restoring rank " << role
;
1464 ss
<< "nothing to do: rank is not damaged";
// --- "mds freeze": freeze or unfreeze the daemon named by "role_or_gid",
// according to the boolean parsed from "val".
1468 } else if (prefix
== "mds freeze") {
1470 cmd_getval(g_ceph_context
, cmdmap
, "role_or_gid", who
);
1471 mds_gid_t gid
= gid_from_arg(fsmap
, who
, ss
);
1472 if (gid
== MDS_GID_NONE
) {
1476 bool freeze
= false;
1479 cmd_getval(g_ceph_context
, cmdmap
, "val", str
);
1480 if ((r
= parse_bool(str
, &freeze
, ss
)) != 0) {
// The lambda captures ss by reference; it is handed straight to
// modify_daemon() below.
1485 auto f
= [freeze
,gid
,&ss
](auto& info
) {
1487 ss
<< "freezing mds." << gid
;
1490 ss
<< "unfreezing mds." << gid
;
1494 fsmap
.modify_daemon(gid
, f
);
// Collect every subscription type name this service handles and look each
// one up in mon->session_map.subs.
//
// The names are "fsmap", "fsmap.user", "mdsmap", plus one "mdsmap.<fscid>"
// per entry in get_fsmap().filesystems.
//
// NOTE(review): the lines that walk the per-type subscription list are
// missing from this copy; only the iterator setup and `*p` dereference
// survive. Presumably each Subscription is passed to check_sub() — confirm.
1503 void MDSMonitor::check_subs()
1505 std::list
<std::string
> types
;
1507 // Subscriptions may be to "mdsmap" (MDS and legacy clients),
1508 // "mdsmap.<namespace>", or to "fsmap" for the full state of all
1509 // filesystems. Build a list of all the types we service
1510 // subscriptions for.
1511 types
.push_back("fsmap");
1512 types
.push_back("fsmap.user");
1513 types
.push_back("mdsmap");
// One namespaced type per filesystem, keyed by its fscid.
1514 for (const auto &p
: get_fsmap().filesystems
) {
1515 const auto &fscid
= p
.first
;
1516 std::ostringstream oss
;
1517 oss
<< "mdsmap." << fscid
;
1518 types
.push_back(oss
.str());
1521 for (const auto &type
: types
) {
// Types with no entry in the subs map are tested for first.
1522 if (mon
->session_map
.subs
.count(type
) == 0)
1524 xlist
<Subscription
*>::iterator p
= mon
->session_map
.subs
[type
]->begin();
1526 Subscription
*sub
= *p
;
// Service one subscription according to sub->type.
//
//  - "fsmap":      when sub->next <= current epoch, send an MFSMap built from
//                  the whole FSMap.
//  - "fsmap.user": when sub->next <= current epoch, send an MFSMapUser that
//                  carries the epoch, legacy_client_fscid and each
//                  filesystem's cid and name.
//  - "mdsmap" or "mdsmap.<id>": choose which filesystem's MDSMap applies
//                  (or a null map for an MDS with no filesystem) and send it
//                  as an MMDSMap.
//
// On each path the code either calls session_map.remove_sub(sub) or advances
// sub->next to epoch + 1.
//
// NOTE(review): the test choosing between remove_sub() and advancing
// sub->next, and the early returns, are missing from this copy. Presumably
// one-time subscriptions are the ones removed — confirm against the full file.
1534 void MDSMonitor::check_sub(Subscription
*sub
)
1536 dout(20) << __func__
<< ": " << sub
->type
<< dendl
;
1538 const auto &fsmap
= get_fsmap();
1540 if (sub
->type
== "fsmap") {
1541 if (sub
->next
<= fsmap
.get_epoch()) {
1542 sub
->session
->con
->send_message(new MFSMap(mon
->monmap
->fsid
, fsmap
));
1544 mon
->session_map
.remove_sub(sub
);
1546 sub
->next
= fsmap
.get_epoch() + 1;
1549 } else if (sub
->type
== "fsmap.user") {
1550 if (sub
->next
<= fsmap
.get_epoch()) {
// Build the reduced user-facing map: epoch, legacy fscid, and an id/name
// pair per filesystem.
1552 fsmap_u
.epoch
= fsmap
.get_epoch();
1553 fsmap_u
.legacy_client_fscid
= fsmap
.legacy_client_fscid
;
1554 for (const auto &p
: fsmap
.filesystems
) {
1555 FSMapUser::fs_info_t
& fs_info
= fsmap_u
.filesystems
[p
.second
->fscid
];
1556 fs_info
.cid
= p
.second
->fscid
;
1557 fs_info
.name
= p
.second
->mds_map
.fs_name
;
1559 sub
->session
->con
->send_message(new MFSMapUser(mon
->monmap
->fsid
, fsmap_u
));
1561 mon
->session_map
.remove_sub(sub
);
1563 sub
->next
= fsmap
.get_epoch() + 1;
// Any type whose first six characters are "mdsmap" (bare or namespaced).
1566 } else if (sub
->type
.compare(0, 6, "mdsmap") == 0) {
1567 if (sub
->next
> fsmap
.get_epoch()) {
1571 const bool is_mds
= sub
->session
->name
.is_mds();
1572 mds_gid_t mds_gid
= MDS_GID_NONE
;
1573 fs_cluster_id_t fscid
= FS_CLUSTER_ID_NONE
;
1575 // What (if any) namespace are you assigned to?
// The daemon is identified by comparing its recorded addrs with the
// session's addrs. get_mds_info() is bound to a plain `auto` here.
1576 auto mds_info
= fsmap
.get_mds_info();
1577 for (const auto &p
: mds_info
) {
1578 if (p
.second
.addrs
== sub
->session
->addrs
) {
1580 fscid
= fsmap
.mds_roles
.at(mds_gid
);
1584 // You're a client. Did you request a particular
// "mdsmap.<id>": the text after the dot is parsed as a base-10 fscid.
1586 if (sub
->type
.compare(0, 7, "mdsmap.") == 0) {
1587 auto namespace_id_str
= sub
->type
.substr(std::string("mdsmap.").size());
1588 dout(10) << __func__
<< ": namespace_id " << namespace_id_str
<< dendl
;
1590 fscid
= strict_strtoll(namespace_id_str
.c_str(), 10, &err
);
1592 // Client asked for a non-existent namespace, send them nothing
1593 dout(1) << "Invalid client subscription '" << sub
->type
1597 if (fsmap
.filesystems
.count(fscid
) == 0) {
1598 // Client asked for a non-existent namespace, send them nothing
1599 // TODO: something more graceful for when a client has a filesystem
1600 // mounted, and the filesystem is deleted. Add a "shut down you fool"
1602 dout(1) << "Client subscribed to non-existent namespace '" <<
1603 fscid
<< "'" << dendl
;
1607 // Unqualified request for "mdsmap": give it the one marked
1608 // for use by legacy clients.
1609 if (fsmap
.legacy_client_fscid
!= FS_CLUSTER_ID_NONE
) {
1610 fscid
= fsmap
.legacy_client_fscid
;
1612 dout(1) << "Client subscribed for legacy filesystem but "
1613 "none is configured" << dendl
;
1618 dout(10) << __func__
<< ": is_mds=" << is_mds
<< ", fscid= " << fscid
<< dendl
;
1620 // Work out the effective latest epoch
1621 const MDSMap
*mds_map
= nullptr;
// null_map is the map used when no filesystem was selected; it carries the
// FSMap's compat set.
1623 null_map
.compat
= fsmap
.compat
;
1624 if (fscid
== FS_CLUSTER_ID_NONE
) {
1625 // For a client, we should have already dropped out
1626 ceph_assert(is_mds
);
1628 auto it
= fsmap
.standby_daemons
.find(mds_gid
);
1629 if (it
!= fsmap
.standby_daemons
.end()) {
1630 // For an MDS, we need to feed it an MDSMap with its own state in
1631 null_map
.mds_info
[mds_gid
] = it
->second
;
1632 null_map
.epoch
= fsmap
.standby_epochs
.at(mds_gid
);
1634 null_map
.epoch
= fsmap
.epoch
;
1636 mds_map
= &null_map
;
1638 // Check the effective epoch
1639 mds_map
= &fsmap
.get_filesystem(fscid
)->mds_map
;
1642 ceph_assert(mds_map
!= nullptr);
1643 dout(10) << __func__
<< " selected MDS map epoch " <<
1644 mds_map
->epoch
<< " for namespace " << fscid
<< " for subscriber "
1645 << sub
->session
->name
<< " who wants epoch " << sub
->next
<< dendl
;
// Compare the wanted epoch with the selected map's epoch.
1647 if (sub
->next
> mds_map
->epoch
) {
1650 auto msg
= MMDSMap::create(mon
->monmap
->fsid
, *mds_map
);
1652 sub
->session
->con
->send_message(msg
.detach());
1654 mon
->session_map
.remove_sub(sub
);
1656 sub
->next
= mds_map
->get_epoch() + 1;
// Record `metadata` for daemon `gid` in pending_metadata, encode the whole
// pending_metadata map into `bl`, store it under
// MDS_METADATA_PREFIX / "last_metadata" in the pending paxos transaction,
// and trigger a proposal.
//
// NOTE(review): the body of the metadata.empty() test and the declaration of
// `bl` are missing from this copy; presumably an empty map is a no-op —
// confirm against the full file.
1662 void MDSMonitor::update_metadata(mds_gid_t gid
,
1663 const map
<string
, string
>& metadata
)
1665 if (metadata
.empty()) {
1668 pending_metadata
[gid
] = metadata
;
1670 MonitorDBStore::TransactionRef t
= paxos
->get_pending_transaction();
1672 encode(pending_metadata
, bl
);
1673 t
->put(MDS_METADATA_PREFIX
, "last_metadata", bl
);
1674 paxos
->trigger_propose();
// Erase from pending_metadata every entry whose gid no longer exists in
// `fsmap`, then encode pending_metadata and store it under
// MDS_METADATA_PREFIX / "last_metadata" in transaction `t`.
//
// NOTE(review): the lines that set and test `update` (and advance the
// iterator when nothing is erased) are missing from this copy. Presumably
// the store write is skipped when nothing was removed — confirm.
1677 void MDSMonitor::remove_from_metadata(const FSMap
&fsmap
, MonitorDBStore::TransactionRef t
)
1679 bool update
= false;
// The for-header has no increment; the erase() below supplies the next
// iterator.
1680 for (auto it
= pending_metadata
.begin(); it
!= pending_metadata
.end(); ) {
1681 if (!fsmap
.gid_exists(it
->first
)) {
1682 it
= pending_metadata
.erase(it
);
1691 encode(pending_metadata
, bl
);
1692 t
->put(MDS_METADATA_PREFIX
, "last_metadata", bl
);
// Read the MDS_METADATA_PREFIX / "last_metadata" value from the monitor
// store and decode it into `m`.
//
// Returns an int status.
// NOTE(review): the test on `r` and the return statements are missing from
// this copy. Presumably the store error is returned after the dout(5)
// message, and 0 on success — confirm.
1695 int MDSMonitor::load_metadata(map
<mds_gid_t
, Metadata
>& m
)
1698 int r
= mon
->store
->get(MDS_METADATA_PREFIX
, "last_metadata", bl
);
1700 dout(5) << "Unable to load 'last_metadata'" << dendl
;
1704 auto it
= bl
.cbegin();
1705 ceph::decode(m
, it
);
1709 void MDSMonitor::count_metadata(const std::string
&field
, map
<string
,int> *out
)
1711 map
<mds_gid_t
,Metadata
> meta
;
1712 load_metadata(meta
);
1713 for (auto& p
: meta
) {
1714 auto q
= p
.second
.find(field
);
1715 if (q
== p
.second
.end()) {
1716 (*out
)["unknown"]++;
1718 (*out
)[q
->second
]++;
// Formatter overload: compute the per-value counts for `field` with the
// map-based count_metadata(), then emit them inside an object section named
// after `field`, one dump_int(value, count) per distinct value.
//
// NOTE(review): the line that closes the section opened below is missing
// from this copy; presumably f->close_section() follows the loop — confirm.
1723 void MDSMonitor::count_metadata(const std::string
&field
, Formatter
*f
)
1725 map
<string
,int> by_val
;
1726 count_metadata(field
, &by_val
);
1727 f
->open_object_section(field
.c_str());
1728 for (auto& p
: by_val
) {
1729 f
->dump_int(p
.first
.c_str(), p
.second
);
// Dump the stored metadata of one daemon to formatter `f`.
//
// `who` is resolved to a gid with gid_from_arg(), which also receives `err`.
// The stored metadata is loaded, and each key/value pair of that gid's entry
// is emitted with dump_string().
//
// Returns an int status.
// NOTE(review): the return statements are missing from this copy, so the
// codes for "unknown daemon", "load failed" and "no metadata for gid" cannot
// be confirmed from here.
1734 int MDSMonitor::dump_metadata(const FSMap
& fsmap
, const std::string
&who
,
1735 Formatter
*f
, ostream
& err
)
1739 mds_gid_t gid
= gid_from_arg(fsmap
, who
, err
);
1740 if (gid
== MDS_GID_NONE
) {
1744 map
<mds_gid_t
, Metadata
> metadata
;
// A non-zero result from load_metadata() takes the error path.
1745 if (int r
= load_metadata(metadata
)) {
1746 err
<< "Unable to load 'last_metadata'";
1750 if (!metadata
.count(gid
)) {
// The presence of gid is tested with count() just above this lookup.
1753 const Metadata
& m
= metadata
[gid
];
1754 for (Metadata::const_iterator p
= m
.begin(); p
!= m
.end(); ++p
) {
1755 f
->dump_string(p
->first
.c_str(), p
->second
);
// Group MDS daemon names by the "hostname" value in their stored metadata
// and hand the result to dump_services(f, mdses, "mds").
//
// Tests visible in the loop: whether the metadata has a "hostname" key, and
// whether the gid still exists in the current FSMap (logged at level 5 when
// it does not).
//
// Returns an int status.
// NOTE(review): the bodies of those tests and the return statements are
// missing from this copy; presumably such entries are skipped and a
// load_metadata() failure is returned to the caller — confirm.
1760 int MDSMonitor::print_nodes(Formatter
*f
)
1764 const auto &fsmap
= get_fsmap();
1766 map
<mds_gid_t
, Metadata
> metadata
;
1767 if (int r
= load_metadata(metadata
)) {
1771 map
<string
, list
<string
> > mdses
; // hostname => mds
1772 for (const auto &p
: metadata
) {
1773 const mds_gid_t
& gid
= p
.first
;
1774 const Metadata
& m
= p
.second
;
1775 Metadata::const_iterator hostname
= m
.find("hostname");
1776 if (hostname
== m
.end()) {
1777 // not likely though
1780 if (!fsmap
.gid_exists(gid
)) {
1781 dout(5) << __func__
<< ": GID " << gid
<< " not existent" << dendl
;
// The daemon name comes from the FSMap; only the hostname comes from the
// stored metadata.
1784 const MDSMap::mds_info_t
& mds_info
= fsmap
.get_info_gid(gid
);
1785 mdses
[hostname
->second
].push_back(mds_info
.name
);
1788 dump_services(f
, mdses
, "mds");
// Grow or shrink the set of "in" ranks of filesystem `fscid` towards max_mds,
// working on the supplied (pending) `fsmap`.
//
//  - Neither the committed map (get_fsmap()) nor the pending map may report
//    !is_resizeable(); otherwise the "not currently resizeable" path is taken.
//  - in < max and the filesystem is joinable: the lowest rank that is not
//    "in" is chosen, a replacement daemon is looked up with
//    find_replacement_for(), and it is promoted into that rank.
//  - in > max: the highest rank (in - 1) is set to STATE_STOPPING if it is
//    currently active, otherwise the stop is skipped.
//
// Returns bool.
// NOTE(review): the return statements, the loop body that advances `mds`,
// the declaration of `name` and the tail of the cluster-log message are
// missing from this copy; presumably true means "fsmap was modified" —
// confirm.
1793 * If a cluster is undersized (with respect to max_mds), then
1794 * attempt to find daemons to grow it. If the cluster is oversized
1795 * (with respect to max_mds) then shrink it by stopping its highest rank.
1797 bool MDSMonitor::maybe_resize_cluster(FSMap
&fsmap
, fs_cluster_id_t fscid
)
// NOTE(review): "¤t_mds_map" on the next line looks like a mis-encoded
// "&current_mds_map" (an HTML "&curren" entity swallowed the start of the
// name): the identifier current_mds_map is used further down. Restore it from
// the pristine file.
1799 auto ¤t_mds_map
= get_fsmap().get_filesystem(fscid
)->mds_map
;
1800 auto&& fs
= fsmap
.get_filesystem(fscid
);
1801 auto &mds_map
= fs
->mds_map
;
1803 int in
= mds_map
.get_num_in_mds();
1804 int max
= mds_map
.get_max_mds();
1806 dout(20) << __func__
<< " in " << in
<< " max " << max
<< dendl
;
1808 /* Check that both the current epoch mds_map is resizeable as well as the
1809 * current batch of changes in pending. This is important if an MDS is
1810 * becoming active in the next epoch.
1812 if (!current_mds_map
.is_resizeable() ||
1813 !mds_map
.is_resizeable()) {
1814 dout(5) << __func__
<< " mds_map is not currently resizeable" << dendl
;
// Grow: only when the filesystem is not flagged NOT_JOINABLE.
1818 if (in
< max
&& !mds_map
.test_flag(CEPH_MDSMAP_NOT_JOINABLE
)) {
1819 mds_rank_t mds
= mds_rank_t(0);
// Search upward from rank 0 for the first rank that is not "in".
1821 while (mds_map
.is_in(mds
)) {
1824 auto&& newgid
= fsmap
.find_replacement_for({fscid
, mds
}, name
);
1825 if (newgid
== MDS_GID_NONE
) {
1829 const auto &new_info
= fsmap
.get_info_gid(newgid
);
1830 dout(1) << "assigned standby " << new_info
.addrs
1831 << " as mds." << mds
<< dendl
;
// The count is logged as get_num_in_mds() + 1 because the promote() call
// comes after this message.
1833 mon
->clog
->info() << new_info
.human_name() << " assigned to "
1834 "filesystem " << mds_map
.fs_name
<< " as rank "
1835 << mds
<< " (now has " << mds_map
.get_num_in_mds() + 1
1837 fsmap
.promote(newgid
, *fs
, mds
);
// Shrink: the target is rank in - 1.
1839 } else if (in
> max
) {
1840 mds_rank_t target
= in
- 1;
1841 const auto &info
= mds_map
.get_info(target
);
1842 if (mds_map
.is_active(target
)) {
1843 dout(1) << "stopping " << target
<< dendl
;
1844 mon
->clog
->info() << "stopping " << info
.human_name();
1845 auto f
= [](auto& info
) {
1846 info
.state
= MDSMap::STATE_STOPPING
;
1848 fsmap
.modify_daemon(info
.global_id
, f
);
1851 dout(20) << "skipping stop of " << target
<< dendl
;
// Decide what to do with daemon `gid`, described by `info`.
//
// Three outcomes are visible:
//  1. A ranked daemon that is neither STANDBY nor STANDBY_REPLAY, in a
//     joinable filesystem, with a replacement `sgid` available: the old gid
//     is failed with fail_mds_gid() and `sgid` is promoted into its rank.
//  2. A STANDBY or STANDBY_REPLAY daemon, when may_replace holds and the
//     daemon is not frozen: it is failed and removed.
//  3. Otherwise, if the daemon is not already laggy, its laggy_since is set
//     to the current time.
//
// *mds_propose is set to true on each of these paths. *osd_propose is OR-ed
// with the result of fail_mds_gid() on path 1 only. Both pointers are
// asserted non-null.
//
// may_replace is true when the newest beacon from any daemon is more recent
// than max(mds_beacon_interval, mds_beacon_grace * 0.5) seconds ago.
//
// NOTE(review): two lines of the first condition (original 1892-1893) and
// the declaration of `sgid` are missing from this copy, so the complete
// guard for path 1 cannot be confirmed from here.
1861 * If a daemon is laggy, and a suitable replacement
1862 * is available, fail this daemon (remove from map) and pass its
1863 * role to another daemon.
1865 void MDSMonitor::maybe_replace_gid(FSMap
&fsmap
, mds_gid_t gid
,
1866 const MDSMap::mds_info_t
& info
, bool *mds_propose
, bool *osd_propose
)
1868 ceph_assert(mds_propose
!= nullptr);
1869 ceph_assert(osd_propose
!= nullptr);
1871 const auto fscid
= fsmap
.mds_roles
.at(gid
);
1873 // We will only take decisive action (replacing/removing a daemon)
1874 // if we have some indication that some other daemon(s) are successfully
1875 // getting beacons through recently.
// Find the most recent beacon stamp across all tracked daemons.
1876 mono_time latest_beacon
= mono_clock::zero();
1877 for (const auto &p
: last_beacon
) {
1878 latest_beacon
= std::max(p
.second
.stamp
, latest_beacon
);
1880 mono_time now
= mono_clock::now();
1881 chrono::duration
<double> since
= now
-latest_beacon
;
1882 const bool frozen
= info
.is_frozen();
1883 const bool may_replace
= since
.count() <
1884 std::max(g_conf()->mds_beacon_interval
, g_conf()->mds_beacon_grace
* 0.5);
1887 // and is there a non-laggy standby that can take over for us?
1889 if (info
.rank
>= 0 &&
1890 info
.state
!= MDSMap::STATE_STANDBY
&&
1891 info
.state
!= MDSMap::STATE_STANDBY_REPLAY
&&
1894 !fsmap
.get_filesystem(fscid
)->mds_map
.test_flag(CEPH_MDSMAP_NOT_JOINABLE
) &&
1895 (sgid
= fsmap
.find_replacement_for({fscid
, info
.rank
}, info
.name
)) != MDS_GID_NONE
)
// `si` is a copy of the replacement's info, taken before fsmap is modified.
1898 MDSMap::mds_info_t si
= fsmap
.get_info_gid(sgid
);
1899 dout(1) << " replacing " << gid
<< " " << info
.addrs
1900 << " mds." << info
.rank
<< "." << info
.inc
1901 << " " << ceph_mds_state_name(info
.state
)
1902 << " with " << sgid
<< "/" << si
.name
<< " " << si
.addrs
1905 mon
->clog
->warn() << info
.human_name()
1906 << " is not responding, replacing it "
1907 << "as rank " << info
.rank
1908 << " with standby " << si
.human_name();
1910 // Remember what NS the old one was in
// NOTE(review): this repeats the mds_roles.at(gid) lookup made for `fscid`
// near the top of the function, under the same name.
1911 const fs_cluster_id_t fscid
= fsmap
.mds_roles
.at(gid
);
1913 // Remove the old one
1914 *osd_propose
|= fail_mds_gid(fsmap
, gid
);
1916 // Promote the replacement
1917 auto&& fs
= fsmap
.filesystems
.at(fscid
);
1918 fsmap
.promote(sgid
, *fs
, info
.rank
);
1920 *mds_propose
= true;
1921 } else if ((info
.state
== MDSMap::STATE_STANDBY_REPLAY
||
1922 info
.state
== MDSMap::STATE_STANDBY
) && may_replace
&& !frozen
) {
1923 dout(1) << " failing and removing " << gid
<< " " << info
.addrs
1924 << " mds." << info
.rank
1925 << "." << info
.inc
<< " " << ceph_mds_state_name(info
.state
)
1927 mon
->clog
->info() << "Standby " << info
.human_name() << " is not "
1928 "responding, dropping it";
// Unlike the replacement path, the result of fail_mds_gid() is not folded
// into *osd_propose here.
1929 fail_mds_gid(fsmap
, gid
);
1930 *mds_propose
= true;
1931 } else if (!info
.laggy()) {
1932 dout(1) << " marking " << gid
<< " " << info
.addrs
1933 << " mds." << info
.rank
<< "." << info
.inc
1934 << " " << ceph_mds_state_name(info
.state
)
1935 << " laggy" << dendl
;
1936 fsmap
.modify_daemon(info
.global_id
, [](auto& info
) {
1937 info
.laggy_since
= ceph_clock_now();
1939 *mds_propose
= true;
// Try to fill roles in filesystem `fs` from the available standby daemons.
//
//  - The NOT_JOINABLE flag is tested first.
//  - For every rank in the failed set, a replacement is looked up with
//    find_replacement_for() and promoted into that rank.
//  - If the filesystem allows standby-replay and is not degraded, available
//    standbys are assigned to follow ranks that report is_followable().
//
// Returns bool.
// NOTE(review): the return statements, the test on `sgid`, the enclosing
// loop of the standby-replay section and the updates of
// `do_propose`/`changed` are missing from this copy; presumably the result
// means "fsmap was modified" — confirm.
1943 bool MDSMonitor::maybe_promote_standby(FSMap
&fsmap
, Filesystem
& fs
)
1945 if (fs
.mds_map
.test_flag(CEPH_MDSMAP_NOT_JOINABLE
)) {
1949 bool do_propose
= false;
1951 // have a standby take over?
1952 set
<mds_rank_t
> failed
;
1953 fs
.mds_map
.get_failed_mds_set(failed
);
1954 for (const auto& rank
: failed
) {
// An empty name ({}) is passed as the second argument of the lookup.
1955 auto&& sgid
= fsmap
.find_replacement_for({fs
.fscid
, rank
}, {});
1957 auto&& info
= fsmap
.get_info_gid(sgid
);
1958 dout(1) << " taking over failed mds." << rank
<< " with " << sgid
1959 << "/" << info
.name
<< " " << info
.addrs
<< dendl
;
1960 mon
->clog
->info() << "Standby " << info
.human_name()
1961 << " assigned to filesystem " << fs
.mds_map
.fs_name
1962 << " as rank " << rank
;
1964 fsmap
.promote(sgid
, fs
, rank
);
1969 if (fs
.mds_map
.allows_standby_replay() && !fs
.mds_map
.is_degraded()) {
1970 // There were no failures to replace, so try using any available standbys
1971 // as standby-replay daemons. Don't do this when the cluster is degraded
1972 // as a standby-replay daemon may try to read a journal being migrated.
1974 auto standby_gid
= fsmap
.get_available_standby();
// No standby available: leave the enclosing loop.
1975 if (standby_gid
== MDS_GID_NONE
) break;
1976 dout(20) << "standby available mds." << standby_gid
<< dendl
;
1977 bool changed
= false;
1978 for (const auto& rank
: fs
.mds_map
.in
) {
1979 dout(20) << "exmaining " << rank
<< dendl
;
1980 if (fs
.mds_map
.is_followable(rank
)) {
1981 dout(1) << " setting mds." << standby_gid
1982 << " to follow mds rank " << rank
<< dendl
;
1983 fsmap
.assign_standby_replay(standby_gid
, fs
.fscid
, rank
);
// Nothing changed in this pass: leave the enclosing loop.
1989 if (!changed
) break;
// Periodic maintenance; does nothing unless this monitor is active and the
// leader.
//
// Steps visible in this copy, all against the writeable pending FSMap:
//  1. pending.check_health().
//  2. maybe_resize_cluster() for every filesystem.
//  3. If the time since the last tick exceeds
//     (mds_beacon_grace - mds_beacon_interval), every beacon stamp is reset
//     to `now`, so the monitor's own delay does not count against daemons.
//  4. last_beacon gets an entry for every gid in pending.mds_roles.
//  5. last_beacon is walked: entries whose gid is gone from the map are
//     erased; a beacon at least mds_beacon_grace old is logged, and passed
//     to maybe_replace_gid() when the OSD monitor is writeable.
//  6. An OSD-map proposal is requested if maybe_replace_gid() asked for one.
//  7. maybe_promote_standby() for every filesystem.
//
// NOTE(review): several lines are missing (the handling of a zero last_tick,
// the update of last_tick, iterator advancement, and the final use of
// do_propose), so those parts cannot be confirmed from here.
1996 void MDSMonitor::tick()
1998 // make sure mds's are still alive
1999 // ...if i am an active leader
2001 if (!is_active() || !is_leader()) return;
2003 auto &pending
= get_pending_fsmap_writeable();
2005 bool do_propose
= false;
2007 do_propose
|= pending
.check_health();
2009 // resize mds cluster (adjust @in)?
2010 for (auto &p
: pending
.filesystems
) {
2011 do_propose
|= maybe_resize_cluster(pending
, p
.second
->fscid
);
2014 mono_time now
= mono_clock::now();
2015 if (mono_clock::is_zero(last_tick
)) {
2018 chrono::duration
<double> since_last
= now
-last_tick
;
2020 if (since_last
.count() >
2021 (g_conf()->mds_beacon_grace
- g_conf()->mds_beacon_interval
)) {
2022 // This case handles either local slowness (calls being delayed
2023 // for whatever reason) or cluster election slowness (a long gap
2024 // between calls while an election happened)
2025 dout(1) << __func__
<< ": resetting beacon timeouts due to mon delay "
2026 "(slow election?) of " << now
- last_tick
<< " seconds" << dendl
;
2027 for (auto &p
: last_beacon
) {
2028 p
.second
.stamp
= now
;
2034 // make sure last_beacon is fully populated
2035 for (auto &p
: pending
.mds_roles
) {
2036 auto &gid
= p
.first
;
// New entries are stamped with a fresh mono_clock::now() and 0.
2037 last_beacon
.emplace(std::piecewise_construct
,
2038 std::forward_as_tuple(gid
),
2039 std::forward_as_tuple(mono_clock::now(), 0));
2043 // check beacon timestamps
2044 bool propose_osdmap
= false;
// Sampled once, before the loop.
2045 bool osdmap_writeable
= mon
->osdmon()->is_writeable();
2046 for (auto it
= last_beacon
.begin(); it
!= last_beacon
.end(); ) {
2047 mds_gid_t gid
= it
->first
;
2048 auto beacon_info
= it
->second
;
2049 chrono::duration
<double> since_last
= now
-beacon_info
.stamp
;
// Drop beacon records for daemons that are no longer in the map.
2051 if (!pending
.gid_exists(gid
)) {
2053 it
= last_beacon
.erase(it
);
2058 if (since_last
.count() >= g_conf()->mds_beacon_grace
) {
2059 auto &info
= pending
.get_info_gid(gid
);
2060 dout(1) << "no beacon from mds." << info
.rank
<< "." << info
.inc
2061 << " (gid: " << gid
<< " addr: " << info
.addrs
2062 << " state: " << ceph_mds_state_name(info
.state
) << ")"
2063 << " since " << since_last
.count() << "s" << dendl
;
2064 // If the OSDMap is writeable, we can blacklist things, so we can
2065 // try failing any laggy MDS daemons. Consider each one for failure.
2066 if (osdmap_writeable
) {
2067 maybe_replace_gid(pending
, gid
, info
, &do_propose
, &propose_osdmap
);
2073 if (propose_osdmap
) {
2074 request_proposal(mon
->osdmon());
2077 for (auto &p
: pending
.filesystems
) {
2078 do_propose
|= maybe_promote_standby(pending
, *p
.second
);
2086 MDSMonitor::MDSMonitor(Monitor
*mn
, Paxos
*p
, string service_name
)
2087 : PaxosService(mn
, p
, service_name
)
2089 handlers
= FileSystemCommandHandler::load(p
);
// Restart hook: reset the beacon-tracking state. last_tick is set to the
// current monotonic time and every recorded beacon is dropped.
// NOTE(review): this is the last definition in the visible chunk, so whether
// further statements follow cannot be confirmed from here.
2092 void MDSMonitor::on_restart()
2094 // Clear out the leader-specific state.
2095 last_tick
= mono_clock::now();
2096 last_beacon
.clear();