1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2009 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "MonmapMonitor.h"
17 #include "messages/MMonCommand.h"
18 #include "messages/MMonJoin.h"
20 #include "common/ceph_argparse.h"
21 #include "common/errno.h"
23 #include "common/config.h"
24 #include "common/cmdparse.h"
26 #include "include/assert.h"
27 #include "include/stringify.h"
29 #define dout_subsys ceph_subsys_mon
31 #define dout_prefix _prefix(_dout, mon)
32 static ostream
& _prefix(std::ostream
*_dout
, Monitor
*mon
) {
33 return *_dout
<< "mon." << mon
->name
<< "@" << mon
->rank
34 << "(" << mon
->get_state_name()
35 << ").monmap v" << mon
->monmap
->epoch
<< " ";
38 void MonmapMonitor::create_initial()
40 dout(10) << __func__
<< " using current monmap" << dendl
;
41 pending_map
= *mon
->monmap
;
42 pending_map
.epoch
= 1;
44 if (g_conf
->mon_debug_no_initial_persistent_features
) {
45 derr
<< __func__
<< " mon_debug_no_initial_persistent_features=true"
48 // initialize with default persistent features for new clusters
49 pending_map
.persistent_features
= ceph::features::mon::get_persistent();
// Bring mon->monmap up to date with the last committed monmap version.
//
// Compares get_last_committed() against the epoch of the in-memory monmap,
// fetches the committed blob with get_version() into monmap_bl, and decodes
// it into mon->monmap. When need_bootstrap is non-null and the committed
// version differs from the in-memory epoch, *need_bootstrap is set to true.
// Finally, a leftover "mkfs"/"monmap" key in the store is erased.
//
// NOTE(review): the statement guarded by the `version <= epoch` check is not
// visible here -- presumably an early return; confirm. `ret` is likewise not
// referenced in the visible code -- presumably it is asserted on; confirm.
53 void MonmapMonitor::update_from_paxos(bool *need_bootstrap
)
55 version_t version
= get_last_committed();
56 if (version
<= mon
->monmap
->get_epoch())
59 dout(10) << __func__
<< " version " << version
60 << ", my v " << mon
->monmap
->epoch
<< dendl
;
// need_bootstrap is an optional out-parameter; only touched when non-null.
62 if (need_bootstrap
&& version
!= mon
->monmap
->get_epoch()) {
63 dout(10) << " signaling that we need a bootstrap" << dendl
;
64 *need_bootstrap
= true;
69 int ret
= get_version(version
, monmap_bl
);
// an empty blob for a committed version is treated as a fatal inconsistency.
71 assert(monmap_bl
.length());
73 dout(10) << __func__
<< " got " << version
<< dendl
;
74 mon
->monmap
->decode(monmap_bl
);
// drop the "mkfs"/"monmap" key if it is still present in the store.
76 if (mon
->store
->exists("mkfs", "monmap")) {
77 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
78 t
->erase("mkfs", "monmap");
79 mon
->store
->apply_transaction(t
);
// Start a new pending monmap as a copy of the currently committed one
// (*mon->monmap), stamp its last_changed with the current time, and log the
// pending epoch.
// NOTE(review): encode_pending() asserts that the pending epoch equals the
// committed epoch + 1, so the epoch is presumably bumped in this function as
// well -- confirm against the complete body.
85 void MonmapMonitor::create_pending()
87 pending_map
= *mon
->monmap
;
89 pending_map
.last_changed
= ceph_clock_now();
90 dout(10) << __func__
<< " monmap epoch " << pending_map
.epoch
<< dendl
;
93 void MonmapMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
95 dout(10) << __func__
<< " epoch " << pending_map
.epoch
<< dendl
;
97 assert(mon
->monmap
->epoch
+ 1 == pending_map
.epoch
||
98 pending_map
.epoch
== 1); // special case mkfs!
100 pending_map
.encode(bl
, mon
->get_quorum_con_features());
102 put_version(t
, pending_map
.epoch
, bl
);
103 put_last_committed(t
, pending_map
.epoch
);
105 // generate a cluster fingerprint, too?
106 if (pending_map
.epoch
== 1) {
107 mon
->prepare_new_fingerprint(t
);
// Completion context that re-runs MonmapMonitor::apply_mon_features() with a
// saved copy of the feature set. apply_mon_features() queues one of these via
// wait_for_writeable_ctx() when the service is not yet writeable.
//
// finish(r) has three outcomes in the visible code: one branch calls
// svc->apply_mon_features(features); r == -EAGAIN or r == -ECANCELED discards
// the features; the remaining path trips an assert.
// NOTE(review): the condition of the first branch is not visible here --
// presumably the success case (r >= 0); confirm.
111 class C_ApplyFeatures
: public Context
{
113 mon_feature_t features
;
115 C_ApplyFeatures(MonmapMonitor
*s
, const mon_feature_t
& f
) :
116 svc(s
), features(f
) { }
117 void finish(int r
) override
{
119 svc
->apply_mon_features(features
);
120 } else if (r
== -EAGAIN
|| r
== -ECANCELED
) {
121 // discard features if we're no longer on the quorum that
122 // established them in the first place.
125 assert(0 == "bad C_ApplyFeatures return value");
// Fold the given monitor feature set into pending_map.persistent_features.
//
// If the service is not writeable, a C_ApplyFeatures context holding
// `features` is queued with wait_for_writeable_ctx() so the call is retried
// later. Otherwise `features` must be a superset of the pending persistent
// features and a subset of the locally supported features (both asserted).
// The persistent subset of `features` that is not yet pending is computed;
// if it is empty the call only logs. If the quorum is smaller than the
// monmap, the new features are logged as not to be enabled. Otherwise the new
// features are merged with the existing ones and stored in the pending map.
//
// NOTE(review): the early returns after the "not writeable", "features match"
// and "full quorum" branches, and any statement following the final
// assignment (e.g. triggering a proposal), are not visible here; confirm.
130 void MonmapMonitor::apply_mon_features(const mon_feature_t
& features
)
132 if (!is_writeable()) {
133 dout(5) << __func__
<< " wait for service to be writeable" << dendl
;
134 wait_for_writeable_ctx(new C_ApplyFeatures(this, features
));
138 assert(is_writeable());
139 assert(features
.contains_all(pending_map
.persistent_features
));
140 // we should never hit this because `features` should be the result
141 // of the quorum's supported features. But if it happens, die.
142 assert(ceph::features::mon::get_supported().contains_all(features
));
// Because `features` was asserted above to contain every pending persistent
// feature, this XOR yields exactly the persistent features that are in
// `features` but not yet in the pending map.
144 mon_feature_t new_features
=
145 (pending_map
.persistent_features
^
146 (features
& ceph::features::mon::get_persistent()));
148 if (new_features
.empty()) {
149 dout(10) << __func__
<< " features match current pending: "
150 << features
<< dendl
;
// new persistent features are only enabled when every monitor in the monmap
// is part of the quorum.
154 if (mon
->get_quorum().size() < mon
->monmap
->size()) {
155 dout(1) << __func__
<< " new features " << new_features
156 << " contains features that require a full quorum"
157 << " (quorum size is " << mon
->get_quorum().size()
158 << ", requires " << mon
->monmap
->size() << "): "
160 << " -- do not enable them!" << dendl
;
// from here on new_features holds the complete resulting set (old | new).
164 new_features
|= pending_map
.persistent_features
;
166 dout(5) << __func__
<< " applying new features to monmap;"
167 << " had " << pending_map
.persistent_features
168 << ", will have " << new_features
<< dendl
;
169 pending_map
.persistent_features
= new_features
;
// Hook run when this service becomes active.
//
// The first time there is at least one committed monmap and the monitor has
// not recorded having joined a quorum, a "joined" = 1 key is written under
// Monitor::MONITOR_NAME and mon->has_ever_joined is set. On the leader the
// current monmap is written to the cluster log at debug level. The quorum's
// monitor features are passed to apply_mon_features().
// NOTE(review): whether the apply_mon_features() call sits inside or after
// the is_leader() branch cannot be determined from the visible lines; confirm.
173 void MonmapMonitor::on_active()
175 if (get_last_committed() >= 1 && !mon
->has_ever_joined
) {
176 // make note of the fact that i was, once, part of the quorum.
177 dout(10) << "noting that i was, once, part of an active quorum." << dendl
;
179 /* This is some form of nasty in-breeding we have between the MonmapMonitor
180 and the Monitor itself. We should find a way to get rid of it given our
181 new architecture. Until then, stick with it since we are a
182 single-threaded process and, truth be told, no one else relies on this
185 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
186 t
->put(Monitor::MONITOR_NAME
, "joined", 1);
187 mon
->store
->apply_transaction(t
);
188 mon
->has_ever_joined
= true;
191 if (mon
->is_leader()) {
192 mon
->clog
->debug() << "monmap " << *mon
->monmap
;
195 apply_mon_features(mon
->get_quorum_mon_features());
// Read-side dispatcher: routes an incoming request by message type.
// MSG_MON_COMMAND goes to preprocess_command(); the other visible branch
// forwards to preprocess_join(). The result of the chosen handler is returned.
// NOTE(review): the case label for the join branch and the default branch are
// not visible here -- presumably MSG_MON_JOIN and an abort; confirm.
198 bool MonmapMonitor::preprocess_query(MonOpRequestRef op
)
200 PaxosServiceMessage
*m
= static_cast<PaxosServiceMessage
*>(op
->get_req());
201 switch (m
->get_type()) {
203 case MSG_MON_COMMAND
:
204 return preprocess_command(op
);
206 return preprocess_join(op
);
213 void MonmapMonitor::dump_info(Formatter
*f
)
215 f
->dump_unsigned("monmap_first_committed", get_first_committed());
216 f
->dump_unsigned("monmap_last_committed", get_last_committed());
217 f
->open_object_section("monmap");
218 mon
->monmap
->dump(f
);
220 f
->open_array_section("quorum");
221 for (set
<int>::iterator q
= mon
->get_quorum().begin(); q
!= mon
->get_quorum().end(); ++q
)
222 f
->dump_int("mon", *q
);
// Handle the read-only "mon ..." commands.
//
// The command in m->cmd is parsed with cmdmap_from_json(); on a parse failure
// the parser's message is sent back with -EINVAL. An "access denied" reply
// with -EACCES is also visible (its guarding session check is not visible
// here). The "format" argument defaults to "plain" and selects the Formatter.
//
// Prefixes handled in the visible code:
//   "mon stat"       - monmap summary plus election epoch, leader and quorum
//   "mon getmap"     - encodes the selected monmap into rdata using the
//                      requesting connection's features
//   "mon dump"       - dumps the selected monmap and the quorum ranks
//   "mon feature ls" - lists supported/persistent features and the features
//                      of the current monmap, optionally with values when
//                      "with_value" is "--with-value"
// For getmap/dump an optional "epoch" argument (default 0) selects a map;
// a lookup through get_version(epoch, bl) and the message
// "there is no map for epoch N" are visible for that path.
//
// The function finishes by replying with (r, rs, rdata) at
// get_last_committed().
// NOTE(review): several declarations (ss, rdata, prefix, format, r, ds, ...)
// and branch conditions are not visible here, nor is the cleanup performed
// when `p != mon->monmap`; confirm before modifying this function.
226 bool MonmapMonitor::preprocess_command(MonOpRequestRef op
)
228 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
233 map
<string
, cmd_vartype
> cmdmap
;
234 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
235 string rs
= ss
.str();
236 mon
->reply_command(op
, -EINVAL
, rs
, rdata
, get_last_committed());
241 cmd_getval(g_ceph_context
, cmdmap
, "prefix", prefix
);
243 MonSession
*session
= m
->get_session();
245 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
250 cmd_getval(g_ceph_context
, cmdmap
, "format", format
, string("plain"));
251 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
253 if (prefix
== "mon stat") {
254 mon
->monmap
->print_summary(ss
);
255 ss
<< ", election epoch " << mon
->get_epoch() << ", leader "
256 << mon
->get_leader() << " " << mon
->get_leader_name()
257 << ", quorum " << mon
->get_quorum() << " " << mon
->get_quorum_names();
262 } else if (prefix
== "mon getmap" ||
263 prefix
== "mon dump") {
267 cmd_getval(g_ceph_context
, cmdmap
, "epoch", epochnum
, (int64_t)0);
270 MonMap
*p
= mon
->monmap
;
273 r
= get_version(epoch
, bl
);
275 ss
<< "there is no map for epoch " << epoch
;
279 assert(bl
.length() > 0);
286 if (prefix
== "mon getmap") {
287 p
->encode(rdata
, m
->get_connection()->get_features());
289 ss
<< "got monmap epoch " << p
->get_epoch();
290 } else if (prefix
== "mon dump") {
293 f
->open_object_section("monmap");
295 f
->open_array_section("quorum");
296 for (set
<int>::iterator q
= mon
->get_quorum().begin();
297 q
!= mon
->get_quorum().end(); ++q
) {
298 f
->dump_int("mon", *q
);
309 ss
<< "dumped monmap epoch " << p
->get_epoch();
311 if (p
!= mon
->monmap
)
314 } else if (prefix
== "mon feature ls") {
316 bool list_with_value
= false;
318 if (cmd_getval(g_ceph_context
, cmdmap
, "with_value", with_value
) &&
319 with_value
== "--with-value") {
320 list_with_value
= true;
323 MonMap
*p
= mon
->monmap
;
326 mon_feature_t supported
= ceph::features::mon::get_supported();
327 mon_feature_t persistent
= ceph::features::mon::get_persistent();
328 mon_feature_t required
= p
->get_required_features();
331 auto print_feature
= [&](mon_feature_t
& m_features
, const char* m_str
) {
334 m_features
.dump_with_value(f
.get(), m_str
);
336 m_features
.dump(f
.get(), m_str
);
339 m_features
.print_with_value(ds
);
341 m_features
.print(ds
);
346 f
->open_object_section("features");
348 f
->open_object_section("all");
349 print_feature(supported
, "supported");
350 print_feature(persistent
, "persistent");
351 f
->close_section(); // all
353 f
->open_object_section("monmap");
354 print_feature(p
->persistent_features
, "persistent");
355 print_feature(p
->optional_features
, "optional");
356 print_feature(required
, "required");
357 f
->close_section(); // monmap
359 f
->close_section(); // features
363 ds
<< "all features" << std::endl
365 print_feature(supported
, nullptr);
368 print_feature(persistent
, nullptr);
372 ds
<< "on current monmap (epoch "
373 << p
->get_epoch() << ")" << std::endl
375 print_feature(p
->persistent_features
, nullptr);
377 // omit optional features in plain-text
378 // makes it easier to read, and they're, currently, empty.
380 print_feature(required
, nullptr);
392 mon
->reply_command(op
, r
, rs
, rdata
, get_last_committed());
// Write-side dispatcher: logs the message and its origin, then routes it by
// message type. MSG_MON_COMMAND goes to prepare_command(); the other visible
// branch forwards to prepare_join(). The chosen handler's result is returned.
// NOTE(review): the case label for the join branch and the default branch are
// not visible here -- presumably MSG_MON_JOIN and an abort; confirm.
399 bool MonmapMonitor::prepare_update(MonOpRequestRef op
)
401 PaxosServiceMessage
*m
= static_cast<PaxosServiceMessage
*>(op
->get_req());
402 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
404 switch (m
->get_type()) {
405 case MSG_MON_COMMAND
:
406 return prepare_command(op
);
408 return prepare_join(op
);
// Handle the "mon ..." commands that modify the monmap.
//
// The command in m->cmd is parsed with cmdmap_from_json(); on a parse failure
// the parser's message is sent back with -EINVAL. An "access denied" reply
// with -EACCES is also visible (its guarding session check is not visible
// here). Checks are made against the committed map (`monmap`, a reference to
// *mon->monmap); modifications go to `pending_map`.
//
// Prefixes handled in the visible code:
//   "mon add"             - parses "addr" (port defaults to CEPH_MON_PORT when
//                           0), rejects name/address clashes with the
//                           committed map, otherwise adds the monitor to
//                           pending_map
//   "mon remove"/"mon rm" - rejects unknown names and removal of the last
//                           monitor, otherwise removes the monitor from
//                           pending_map
//   "mon feature set"     - requires a known feature name, the
//                           "--yes-i-really-mean-it" confirmation and quorum
//                           support, then sets the feature in
//                           pending_map.persistent_features
// Any other prefix produces "unknown command <prefix>".
//
// NOTE(review): error codes assigned on each failure path, the `propose`
// updates and the final return are not visible here; confirm before changing
// control flow.
416 bool MonmapMonitor::prepare_command(MonOpRequestRef op
)
418 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
423 map
<string
, cmd_vartype
> cmdmap
;
424 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
425 string rs
= ss
.str();
426 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
431 cmd_getval(g_ceph_context
, cmdmap
, "prefix", prefix
);
433 MonSession
*session
= m
->get_session();
435 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
439 /* We should follow the following rules:
441 * - 'monmap' is the current, consistent version of the monmap
442 * - 'pending_map' is the uncommitted version of the monmap
444 * All checks for the current state must be made against 'monmap'.
445 * All changes are made against 'pending_map'.
447 * If there are concurrent operations modifying 'pending_map', please
448 * follow the following rules.
450 * - if pending_map has already been changed, the second operation must
451 * wait for the proposal to finish and be run again; This is the easiest
452 * path to guarantee correctness but may impact performance (i.e., it
453 * will take longer for the user to get a reply).
455 * - if the result of the second operation can be guaranteed to be
456 * idempotent, the operation may reply to the user once the proposal
457 * finishes; still needs to wait for the proposal to finish.
459 * - An operation _NEVER_ returns to the user based on pending state.
461 * If an operation does not modify current stable monmap, it may be
462 * serialized before current pending map, regardless of any change that
463 * has been made to the pending map -- remember, pending is uncommitted
464 * state, thus we are not bound by it.
468 MonMap
&monmap
= *mon
->monmap
;
473 * Adding or removing monitors may lead to loss of quorum.
475 * Because quorum may be lost, it's important to reply something
476 * to the user, lest she end up waiting forever for a reply. And
477 * no reply will ever be sent until quorum is formed again.
479 * On the other hand, this means we're leaking uncommitted state
480 * to the user. As such, please be mindful of the reply message.
482 * e.g., 'adding monitor mon.foo' is okay ('adding' is an on-going
483 * operation and conveys its not-yet-permanent nature); whereas
484 * 'added monitor mon.foo' presumes the action has successfully
485 * completed and state has been committed, which may not be true.
489 bool propose
= false;
490 if (prefix
== "mon add") {
492 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
494 cmd_getval(g_ceph_context
, cmdmap
, "addr", addrstr
);
498 if (!addr
.parse(addrstr
.c_str())) {
// NOTE(review): the literal below has no leading space before "does", so the
// message renders as "addr <addrstr>does not parse".
500 ss
<< "addr " << addrstr
<< "does not parse";
504 if (addr
.get_port() == 0) {
505 ss
<< "port defaulted to " << CEPH_MON_PORT
;
506 addr
.set_port(CEPH_MON_PORT
);
510 * If we have a monitor with the same name and different addr, then EEXIST
511 * If we have a monitor with the same addr and different name, then EEXIST
512 * If we have a monitor with the same addr and same name, then wait for
513 * the proposal to finish and return success.
514 * If we don't have the monitor, add it.
518 if (!ss
.str().empty())
522 if (monmap
.contains(name
)) {
523 if (monmap
.get_addr(name
) == addr
) {
524 // stable map contains monitor with the same name at the same address.
525 // serialize before current pending map.
526 err
= 0; // for clarity; this has already been set above.
527 ss
<< "mon." << name
<< " at " << addr
<< " already exists";
531 << " already exists at address " << monmap
.get_addr(name
);
533 } else if (monmap
.contains(addr
)) {
534 // we established on the previous branch that name is different
535 ss
<< "mon." << monmap
.get_name(addr
)
536 << " already exists at address " << addr
;
545 /* Given there's no delay between proposals on the MonmapMonitor (see
546 * MonmapMonitor::should_propose()), there is no point in checking for
547 * a mismatch between name and addr on pending_map.
549 * Once we established the monitor does not exist in the committed state,
550 * we can simply go ahead and add the monitor.
553 pending_map
.add(name
, addr
);
554 pending_map
.last_changed
= ceph_clock_now();
555 ss
<< "adding mon." << name
<< " at " << addr
;
557 dout(0) << __func__
<< " proposing new mon." << name
<< dendl
;
559 } else if (prefix
== "mon remove" ||
560 prefix
== "mon rm") {
562 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
563 if (!monmap
.contains(name
)) {
565 ss
<< "mon." << name
<< " does not exist or has already been removed";
569 if (monmap
.size() == 1) {
571 ss
<< "error: refusing removal of last monitor " << name
;
575 /* At the time of writing, there is no risk of races when multiple clients
576 * attempt to use the same name. The reason is simple but may not be
579 * In a nutshell, we do not collate proposals on the MonmapMonitor. As
580 * soon as we return 'true' below, PaxosService::dispatch() will check if
581 * the service should propose, and - if so - the service will be marked as
582 * 'proposing' and a proposal will be triggered. The PaxosService class
583 * guarantees that once a service is marked 'proposing' no further writes
586 * The decision on whether the service should propose or not is, in this
587 * case, made by MonmapMonitor::should_propose(), which always considers
588 * the proposal delay being 0.0 seconds. This is key for PaxosService to
589 * trigger the proposal immediately.
590 * 0.0 seconds of delay.
592 * From the above, there's no point in performing further checks on the
593 * pending_map, as we don't ever have multiple proposals in-flight in
594 * this service. As we've established the committed state contains the
595 * monitor, we can simply go ahead and remove it.
597 * Please note that the code hinges on all of the above to be true. It
598 * has been true since time immemorial and we don't see a good reason
599 * to make it sturdier at this time - mainly because we don't think it's
600 * going to change any time soon, lest for any bug that may be unwillingly
604 entity_addr_t addr
= pending_map
.get_addr(name
);
605 pending_map
.remove(name
);
606 pending_map
.last_changed
= ceph_clock_now();
607 ss
<< "removing mon." << name
<< " at " << addr
608 << ", there will be " << pending_map
.size() << " monitors" ;
612 } else if (prefix
== "mon feature set") {
616 * We currently only support setting/unsetting persistent features.
617 * This is by design, given at the moment we still don't have optional
618 * features, and, as such, there is no point introducing an interface
619 * to manipulate them. This allows us to provide a cleaner, more
620 * intuitive interface to the user, modifying solely persistent
623 * In the future we should consider adding another interface to handle
624 * optional features/flags; e.g., 'mon feature flag set/unset', or
625 * 'mon flag set/unset'.
628 if (!cmd_getval(g_ceph_context
, cmdmap
, "feature_name", feature_name
)) {
629 ss
<< "missing required feature name";
634 mon_feature_t feature
;
635 feature
= ceph::features::mon::get_feature_by_name(feature_name
);
636 if (feature
== ceph::features::mon::FEATURE_NONE
) {
637 ss
<< "unknown feature '" << feature_name
<< "'";
643 if (!cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
) ||
644 sure
!= "--yes-i-really-mean-it") {
645 ss
<< "please specify '--yes-i-really-mean-it' if you "
646 << "really, **really** want to set feature '"
647 << feature
<< "' in the monmap.";
652 if (!mon
->get_quorum_mon_features().contains_all(feature
)) {
653 ss
<< "current quorum does not support feature '" << feature
654 << "'; supported features: "
655 << mon
->get_quorum_mon_features();
660 ss
<< "setting feature '" << feature
<< "'";
663 if (monmap
.persistent_features
.contains_all(feature
)) {
664 dout(10) << __func__
<< " feature '" << feature
665 << "' already set on monmap; no-op." << dendl
;
669 pending_map
.persistent_features
.set_feature(feature
);
670 pending_map
.last_changed
= ceph_clock_now();
673 dout(1) << __func__
<< ss
.str() << "; new features will be: "
674 << "persistent = " << pending_map
.persistent_features
675 // output optional nevertheless, for auditing purposes.
676 << ", optional = " << pending_map
.optional_features
<< dendl
;
679 ss
<< "unknown command " << prefix
;
685 mon
->reply_command(op
, err
, rs
, get_last_committed());
686 // we are returning to the user; do not propose.
// Read-side filter for MMonJoin requests.
//
// Logs the joining name/address, checks that the session is capable of
// MON_CAP_W | MON_CAP_X on "mon" (logging " insufficient caps" otherwise),
// and recognises joins that pending_map already reflects: either the name is
// present with a non-blank address, or the address is present and already
// mapped to the same name.
// NOTE(review): the return value of each branch is not visible here --
// presumably true (handled, nothing to do) in the three logged cases and
// false otherwise; confirm.
// NOTE(review): these checks read pending_map rather than the committed
// monmap; verify that this is intended.
690 bool MonmapMonitor::preprocess_join(MonOpRequestRef op
)
692 MMonJoin
*join
= static_cast<MMonJoin
*>(op
->get_req());
693 dout(10) << __func__
<< " " << join
->name
<< " at " << join
->addr
<< dendl
;
695 MonSession
*session
= join
->get_session();
697 !session
->is_capable("mon", MON_CAP_W
| MON_CAP_X
)) {
698 dout(10) << " insufficient caps" << dendl
;
702 if (pending_map
.contains(join
->name
) && !pending_map
.get_addr(join
->name
).is_blank_ip()) {
703 dout(10) << " already have " << join
->name
<< dendl
;
706 if (pending_map
.contains(join
->addr
) && pending_map
.get_name(join
->addr
) == join
->name
) {
707 dout(10) << " already have " << join
->addr
<< dendl
;
// Apply an MMonJoin request to pending_map.
//
// Any existing entry with the same name is removed, then any existing entry
// with the same address, and the (name, addr) pair is added afresh;
// last_changed is stamped with the current time.
// NOTE(review): the return statement is not visible here -- presumably true
// so that the change gets proposed; confirm.
712 bool MonmapMonitor::prepare_join(MonOpRequestRef op
)
714 MMonJoin
*join
= static_cast<MMonJoin
*>(op
->get_req());
715 dout(0) << "adding/updating " << join
->name
<< " at " << join
->addr
<< " to monitor cluster" << dendl
;
// drop stale entries by name first, then by address, so add() cannot clash.
716 if (pending_map
.contains(join
->name
))
717 pending_map
.remove(join
->name
);
718 if (pending_map
.contains(join
->addr
))
719 pending_map
.remove(pending_map
.get_name(join
->addr
));
720 pending_map
.add(join
->name
, join
->addr
);
721 pending_map
.last_changed
= ceph_clock_now();
// Decide whether this service should propose and report the proposal delay
// through `delay`.
// NOTE(review): the body is not visible here. Comments in prepare_command()
// state that this function always considers the proposal delay to be 0.0
// seconds so that proposals are triggered immediately -- confirm against the
// full implementation.
725 bool MonmapMonitor::should_propose(double& delay
)
// Report monitors that are in the monmap but not in the current quorum.
//
// `max` is the number of monitors in the monmap and `actual` the quorum size.
// A HEALTH_WARN entry "<max-actual> mons down, quorum ..." is appended to
// `summary`; for every rank in [0, max) that is absent from the quorum, a
// HEALTH_WARN entry naming the monitor, its rank and its address is appended
// to `detail`. `cct` is not referenced in the visible code.
// NOTE(review): the guards around the summary and detail sections are not
// visible here -- presumably `actual < max` and a null check on `detail`;
// confirm.
731 void MonmapMonitor::get_health(list
<pair
<health_status_t
, string
> >& summary
,
732 list
<pair
<health_status_t
, string
> > *detail
,
733 CephContext
*cct
) const
735 int max
= mon
->monmap
->size();
736 int actual
= mon
->get_quorum().size();
739 ss
<< (max
-actual
) << " mons down, quorum " << mon
->get_quorum() << " " << mon
->get_quorum_names();
740 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
// local copy of the quorum set, used for the per-rank membership test below.
742 set
<int> q
= mon
->get_quorum();
743 for (int i
=0; i
<max
; i
++) {
744 if (q
.count(i
) == 0) {
746 ss
<< "mon." << mon
->monmap
->get_name(i
) << " (rank " << i
747 << ") addr " << mon
->monmap
->get_addr(i
)
748 << " is down (out of quorum)";
749 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
// Read the most recently committed monmap blob into `bl`.
//
// Looks up get_last_committed(), checks that the store has an entry for that
// version under this service's name, then loads it with get_version(). A
// failure to load is logged at level 1 together with its error string.
// NOTE(review): the return statements are not visible here -- presumably a
// negative errno when the version is missing or get_version() fails, and 0 on
// success; confirm.
756 int MonmapMonitor::get_monmap(bufferlist
&bl
)
758 version_t latest_ver
= get_last_committed();
759 dout(10) << __func__
<< " ver " << latest_ver
<< dendl
;
761 if (!mon
->store
->exists(get_service_name(), stringify(latest_ver
)))
764 int err
= get_version(latest_ver
, bl
);
766 dout(1) << __func__
<< " error obtaining monmap: "
767 << cpp_strerror(err
) << dendl
;
// Visit every subscription of type "monmap".
//
// Runs a callback through mon->with_session_map() that looks up the "monmap"
// entry in session_map.subs and iterates over the subscriptions found there.
// NOTE(review): the statement taken when no such entry exists and the loop
// body are not visible here -- presumably an early return and a call to
// check_sub(sub); confirm.
773 void MonmapMonitor::check_subs()
775 const string type
= "monmap";
776 mon
->with_session_map([this, &type
](const MonSessionMap
& session_map
) {
777 auto subs
= session_map
.subs
.find(type
);
778 if (subs
== session_map
.subs
.end())
780 for (auto sub
: *subs
->second
) {
// Service a single "monmap" subscription.
//
// When the subscriber's `next` epoch is not beyond the current monmap epoch,
// the latest monmap is sent over the subscriber's connection. Afterwards the
// subscription is either removed from the session map or has `next` advanced
// to epoch + 1.
// NOTE(review): the condition choosing between removal and advancing `next`
// is not visible here -- presumably one-time subscriptions are removed;
// confirm.
786 void MonmapMonitor::check_sub(Subscription
*sub
)
788 const auto epoch
= mon
->monmap
->get_epoch();
790 << " monmap next " << sub
->next
791 << " have " << epoch
<< dendl
;
792 if (sub
->next
<= epoch
) {
793 mon
->send_latest_monmap(sub
->session
->con
.get());
795 mon
->with_session_map([this, sub
](MonSessionMap
& session_map
) {
796 session_map
.remove_sub(sub
);
799 sub
->next
= epoch
+ 1;