1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2009 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "MonmapMonitor.h"
17 #include "messages/MMonCommand.h"
18 #include "messages/MMonJoin.h"
20 #include "common/ceph_argparse.h"
21 #include "common/errno.h"
23 #include "common/config.h"
24 #include "common/cmdparse.h"
26 #include "include/assert.h"
27 #include "include/stringify.h"
29 #define dout_subsys ceph_subsys_mon
31 #define dout_prefix _prefix(_dout, mon)
32 static ostream
& _prefix(std::ostream
*_dout
, Monitor
*mon
) {
33 return *_dout
<< "mon." << mon
->name
<< "@" << mon
->rank
34 << "(" << mon
->get_state_name()
35 << ").monmap v" << mon
->monmap
->epoch
<< " ";
// Seeds pending_map for the very first epoch: copies the monitor's current
// monmap into pending_map and pins the pending epoch to 1.
38 void MonmapMonitor::create_initial()
40 dout(10) << __func__
<< " using current monmap" << dendl
;
41 pending_map
= *mon
->monmap
;
42 pending_map
.epoch
= 1;
// Debug knob: when it is set an error-level log line is emitted.
// NOTE(review): the lines between this log statement and the assignment
// further down are not visible here; presumably the assignment is the
// alternative branch and is skipped when the knob is set -- confirm.
44 if (g_conf
->mon_debug_no_initial_persistent_features
) {
45 derr
<< __func__
<< " mon_debug_no_initial_persistent_features=true"
48 // initialize with default persistent features for new clusters
49 pending_map
.persistent_features
= ceph::features::mon::get_persistent();
// Refreshes mon->monmap from the last committed version of this service.
//
// need_bootstrap: optional out-flag (checked for null before use); set to
// true when the committed version differs from the epoch currently held.
53 void MonmapMonitor::update_from_paxos(bool *need_bootstrap
)
55 version_t version
= get_last_committed();
// Committed version is not newer than the map we already hold.
// NOTE(review): the guarded statement is not visible here; presumably an
// early return -- confirm.
56 if (version
<= mon
->monmap
->get_epoch())
59 dout(10) << __func__
<< " version " << version
60 << ", my v " << mon
->monmap
->epoch
<< dendl
;
62 if (need_bootstrap
&& version
!= mon
->monmap
->get_epoch()) {
63 dout(10) << " signaling that we need a bootstrap" << dendl
;
64 *need_bootstrap
= true;
// Fetch the committed blob for `version` into monmap_bl (declared outside
// the visible lines); an empty buffer is treated as fatal below.
69 int ret
= get_version(version
, monmap_bl
);
71 assert(monmap_bl
.length());
73 dout(10) << __func__
<< " got " << version
<< dendl
;
74 mon
->monmap
->decode(monmap_bl
);
// If the store still has a "mkfs"/"monmap" entry, erase it now that a
// committed map has been decoded.
76 if (mon
->store
->exists("mkfs", "monmap")) {
77 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
78 t
->erase("mkfs", "monmap");
79 mon
->store
->apply_transaction(t
);
// Starts a new pending map: copies the monitor's current monmap into
// pending_map, stamps last_changed with the current time and logs the
// pending epoch.
// NOTE(review): one original line between the copy and the timestamp is not
// visible here; any epoch adjustment would live there -- confirm.
85 void MonmapMonitor::create_pending()
87 pending_map
= *mon
->monmap
;
89 pending_map
.last_changed
= ceph_clock_now();
90 dout(10) << __func__
<< " monmap epoch " << pending_map
.epoch
<< dendl
;
// Writes pending_map into transaction `t`.
//
// The map is encoded with the quorum's connection features, stored under
// its epoch via put_version(), and that epoch is recorded with
// put_last_committed(). For epoch 1 a new cluster fingerprint is also
// prepared in the same transaction.
93 void MonmapMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
95 dout(10) << __func__
<< " epoch " << pending_map
.epoch
<< dendl
;
// The pending epoch must be exactly one past the current map's epoch; the
// only exception is epoch 1.
97 assert(mon
->monmap
->epoch
+ 1 == pending_map
.epoch
||
98 pending_map
.epoch
== 1); // special case mkfs!
// `bl` is declared on a line that is not visible here.
100 pending_map
.encode(bl
, mon
->get_quorum_con_features());
102 put_version(t
, pending_map
.epoch
, bl
);
103 put_last_committed(t
, pending_map
.epoch
);
105 // generate a cluster fingerprint, too?
106 if (pending_map
.epoch
== 1) {
107 mon
->prepare_new_fingerprint(t
);
// Completion context that carries a feature set and, when finished, hands
// it back to MonmapMonitor::apply_mon_features(). It is created by
// apply_mon_features() when the service is not yet writeable.
//
// finish(r): one branch applies the saved features, -EAGAIN / -ECANCELED
// drops them silently, and the remaining path asserts.
// NOTE(review): the declaration of `svc` and the condition of the first
// branch are not visible here -- confirm against the full file.
111 class C_ApplyFeatures
: public Context
{
113 mon_feature_t features
;
115 C_ApplyFeatures(MonmapMonitor
*s
, const mon_feature_t
& f
) :
116 svc(s
), features(f
) { }
117 void finish(int r
) override
{
119 svc
->apply_mon_features(features
);
120 } else if (r
== -EAGAIN
|| r
== -ECANCELED
) {
121 // discard features if we're no longer on the quorum that
122 // established them in the first place.
125 assert(0 == "bad C_ApplyFeatures return value");
// Merges `features` into pending_map.persistent_features.
//
// features: must contain everything already persistent in pending_map and
// must itself be contained in the supported set; both are asserted.
//
// If the service is not writeable a C_ApplyFeatures is queued with
// wait_for_writeable_ctx() so the call can be repeated later.
// NOTE(review): the statements that leave the function after the
// not-writeable, nothing-new and partial-quorum branches are not visible
// here; presumably each returns early -- confirm.
130 void MonmapMonitor::apply_mon_features(const mon_feature_t
& features
)
132 if (!is_writeable()) {
133 dout(5) << __func__
<< " wait for service to be writeable" << dendl
;
134 wait_for_writeable_ctx(new C_ApplyFeatures(this, features
));
138 assert(is_writeable());
139 assert(features
.contains_all(pending_map
.persistent_features
));
140 // we should never hit this because `features` should be the result
141 // of the quorum's supported features. But if it happens, die.
142 assert(ceph::features::mon::get_supported().contains_all(features
));
// XOR of the current persistent set with the persistent-capable part of
// `features`: the bits in which the two differ.
144 mon_feature_t new_features
=
145 (pending_map
.persistent_features
^
146 (features
& ceph::features::mon::get_persistent()));
148 if (new_features
.empty()) {
149 dout(10) << __func__
<< " features match current pending: "
150 << features
<< dendl
;
// Quorum is smaller than the monmap: log that the new features are not
// being enabled.
154 if (mon
->get_quorum().size() < mon
->monmap
->size()) {
155 dout(1) << __func__
<< " new features " << new_features
156 << " contains features that require a full quorum"
157 << " (quorum size is " << mon
->get_quorum().size()
158 << ", requires " << mon
->monmap
->size() << "): "
160 << " -- do not enable them!" << dendl
;
// Keep what was already persistent and add the new bits.
164 new_features
|= pending_map
.persistent_features
;
166 dout(5) << __func__
<< " applying new features to monmap;"
167 << " had " << pending_map
.persistent_features
168 << ", will have " << new_features
<< dendl
;
169 pending_map
.persistent_features
= new_features
;
// Hook run when the service becomes active.
//
// - The first time a committed version exists and mon->has_ever_joined is
//   still false, stores "joined" = 1 under Monitor::MONITOR_NAME and sets
//   the flag.
// - On the leader, writes the current monmap to the cluster log.
// - Feeds the quorum's monitor features to apply_mon_features().
173 void MonmapMonitor::on_active()
175 if (get_last_committed() >= 1 && !mon
->has_ever_joined
) {
176 // make note of the fact that i was, once, part of the quorum.
177 dout(10) << "noting that i was, once, part of an active quorum." << dendl
;
179 /* This is some form of nasty in-breeding we have between the MonmapMonitor
180 and the Monitor itself. We should find a way to get rid of it given our
181 new architecture. Until then, stick with it since we are a
182 single-threaded process and, truth be told, no one else relies on this
185 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
186 t
->put(Monitor::MONITOR_NAME
, "joined", 1);
187 mon
->store
->apply_transaction(t
);
188 mon
->has_ever_joined
= true;
191 if (mon
->is_leader())
192 mon
->clog
->info() << "monmap " << *mon
->monmap
;
194 apply_mon_features(mon
->get_quorum_mon_features());
// Read-only dispatch on the request's message type: MSG_MON_COMMAND goes to
// preprocess_command(), and another case goes to preprocess_join().
// NOTE(review): the label for the preprocess_join() case and the default
// branch are not visible here -- confirm against the full file.
197 bool MonmapMonitor::preprocess_query(MonOpRequestRef op
)
199 PaxosServiceMessage
*m
= static_cast<PaxosServiceMessage
*>(op
->get_req());
200 switch (m
->get_type()) {
202 case MSG_MON_COMMAND
:
203 return preprocess_command(op
);
205 return preprocess_join(op
);
// Dumps this service's state into formatter `f`: the first and last
// committed versions, the current monmap inside a "monmap" object section,
// and the quorum ranks as "mon" integers inside a "quorum" array section.
// NOTE(review): the calls that close the two sections are not visible here.
212 void MonmapMonitor::dump_info(Formatter
*f
)
214 f
->dump_unsigned("monmap_first_committed", get_first_committed());
215 f
->dump_unsigned("monmap_last_committed", get_last_committed());
216 f
->open_object_section("monmap");
217 mon
->monmap
->dump(f
);
219 f
->open_array_section("quorum");
220 for (set
<int>::iterator q
= mon
->get_quorum().begin(); q
!= mon
->get_quorum().end(); ++q
)
221 f
->dump_int("mon", *q
);
// Handles the read-only monitor-map commands carried by an MMonCommand.
//
// Visible command prefixes:
//   "mon stat"               - one-line summary plus election epoch, leader
//                              and quorum
//   "mon getmap", "mon dump" - encode or dump a monmap, optionally for a
//                              specific "epoch"
//   "mon feature ls"         - list supported / persistent / required
//                              monitor features
//
// A command that fails to parse is answered with -EINVAL; there is also an
// "access denied" (-EACCES) reply path.
// NOTE(review): several declarations (ss, rdata, r, prefix, format, ...)
// and branch conditions sit on lines that are not visible here.
225 bool MonmapMonitor::preprocess_command(MonOpRequestRef op
)
227 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
232 map
<string
, cmd_vartype
> cmdmap
;
// Malformed command JSON: report the parser's message with -EINVAL.
233 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
234 string rs
= ss
.str();
235 mon
->reply_command(op
, -EINVAL
, rs
, rdata
, get_last_committed());
240 cmd_getval(g_ceph_context
, cmdmap
, "prefix", prefix
);
242 MonSession
*session
= m
->get_session();
// NOTE(review): the condition guarding this reply is not visible here;
// presumably a missing session -- confirm.
244 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
// Output format defaults to "plain" when the command does not give one.
249 cmd_getval(g_ceph_context
, cmdmap
, "format", format
, string("plain"));
250 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
252 if (prefix
== "mon stat") {
253 mon
->monmap
->print_summary(ss
);
254 ss
<< ", election epoch " << mon
->get_epoch() << ", leader "
255 << mon
->get_leader() << " " << mon
->get_leader_name()
256 << ", quorum " << mon
->get_quorum() << " " << mon
->get_quorum_names();
261 } else if (prefix
== "mon getmap" ||
262 prefix
== "mon dump") {
// Optional "epoch" argument; 0 is the default when it is absent.
266 cmd_getval(g_ceph_context
, cmdmap
, "epoch", epochnum
, (int64_t)0);
// `p` starts out pointing at the live monmap; the `p != mon->monmap`
// check further down distinguishes a separately obtained map.
269 MonMap
*p
= mon
->monmap
;
272 r
= get_version(epoch
, bl
);
274 ss
<< "there is no map for epoch " << epoch
;
278 assert(bl
.length() > 0);
285 if (prefix
== "mon getmap") {
// Encode using the feature bits of the requesting connection.
286 p
->encode(rdata
, m
->get_connection()->get_features());
288 ss
<< "got monmap epoch " << p
->get_epoch();
289 } else if (prefix
== "mon dump") {
292 f
->open_object_section("monmap");
294 f
->open_array_section("quorum");
295 for (set
<int>::iterator q
= mon
->get_quorum().begin();
296 q
!= mon
->get_quorum().end(); ++q
) {
297 f
->dump_int("mon", *q
);
308 ss
<< "dumped monmap epoch " << p
->get_epoch();
// NOTE(review): the statement guarded by this check is not visible here;
// presumably it releases a map obtained for a specific epoch -- confirm.
310 if (p
!= mon
->monmap
)
313 } else if (prefix
== "mon feature ls") {
315 bool list_with_value
= false;
// The literal "--with-value" argument switches on the *_with_value output.
317 if (cmd_getval(g_ceph_context
, cmdmap
, "with_value", with_value
) &&
318 with_value
== "--with-value") {
319 list_with_value
= true;
322 MonMap
*p
= mon
->monmap
;
325 mon_feature_t supported
= ceph::features::mon::get_supported();
326 mon_feature_t persistent
= ceph::features::mon::get_persistent();
327 mon_feature_t required
= p
->get_required_features();
// Helper that emits one feature set either through the formatter (dump*)
// or to the plain-text stream `ds` (print*).
// NOTE(review): the conditions choosing between the four calls are not
// visible here.
330 auto print_feature
= [&](mon_feature_t
& m_features
, const char* m_str
) {
333 m_features
.dump_with_value(f
.get(), m_str
);
335 m_features
.dump(f
.get(), m_str
);
338 m_features
.print_with_value(ds
);
340 m_features
.print(ds
);
// Structured output: "features" -> { "all": {...}, "monmap": {...} }.
345 f
->open_object_section("features");
347 f
->open_object_section("all");
348 print_feature(supported
, "supported");
349 print_feature(persistent
, "persistent");
350 f
->close_section(); // all
352 f
->open_object_section("monmap");
353 print_feature(p
->persistent_features
, "persistent");
354 print_feature(p
->optional_features
, "optional");
355 print_feature(required
, "required");
356 f
->close_section(); // monmap
358 f
->close_section(); // features
// Plain-text output of the same information.
362 ds
<< "all features" << std::endl
364 print_feature(supported
, nullptr);
367 print_feature(persistent
, nullptr);
371 ds
<< "on current monmap (epoch "
372 << p
->get_epoch() << ")" << std::endl
374 print_feature(p
->persistent_features
, nullptr);
376 // omit optional features in plain-text
377 // makes it easier to read, and they're, currently, empty.
379 print_feature(required
, nullptr);
// Final reply with the accumulated status, message and payload.
391 mon
->reply_command(op
, r
, rs
, rdata
, get_last_committed());
// Write-path dispatch on the request's message type: logs the message and
// its source, then hands MSG_MON_COMMAND to prepare_command() and another
// case to prepare_join().
// NOTE(review): the label for the prepare_join() case and the default
// branch are not visible here -- confirm against the full file.
398 bool MonmapMonitor::prepare_update(MonOpRequestRef op
)
400 PaxosServiceMessage
*m
= static_cast<PaxosServiceMessage
*>(op
->get_req());
401 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
403 switch (m
->get_type()) {
404 case MSG_MON_COMMAND
:
405 return prepare_command(op
);
407 return prepare_join(op
);
// Handles the monitor-map commands that modify pending_map.
//
// Visible command prefixes:
//   "mon add"                - add a monitor by name and address
//   "mon remove" / "mon rm"  - remove a monitor by name
//   "mon feature set"        - set a persistent monitor feature
// Any other prefix is answered with "unknown command <prefix>".
//
// Checks are made against the committed map (`monmap`); changes are made to
// pending_map, as laid out in the long comment below.
//
// Change in this revision: the "mon add" parse-failure message now has a
// space between the address and "does not parse" (it used to read
// "addr <addr>does not parse").
//
// NOTE(review): several declarations (ss, err, prefix, name, addrstr, addr,
// ...), branch conditions and return/goto statements sit on lines that are
// not visible here; the visible lines are kept as they are.
415 bool MonmapMonitor::prepare_command(MonOpRequestRef op
)
417 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
422 map
<string
, cmd_vartype
> cmdmap
;
// Malformed command JSON: report the parser's message with -EINVAL.
423 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
424 string rs
= ss
.str();
425 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
430 cmd_getval(g_ceph_context
, cmdmap
, "prefix", prefix
);
432 MonSession
*session
= m
->get_session();
// NOTE(review): the condition guarding this reply is not visible here;
// presumably a missing session -- confirm.
434 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
438 /* We should follow the following rules:
440 * - 'monmap' is the current, consistent version of the monmap
441 * - 'pending_map' is the uncommitted version of the monmap
443 * All checks for the current state must be made against 'monmap'.
444 * All changes are made against 'pending_map'.
446 * If there are concurrent operations modifying 'pending_map', please
447 * follow the following rules.
449 * - if pending_map has already been changed, the second operation must
450 * wait for the proposal to finish and be run again; This is the easiest
451 * path to guarantee correctness but may impact performance (i.e., it
452 * will take longer for the user to get a reply).
454 * - if the result of the second operation can be guaranteed to be
455 * idempotent, the operation may reply to the user once the proposal
456 * finishes; still needs to wait for the proposal to finish.
458 * - An operation _NEVER_ returns to the user based on pending state.
460 * If an operation does not modify current stable monmap, it may be
461 * serialized before current pending map, regardless of any change that
462 * has been made to the pending map -- remember, pending is uncommitted
463 * state, thus we are not bound by it.
467 MonMap
&monmap
= *mon
->monmap
;
472 * Adding or removing monitors may lead to loss of quorum.
474 * Because quorum may be lost, it's important to reply something
475 * to the user, lest she end up waiting forever for a reply. And
476 * no reply will ever be sent until quorum is formed again.
478 * On the other hand, this means we're leaking uncommitted state
479 * to the user. As such, please be mindful of the reply message.
481 * e.g., 'adding monitor mon.foo' is okay ('adding' is an on-going
482 * operation and conveys its not-yet-permanent nature); whereas
483 * 'added monitor mon.foo' presumes the action has successfully
484 * completed and state has been committed, which may not be true.
488 bool propose
= false;
489 if (prefix
== "mon add") {
491 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
493 cmd_getval(g_ceph_context
, cmdmap
, "addr", addrstr
);
497 if (!addr
.parse(addrstr
.c_str())) {
// Keep a space between the address and the rest of the message.
499 ss
<< "addr " << addrstr
<< " does not parse";
// No port given: fall back to the default monitor port and say so.
503 if (addr
.get_port() == 0) {
504 ss
<< "port defaulted to " << CEPH_MON_PORT
;
505 addr
.set_port(CEPH_MON_PORT
);
509 * If we have a monitor with the same name and different addr, then EEXIST
510 * If we have a monitor with the same addr and different name, then EEXIST
511 * If we have a monitor with the same addr and same name, then wait for
512 * the proposal to finish and return success.
513 * If we don't have the monitor, add it.
517 if (!ss
.str().empty())
521 if (monmap
.contains(name
)) {
522 if (monmap
.get_addr(name
) == addr
) {
523 // stable map contains monitor with the same name at the same address.
524 // serialize before current pending map.
525 err
= 0; // for clarity; this has already been set above.
526 ss
<< "mon." << name
<< " at " << addr
<< " already exists";
530 << " already exists at address " << monmap
.get_addr(name
);
532 } else if (monmap
.contains(addr
)) {
533 // we established on the previous branch that name is different
534 ss
<< "mon." << monmap
.get_name(addr
)
535 << " already exists at address " << addr
;
544 /* Given there's no delay between proposals on the MonmapMonitor (see
545 * MonmapMonitor::should_propose()), there is no point in checking for
546 * a mismatch between name and addr on pending_map.
548 * Once we established the monitor does not exist in the committed state,
549 * we can simply go ahead and add the monitor.
552 pending_map
.add(name
, addr
);
553 pending_map
.last_changed
= ceph_clock_now();
554 ss
<< "adding mon." << name
<< " at " << addr
;
556 dout(0) << __func__
<< " proposing new mon." << name
<< dendl
;
558 } else if (prefix
== "mon remove" ||
559 prefix
== "mon rm") {
561 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
562 if (!monmap
.contains(name
)) {
564 ss
<< "mon." << name
<< " does not exist or has already been removed";
568 if (monmap
.size() == 1) {
570 ss
<< "error: refusing removal of last monitor " << name
;
574 /* At the time of writing, there is no risk of races when multiple clients
575 * attempt to use the same name. The reason is simple but may not be
578 * In a nutshell, we do not collate proposals on the MonmapMonitor. As
579 * soon as we return 'true' below, PaxosService::dispatch() will check if
580 * the service should propose, and - if so - the service will be marked as
581 * 'proposing' and a proposal will be triggered. The PaxosService class
582 * guarantees that once a service is marked 'proposing' no further writes
585 * The decision on whether the service should propose or not is, in this
586 * case, made by MonmapMonitor::should_propose(), which always considers
587 * the proposal delay being 0.0 seconds. This is key for PaxosService to
588 * trigger the proposal immediately.
589 * 0.0 seconds of delay.
591 * From the above, there's no point in performing further checks on the
592 * pending_map, as we don't ever have multiple proposals in-flight in
593 * this service. As we've established the committed state contains the
594 * monitor, we can simply go ahead and remove it.
596 * Please note that the code hinges on all of the above to be true. It
597 * has been true since time immemorial and we don't see a good reason
598 * to make it sturdier at this time - mainly because we don't think it's
599 * going to change any time soon, lest for any bug that may be unwillingly
603 entity_addr_t addr
= pending_map
.get_addr(name
);
604 pending_map
.remove(name
);
605 pending_map
.last_changed
= ceph_clock_now();
606 ss
<< "removing mon." << name
<< " at " << addr
607 << ", there will be " << pending_map
.size() << " monitors" ;
611 } else if (prefix
== "mon feature set") {
615 * We currently only support setting/unsetting persistent features.
616 * This is by design, given at the moment we still don't have optional
617 * features, and, as such, there is no point introducing an interface
618 * to manipulate them. This allows us to provide a cleaner, more
619 * intuitive interface to the user, modifying solely persistent
622 * In the future we should consider adding another interface to handle
623 * optional features/flags; e.g., 'mon feature flag set/unset', or
624 * 'mon flag set/unset'.
627 if (!cmd_getval(g_ceph_context
, cmdmap
, "feature_name", feature_name
)) {
628 ss
<< "missing required feature name";
633 mon_feature_t feature
;
634 feature
= ceph::features::mon::get_feature_by_name(feature_name
);
635 if (feature
== ceph::features::mon::FEATURE_NONE
) {
636 ss
<< "unknown feature '" << feature_name
<< "'";
// The caller must pass the literal confirmation flag.
642 if (!cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
) ||
643 sure
!= "--yes-i-really-mean-it") {
644 ss
<< "please specify '--yes-i-really-mean-it' if you "
645 << "really, **really** want to set feature '"
646 << feature
<< "' in the monmap.";
// The feature must be supported by every monitor in the current quorum.
651 if (!mon
->get_quorum_mon_features().contains_all(feature
)) {
652 ss
<< "current quorum does not support feature '" << feature
653 << "'; supported features: "
654 << mon
->get_quorum_mon_features();
659 ss
<< "setting feature '" << feature
<< "'";
662 if (monmap
.persistent_features
.contains_all(feature
)) {
663 dout(10) << __func__
<< " feature '" << feature
664 << "' already set on monmap; no-op." << dendl
;
668 pending_map
.persistent_features
.set_feature(feature
);
669 pending_map
.last_changed
= ceph_clock_now();
672 dout(1) << __func__
<< ss
.str() << "; new features will be: "
673 << "persistent = " << pending_map
.persistent_features
674 // output optional nevertheless, for auditing purposes.
675 << ", optional = " << pending_map
.optional_features
<< dendl
;
678 ss
<< "unknown command " << prefix
;
684 mon
->reply_command(op
, err
, rs
, get_last_committed());
685 // we are returning to the user; do not propose.
// Read-only screening of an MMonJoin request.
//
// Logs the request, checks the session's "mon" write+execute capability,
// and recognizes two "already have" cases against pending_map:
//   - the name is present with a non-blank address
//   - the address is present and already maps to the same name
// NOTE(review): the start of the capability condition and the statements
// ending each branch are not visible here; presumably each branch stops
// further processing -- confirm.
689 bool MonmapMonitor::preprocess_join(MonOpRequestRef op
)
691 MMonJoin
*join
= static_cast<MMonJoin
*>(op
->get_req());
692 dout(10) << __func__
<< " " << join
->name
<< " at " << join
->addr
<< dendl
;
694 MonSession
*session
= join
->get_session();
696 !session
->is_capable("mon", MON_CAP_W
| MON_CAP_X
)) {
697 dout(10) << " insufficient caps" << dendl
;
701 if (pending_map
.contains(join
->name
) && !pending_map
.get_addr(join
->name
).is_blank_ip()) {
702 dout(10) << " already have " << join
->name
<< dendl
;
705 if (pending_map
.contains(join
->addr
) && pending_map
.get_name(join
->addr
) == join
->name
) {
706 dout(10) << " already have " << join
->addr
<< dendl
;
// Applies an MMonJoin to pending_map.
//
// Any existing entry with the same name, and any existing entry at the same
// address, is removed first; then (name, addr) is added and last_changed is
// stamped with the current time.
// NOTE(review): the return statement is not visible here.
711 bool MonmapMonitor::prepare_join(MonOpRequestRef op
)
713 MMonJoin
*join
= static_cast<MMonJoin
*>(op
->get_req());
714 dout(0) << "adding/updating " << join
->name
<< " at " << join
->addr
<< " to monitor cluster" << dendl
;
715 if (pending_map
.contains(join
->name
))
716 pending_map
.remove(join
->name
);
717 if (pending_map
.contains(join
->addr
))
718 pending_map
.remove(pending_map
.get_name(join
->addr
));
719 pending_map
.add(join
->name
, join
->addr
);
720 pending_map
.last_changed
= ceph_clock_now();
// Decides whether the service should propose and reports the proposal delay
// through `delay`.
// NOTE(review): the body is not visible here. Comments in prepare_command()
// state that this function always uses a delay of 0.0 seconds so proposals
// are triggered immediately -- confirm against the full file.
724 bool MonmapMonitor::should_propose(double& delay
)
// Reports monitors that are out of quorum.
//
// summary: receives one HEALTH_WARN entry of the form
//          "<N> mons down, quorum <ranks> <names>", where N is the monmap
//          size minus the quorum size.
// detail:  receives one HEALTH_WARN entry per rank in [0, monmap size) that
//          is absent from the quorum.
// cct:     not used in the visible lines.
// NOTE(review): the conditions guarding the summary entry and the use of
// `detail` are not visible here; presumably "quorum smaller than monmap"
// and a null check respectively -- confirm.
730 void MonmapMonitor::get_health(list
<pair
<health_status_t
, string
> >& summary
,
731 list
<pair
<health_status_t
, string
> > *detail
,
732 CephContext
*cct
) const
734 int max
= mon
->monmap
->size();
735 int actual
= mon
->get_quorum().size();
738 ss
<< (max
-actual
) << " mons down, quorum " << mon
->get_quorum() << " " << mon
->get_quorum_names();
739 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
// Copy of the quorum rank set, used for the per-rank membership test.
741 set
<int> q
= mon
->get_quorum();
742 for (int i
=0; i
<max
; i
++) {
743 if (q
.count(i
) == 0) {
745 ss
<< "mon." << mon
->monmap
->get_name(i
) << " (rank " << i
746 << ") addr " << mon
->monmap
->get_addr(i
)
747 << " is down (out of quorum)";
748 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
// Loads the most recently committed monmap blob into `bl`.
//
// Checks that the store has an entry for the last committed version under
// this service's name, then reads it with get_version(); a failure is
// logged together with its error string.
// NOTE(review): the return statements and the error condition are not
// visible here -- confirm the exact return values against the full file.
755 int MonmapMonitor::get_monmap(bufferlist
&bl
)
757 version_t latest_ver
= get_last_committed();
758 dout(10) << __func__
<< " ver " << latest_ver
<< dendl
;
760 if (!mon
->store
->exists(get_service_name(), stringify(latest_ver
)))
763 int err
= get_version(latest_ver
, bl
);
765 dout(1) << __func__
<< " error obtaining monmap: "
766 << cpp_strerror(err
) << dendl
;
// Walks every subscription registered under the "monmap" type in the
// monitor's session map.
// NOTE(review): the loop body and the statement taken when no such
// subscription exists are not visible here; presumably each subscription is
// passed to check_sub() -- confirm.
772 void MonmapMonitor::check_subs()
774 const string type
= "monmap";
775 mon
->with_session_map([this, &type
](const MonSessionMap
& session_map
) {
776 auto subs
= session_map
.subs
.find(type
);
777 if (subs
== session_map
.subs
.end())
779 for (auto sub
: *subs
->second
) {
// Services one "monmap" subscription.
//
// When the subscriber's next wanted epoch is not beyond the current monmap
// epoch, the latest monmap is sent on the subscriber's connection. After
// that the subscription is either removed from the session map or has its
// `next` advanced to epoch + 1.
// NOTE(review): the condition choosing between removal and advancing is not
// visible here; presumably a one-shot flag on the subscription -- confirm.
785 void MonmapMonitor::check_sub(Subscription
*sub
)
787 const auto epoch
= mon
->monmap
->get_epoch();
789 << " monmap next " << sub
->next
790 << " have " << epoch
<< dendl
;
791 if (sub
->next
<= epoch
) {
792 mon
->send_latest_monmap(sub
->session
->con
.get());
794 mon
->with_session_map([this, sub
](MonSessionMap
& session_map
) {
795 session_map
.remove_sub(sub
);
798 sub
->next
= epoch
+ 1;