1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2009 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "MonmapMonitor.h"
17 #include "messages/MMonCommand.h"
18 #include "messages/MMonJoin.h"
20 #include "common/ceph_argparse.h"
21 #include "common/errno.h"
23 #include "common/config.h"
24 #include "common/cmdparse.h"
26 #include "include/assert.h"
27 #include "include/stringify.h"
29 #define dout_subsys ceph_subsys_mon
31 #define dout_prefix _prefix(_dout, mon)
// Builds the per-line log prefix used by dout() in this file (hooked up through
// the dout_prefix define above). Streams
//   "mon.<name>@<rank>(<state name>).monmap v<monmap epoch> "
// into *_dout and returns that same stream.
// NOTE(review): the closing brace of this function is not visible in this view.
32 static ostream
& _prefix(std::ostream
*_dout
, Monitor
*mon
) {
33 return *_dout
<< "mon." << mon
->name
<< "@" << mon
->rank
34 << "(" << mon
->get_state_name()
35 << ").monmap v" << mon
->monmap
->epoch
<< " ";
// Seeds pending_map from the monitor's current monmap and forces its epoch to 1.
// When g_conf->mon_debug_no_initial_persistent_features is set, a message is
// sent to derr; the visible alternative assigns the default persistent feature
// set (ceph::features::mon::get_persistent()) to pending_map.
// NOTE(review): the braces and the `else` separating the two paths are not
// visible in this view -- presumably the assignment is the else branch; confirm.
38 void MonmapMonitor::create_initial()
40 dout(10) << "create_initial using current monmap" << dendl
;
41 pending_map
= *mon
->monmap
;
42 pending_map
.epoch
= 1;
44 if (g_conf
->mon_debug_no_initial_persistent_features
) {
45 derr
<< __func__
<< " mon_debug_no_initial_persistent_features=true"
48 // initialize with default persistent features for new clusters
49 pending_map
.persistent_features
= ceph::features::mon::get_persistent();
// Brings mon->monmap up to the last committed version.
//
// need_bootstrap: optional out-flag; when non-null and the committed version
//                 differs from the in-memory monmap epoch, it is set to true.
//
// Visible steps: read get_last_committed(), compare it with the in-memory
// epoch, fetch that version into monmap_bl with get_version(), assert the
// buffer is non-empty, decode it into mon->monmap, and finally erase the
// ("mkfs", "monmap") key from the store if it exists.
// NOTE(review): the statement guarded by `version <= epoch` is not visible in
// this view (presumably an early return); confirm against the full file.
53 void MonmapMonitor::update_from_paxos(bool *need_bootstrap
)
55 version_t version
= get_last_committed();
56 if (version
<= mon
->monmap
->get_epoch())
59 dout(10) << __func__
<< " version " << version
60 << ", my v " << mon
->monmap
->epoch
<< dendl
;
62 if (need_bootstrap
&& version
!= mon
->monmap
->get_epoch()) {
63 dout(10) << " signaling that we need a bootstrap" << dendl
;
64 *need_bootstrap
= true;
// NOTE(review): no check of `ret` is visible in this view; only the buffer
// length is asserted below.
69 int ret
= get_version(version
, monmap_bl
);
71 assert(monmap_bl
.length());
73 dout(10) << "update_from_paxos got " << version
<< dendl
;
74 mon
->monmap
->decode(monmap_bl
);
// Drop the "mkfs"/"monmap" store entry once a committed map has been decoded.
76 if (mon
->store
->exists("mkfs", "monmap")) {
77 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
78 t
->erase("mkfs", "monmap");
79 mon
->store
->apply_transaction(t
);
// Starts a new pending map as a copy of the current mon->monmap, stamps
// last_changed with ceph_clock_now(), and logs the pending epoch.
// NOTE(review): no epoch increment is visible in this view, yet encode_pending()
// asserts pending epoch == committed epoch + 1 -- presumably the increment is
// on a line missing here; confirm.
85 void MonmapMonitor::create_pending()
87 pending_map
= *mon
->monmap
;
89 pending_map
.last_changed
= ceph_clock_now();
90 dout(10) << "create_pending monmap epoch " << pending_map
.epoch
<< dendl
;
// Serializes pending_map into transaction `t`.
//
// t: transaction that receives the encoded map (put_version) and the new
//    last-committed marker (put_last_committed), both keyed by
//    pending_map.epoch.
//
// The map is encoded with mon->get_quorum_con_features(). For epoch 1 the
// monitor is also asked to prepare a new fingerprint in the same transaction.
// NOTE(review): the declaration of `bl` is not visible in this view.
93 void MonmapMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
95 dout(10) << "encode_pending epoch " << pending_map
.epoch
<< dendl
;
// The pending epoch must directly follow the committed one, except for the
// very first map (epoch 1).
97 assert(mon
->monmap
->epoch
+ 1 == pending_map
.epoch
||
98 pending_map
.epoch
== 1); // special case mkfs!
100 pending_map
.encode(bl
, mon
->get_quorum_con_features());
102 put_version(t
, pending_map
.epoch
, bl
);
103 put_last_committed(t
, pending_map
.epoch
);
105 // generate a cluster fingerprint, too?
106 if (pending_map
.epoch
== 1) {
107 mon
->prepare_new_fingerprint(t
);
// Completion context that retries MonmapMonitor::apply_mon_features() with a
// saved feature set. apply_mon_features() queues one of these through
// wait_for_writeable_ctx() when the service is not yet writeable.
//
// finish(r), as far as visible here:
//   - one branch calls svc->apply_mon_features(features);
//   - r == -EAGAIN or r == -ECANCELED: the features are discarded;
//   - anything else trips the "bad C_ApplyFeatures return value" assert.
// NOTE(review): the condition guarding the first branch and the declaration of
// `svc` are not visible in this view.
111 class C_ApplyFeatures
: public Context
{
113 mon_feature_t features
;
115 C_ApplyFeatures(MonmapMonitor
*s
, const mon_feature_t
& f
) :
116 svc(s
), features(f
) { }
117 void finish(int r
) override
{
119 svc
->apply_mon_features(features
);
120 } else if (r
== -EAGAIN
|| r
== -ECANCELED
) {
121 // discard features if we're no longer on the quorum that
122 // established them in the first place.
125 assert(0 == "bad C_ApplyFeatures return value");
// Folds the given monitor feature set into pending_map.persistent_features.
//
// features: feature set to apply; on_active() passes the quorum's features.
//
// Visible flow:
//   1. If the service is not writeable, queue a C_ApplyFeatures to retry later.
//   2. Assert that `features` contains everything already persistent in
//      pending_map, and that this build supports all of `features`.
//   3. new_features = pending persistent XOR (features AND persistent-capable).
//      An empty result only logs that the features already match.
//   4. If the quorum is smaller than the monmap, log that the new features
//      require a full quorum and are not enabled.
//   5. Otherwise OR in the existing persistent features and store the result
//      in pending_map.persistent_features.
// NOTE(review): the early returns and any propose call are not visible in this
// view; the step boundaries above are inferred from the log messages.
130 void MonmapMonitor::apply_mon_features(const mon_feature_t
& features
)
132 if (!is_writeable()) {
133 dout(5) << __func__
<< " wait for service to be writeable" << dendl
;
134 wait_for_writeable_ctx(new C_ApplyFeatures(this, features
));
138 assert(is_writeable());
139 assert(features
.contains_all(pending_map
.persistent_features
));
140 // we should never hit this because `features` should be the result
141 // of the quorum's supported features. But if it happens, die.
142 assert(ceph::features::mon::get_supported().contains_all(features
));
// Bits that differ between what is already persistent and what `features`
// offers among the persistent-capable features.
144 mon_feature_t new_features
=
145 (pending_map
.persistent_features
^
146 (features
& ceph::features::mon::get_persistent()));
148 if (new_features
.empty()) {
149 dout(10) << __func__
<< " features match current pending: "
150 << features
<< dendl
;
154 if (mon
->get_quorum().size() < mon
->monmap
->size()) {
155 dout(1) << __func__
<< " new features " << new_features
156 << " contains features that require a full quorum"
157 << " (quorum size is " << mon
->get_quorum().size()
158 << ", requires " << mon
->monmap
->size() << "): "
160 << " -- do not enable them!" << dendl
;
// Keep everything that was already persistent in addition to the new bits.
164 new_features
|= pending_map
.persistent_features
;
166 dout(5) << __func__
<< " applying new features to monmap;"
167 << " had " << pending_map
.persistent_features
168 << ", will have " << new_features
<< dendl
;
169 pending_map
.persistent_features
= new_features
;
// Hook run when the service becomes active.
//
// - If at least one map has been committed and mon->has_ever_joined is still
//   false, writes "joined" = 1 under Monitor::MONITOR_NAME in the store and
//   sets the flag.
// - On the leader, writes the current monmap to the cluster log.
// - Calls apply_mon_features() with the quorum's monitor features.
// NOTE(review): as visible here the is_leader() check is brace-less and so
// covers only the clog line; confirm against the full file.
173 void MonmapMonitor::on_active()
175 if (get_last_committed() >= 1 && !mon
->has_ever_joined
) {
176 // make note of the fact that i was, once, part of the quorum.
177 dout(10) << "noting that i was, once, part of an active quorum." << dendl
;
179 /* This is some form of nasty in-breeding we have between the MonmapMonitor
180 and the Monitor itself. We should find a way to get rid of it given our
181 new architecture. Until then, stick with it since we are a
182 single-threaded process and, truth be told, no one else relies on this
185 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
186 t
->put(Monitor::MONITOR_NAME
, "joined", 1);
187 mon
->store
->apply_transaction(t
);
188 mon
->has_ever_joined
= true;
191 if (mon
->is_leader())
192 mon
->clog
->info() << "monmap " << *mon
->monmap
;
194 apply_mon_features(mon
->get_quorum_mon_features());
// Read-only dispatch for an incoming request, switching on the message type:
// MSG_MON_COMMAND goes to preprocess_command(); a second case goes to
// preprocess_join().
// NOTE(review): the case label for the join path and the default branch are
// not visible in this view.
197 bool MonmapMonitor::preprocess_query(MonOpRequestRef op
)
199 PaxosServiceMessage
*m
= static_cast<PaxosServiceMessage
*>(op
->get_req());
200 switch (m
->get_type()) {
202 case MSG_MON_COMMAND
:
203 return preprocess_command(op
);
205 return preprocess_join(op
);
// Dumps monmap service state into formatter `f`:
//   - "monmap_first_committed" / "monmap_last_committed" as unsigned values,
//   - a "monmap" object section filled by mon->monmap->dump(f),
//   - a "quorum" array section with one "mon" integer per quorum member.
// NOTE(review): the matching close_section() calls are not visible in this view.
212 void MonmapMonitor::dump_info(Formatter
*f
)
214 f
->dump_unsigned("monmap_first_committed", get_first_committed());
215 f
->dump_unsigned("monmap_last_committed", get_last_committed());
216 f
->open_object_section("monmap");
217 mon
->monmap
->dump(f
);
219 f
->open_array_section("quorum");
220 for (set
<int>::iterator q
= mon
->get_quorum().begin(); q
!= mon
->get_quorum().end(); ++q
)
221 f
->dump_int("mon", *q
);
// Handles the monmap commands visible in this function: "mon stat",
// "mon getmap" / "mon dump", and "mon feature list".
//
// Flow visible here:
//   - parse m->cmd into cmdmap; on failure reply -EINVAL with the parser text;
//   - an "access denied" (-EACCES) reply tied to the session;
//   - read "format" (default "plain") and create a Formatter for it;
//   - dispatch on the "prefix" value;
//   - send the final reply with r, rs and rdata.
// NOTE(review): the declarations of ss, rdata, r, rs, prefix, format, epoch, bl
// and ds, the capability check itself, and all return statements are not
// visible in this view.
225 bool MonmapMonitor::preprocess_command(MonOpRequestRef op
)
227 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
232 map
<string
, cmd_vartype
> cmdmap
;
233 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
234 string rs
= ss
.str();
235 mon
->reply_command(op
, -EINVAL
, rs
, rdata
, get_last_committed());
240 cmd_getval(g_ceph_context
, cmdmap
, "prefix", prefix
);
242 MonSession
*session
= m
->get_session();
244 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
249 cmd_getval(g_ceph_context
, cmdmap
, "format", format
, string("plain"));
250 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
// "mon stat": one-line summary plus election epoch and quorum membership.
252 if (prefix
== "mon stat") {
253 mon
->monmap
->print_summary(ss
);
254 ss
<< ", election epoch " << mon
->get_epoch() << ", quorum " << mon
->get_quorum()
255 << " " << mon
->get_quorum_names();
// "mon getmap" / "mon dump": takes an optional "epoch" argument (default 0)
// and starts from the in-memory map `p`.
260 } else if (prefix
== "mon getmap" ||
261 prefix
== "mon dump") {
265 cmd_getval(g_ceph_context
, cmdmap
, "epoch", epochnum
, (int64_t)0);
268 MonMap
*p
= mon
->monmap
;
271 r
= get_version(epoch
, bl
);
273 ss
<< "there is no map for epoch " << epoch
;
277 assert(bl
.length() > 0);
// getmap returns the encoded map in rdata, using the requesting connection's
// feature bits for the encoding.
284 if (prefix
== "mon getmap") {
285 p
->encode(rdata
, m
->get_connection()->get_features());
287 ss
<< "got monmap epoch " << p
->get_epoch();
288 } else if (prefix
== "mon dump") {
291 f
->open_object_section("monmap");
293 f
->open_array_section("quorum");
294 for (set
<int>::iterator q
= mon
->get_quorum().begin();
295 q
!= mon
->get_quorum().end(); ++q
) {
296 f
->dump_int("mon", *q
);
307 ss
<< "dumped monmap epoch " << p
->get_epoch();
// NOTE(review): the statement guarded by this check is not visible --
// presumably it releases a map loaded for a non-current epoch; confirm.
309 if (p
!= mon
->monmap
)
// "mon feature list": reports supported/persistent features of this build and
// the persistent/optional/required features of the current monmap.
312 } else if (prefix
== "mon feature list") {
314 bool list_with_value
= false;
316 if (cmd_getval(g_ceph_context
, cmdmap
, "with_value", with_value
) &&
317 with_value
== "--with-value") {
318 list_with_value
= true;
321 MonMap
*p
= mon
->monmap
;
324 mon_feature_t supported
= ceph::features::mon::get_supported();
325 mon_feature_t persistent
= ceph::features::mon::get_persistent();
326 mon_feature_t required
= p
->get_required_features();
// Helper lambda: emits one feature set either through the formatter (dump*)
// or as text into ds (print*), each in a with-value and a plain variant.
// NOTE(review): the conditions selecting among the four calls are not visible.
329 auto print_feature
= [&](mon_feature_t
& m_features
, const char* m_str
) {
332 m_features
.dump_with_value(f
.get(), m_str
);
334 m_features
.dump(f
.get(), m_str
);
337 m_features
.print_with_value(ds
);
339 m_features
.print(ds
);
344 f
->open_object_section("features");
346 f
->open_object_section("all");
347 print_feature(supported
, "supported");
348 print_feature(persistent
, "persistent");
349 f
->close_section(); // all
351 f
->open_object_section("monmap");
352 print_feature(p
->persistent_features
, "persistent");
353 print_feature(p
->optional_features
, "optional");
354 print_feature(required
, "required");
355 f
->close_section(); // monmap
357 f
->close_section(); // features
361 ds
<< "all features" << std::endl
363 print_feature(supported
, nullptr);
366 print_feature(persistent
, nullptr);
370 ds
<< "on current monmap (epoch "
371 << p
->get_epoch() << ")" << std::endl
373 print_feature(p
->persistent_features
, nullptr);
375 // omit optional features in plain-text
376 // makes it easier to read, and they're, currently, empty.
378 print_feature(required
, nullptr);
390 mon
->reply_command(op
, r
, rs
, rdata
, get_last_committed());
// Write-path dispatch for an incoming request. Logs the message and its
// original source, then switches on the message type: MSG_MON_COMMAND goes to
// prepare_command(); a second case goes to prepare_join().
// NOTE(review): the case label for the join path and the default branch are
// not visible in this view.
397 bool MonmapMonitor::prepare_update(MonOpRequestRef op
)
399 PaxosServiceMessage
*m
= static_cast<PaxosServiceMessage
*>(op
->get_req());
400 dout(7) << "prepare_update " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
402 switch (m
->get_type()) {
403 case MSG_MON_COMMAND
:
404 return prepare_command(op
);
406 return prepare_join(op
);
// Handles the monmap-changing commands visible in this function: "mon add",
// "mon remove" / "mon rm", and "mon feature set". Any other prefix yields an
// "unknown command" message.
//
// Flow visible here:
//   - parse m->cmd into cmdmap; on failure reply -EINVAL;
//   - an "access denied" (-EACCES) reply tied to the session;
//   - validate against the committed map (`monmap`) and apply changes to
//     `pending_map`, as the long comment below prescribes;
//   - send a final reply with err and rs, on a path commented as not proposing.
// NOTE(review): the declarations of ss, rs, err, prefix, name, addrstr, addr,
// feature_name and sure, the places where `propose` is set, and all return
// statements are not visible in this view.
414 bool MonmapMonitor::prepare_command(MonOpRequestRef op
)
416 MMonCommand
*m
= static_cast<MMonCommand
*>(op
->get_req());
421 map
<string
, cmd_vartype
> cmdmap
;
422 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
423 string rs
= ss
.str();
424 mon
->reply_command(op
, -EINVAL
, rs
, get_last_committed());
429 cmd_getval(g_ceph_context
, cmdmap
, "prefix", prefix
);
431 MonSession
*session
= m
->get_session();
433 mon
->reply_command(op
, -EACCES
, "access denied", get_last_committed());
437 /* We should follow the following rules:
439 * - 'monmap' is the current, consistent version of the monmap
440 * - 'pending_map' is the uncommitted version of the monmap
442 * All checks for the current state must be made against 'monmap'.
443 * All changes are made against 'pending_map'.
445 * If there are concurrent operations modifying 'pending_map', please
446 * follow the following rules.
448 * - if pending_map has already been changed, the second operation must
449 * wait for the proposal to finish and be run again; This is the easiest
450 * path to guarantee correctness but may impact performance (i.e., it
451 * will take longer for the user to get a reply).
453 * - if the result of the second operation can be guaranteed to be
454 * idempotent, the operation may reply to the user once the proposal
455 * finishes; still needs to wait for the proposal to finish.
457 * - An operation _NEVER_ returns to the user based on pending state.
459 * If an operation does not modify current stable monmap, it may be
460 * serialized before current pending map, regardless of any change that
461 * has been made to the pending map -- remember, pending is uncommitted
462 * state, thus we are not bound by it.
466 MonMap
&monmap
= *mon
->monmap
;
471 * Adding or removing monitors may lead to loss of quorum.
473 * Because quorum may be lost, it's important to reply something
474 * to the user, lest she end up waiting forever for a reply. And
475 * no reply will ever be sent until quorum is formed again.
477 * On the other hand, this means we're leaking uncommitted state
478 * to the user. As such, please be mindful of the reply message.
480 * e.g., 'adding monitor mon.foo' is okay ('adding' is an on-going
481 * operation and conveys its not-yet-permanent nature); whereas
482 * 'added monitor mon.foo' presumes the action has successfully
483 * completed and state has been committed, which may not be true.
487 bool propose
= false;
// "mon add": reads "name" and "addr", parses the address, and defaults the
// port to CEPH_MON_PORT when the parsed port is 0.
488 if (prefix
== "mon add") {
490 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
492 cmd_getval(g_ceph_context
, cmdmap
, "addr", addrstr
);
496 if (!addr
.parse(addrstr
.c_str())) {
// NOTE(review): "does not parse" has no leading space, so the message reads
// "addr <addrstr>does not parse".
498 ss
<< "addr " << addrstr
<< "does not parse";
502 if (addr
.get_port() == 0) {
503 ss
<< "port defaulted to " << CEPH_MON_PORT
;
504 addr
.set_port(CEPH_MON_PORT
);
508 * If we have a monitor with the same name and different addr, then EEXIST
509 * If we have a monitor with the same addr and different name, then EEXIST
510 * If we have a monitor with the same addr and same name, then wait for
511 * the proposal to finish and return success.
512 * If we don't have the monitor, add it.
516 if (!ss
.str().empty())
520 if (monmap
.contains(name
)) {
521 if (monmap
.get_addr(name
) == addr
) {
522 // stable map contains monitor with the same name at the same address.
523 // serialize before current pending map.
524 err
= 0; // for clarity; this has already been set above.
525 ss
<< "mon." << name
<< " at " << addr
<< " already exists";
529 << " already exists at address " << monmap
.get_addr(name
);
531 } else if (monmap
.contains(addr
)) {
532 // we established on the previous branch that name is different
533 ss
<< "mon." << monmap
.get_name(addr
)
534 << " already exists at address " << addr
;
543 /* Given there's no delay between proposals on the MonmapMonitor (see
544 * MonmapMonitor::should_propose()), there is no point in checking for
545 * a mismatch between name and addr on pending_map.
547 * Once we established the monitor does not exist in the committed state,
548 * we can simply go ahead and add the monitor.
551 pending_map
.add(name
, addr
);
552 pending_map
.last_changed
= ceph_clock_now();
553 ss
<< "adding mon." << name
<< " at " << addr
;
555 dout(0) << __func__
<< " proposing new mon." << name
<< dendl
;
// "mon remove" / "mon rm": rejects unknown names and removal of the last
// monitor; both checks run against the committed map.
557 } else if (prefix
== "mon remove" ||
558 prefix
== "mon rm") {
560 cmd_getval(g_ceph_context
, cmdmap
, "name", name
);
561 if (!monmap
.contains(name
)) {
563 ss
<< "mon." << name
<< " does not exist or has already been removed";
567 if (monmap
.size() == 1) {
569 ss
<< "error: refusing removal of last monitor " << name
;
573 /* At the time of writing, there is no risk of races when multiple clients
574 * attempt to use the same name. The reason is simple but may not be
577 * In a nutshell, we do not collate proposals on the MonmapMonitor. As
578 * soon as we return 'true' below, PaxosService::dispatch() will check if
579 * the service should propose, and - if so - the service will be marked as
580 * 'proposing' and a proposal will be triggered. The PaxosService class
581 * guarantees that once a service is marked 'proposing' no further writes
584 * The decision on whether the service should propose or not is, in this
585 * case, made by MonmapMonitor::should_propose(), which always considers
586 * the proposal delay being 0.0 seconds. This is key for PaxosService to
587 * trigger the proposal immediately.
588 * 0.0 seconds of delay.
590 * From the above, there's no point in performing further checks on the
591 * pending_map, as we don't ever have multiple proposals in-flight in
592 * this service. As we've established the committed state contains the
593 * monitor, we can simply go ahead and remove it.
595 * Please note that the code hinges on all of the above to be true. It
596 * has been true since time immemorial and we don't see a good reason
597 * to make it sturdier at this time - mainly because we don't think it's
598 * going to change any time soon, lest for any bug that may be unwillingly
// The address is read from pending_map before the removal so that it can
// still be reported in the reply text.
602 entity_addr_t addr
= pending_map
.get_addr(name
);
603 pending_map
.remove(name
);
604 pending_map
.last_changed
= ceph_clock_now();
605 ss
<< "removing mon." << name
<< " at " << addr
606 << ", there will be " << pending_map
.size() << " monitors" ;
// "mon feature set": needs a known feature name, an explicit
// "--yes-i-really-mean-it", and a quorum that supports the feature.
610 } else if (prefix
== "mon feature set") {
614 * We currently only support setting/unsetting persistent features.
615 * This is by design, given at the moment we still don't have optional
616 * features, and, as such, there is no point introducing an interface
617 * to manipulate them. This allows us to provide a cleaner, more
618 * intuitive interface to the user, modifying solely persistent
621 * In the future we should consider adding another interface to handle
622 * optional features/flags; e.g., 'mon feature flag set/unset', or
623 * 'mon flag set/unset'.
626 if (!cmd_getval(g_ceph_context
, cmdmap
, "feature_name", feature_name
)) {
627 ss
<< "missing required feature name";
632 mon_feature_t feature
;
633 feature
= ceph::features::mon::get_feature_by_name(feature_name
);
634 if (feature
== ceph::features::mon::FEATURE_NONE
) {
635 ss
<< "unknown feature '" << feature_name
<< "'";
641 if (!cmd_getval(g_ceph_context
, cmdmap
, "sure", sure
) ||
642 sure
!= "--yes-i-really-mean-it") {
643 ss
<< "please specify '--yes-i-really-mean-it' if you "
644 << "really, **really** want to set feature '"
645 << feature
<< "' in the monmap.";
650 if (!mon
->get_quorum_mon_features().contains_all(feature
)) {
651 ss
<< "current quorum does not support feature '" << feature
652 << "'; supported features: "
653 << mon
->get_quorum_mon_features();
658 ss
<< "setting feature '" << feature
<< "'";
// Checked against the committed map: a feature that is already set is only
// logged as a no-op.
661 if (monmap
.persistent_features
.contains_all(feature
)) {
662 dout(10) << __func__
<< " feature '" << feature
663 << "' already set on monmap; no-op." << dendl
;
667 pending_map
.persistent_features
.set_feature(feature
);
668 pending_map
.last_changed
= ceph_clock_now();
// NOTE(review): __func__ is streamed directly before ss.str() with no
// separator, so the log line starts "prepare_commandsetting feature ...".
671 dout(1) << __func__
<< ss
.str() << "; new features will be: "
672 << "persistent = " << pending_map
.persistent_features
673 // output optional nevertheless, for auditing purposes.
674 << ", optional = " << pending_map
.optional_features
<< dendl
;
677 ss
<< "unknown command " << prefix
;
683 mon
->reply_command(op
, err
, rs
, get_last_committed());
684 // we are returning to the user; do not propose.
// Read-only pre-check of an MMonJoin request.
//
// Visible checks:
//   - the session must hold MON_CAP_W | MON_CAP_X on "mon", otherwise
//     " insufficient caps" is logged;
//   - pending_map already holds join->name with a non-blank address;
//   - pending_map already holds join->addr under the same name.
// NOTE(review): the first half of the capability condition and the value
// returned on each path are not visible in this view.
688 bool MonmapMonitor::preprocess_join(MonOpRequestRef op
)
690 MMonJoin
*join
= static_cast<MMonJoin
*>(op
->get_req());
691 dout(10) << "preprocess_join " << join
->name
<< " at " << join
->addr
<< dendl
;
693 MonSession
*session
= join
->get_session();
695 !session
->is_capable("mon", MON_CAP_W
| MON_CAP_X
)) {
696 dout(10) << " insufficient caps" << dendl
;
700 if (pending_map
.contains(join
->name
) && !pending_map
.get_addr(join
->name
).is_blank_ip()) {
701 dout(10) << " already have " << join
->name
<< dendl
;
704 if (pending_map
.contains(join
->addr
) && pending_map
.get_name(join
->addr
) == join
->name
) {
705 dout(10) << " already have " << join
->addr
<< dendl
;
// Applies an MMonJoin to pending_map: removes any existing entry with the same
// name, removes any entry that currently owns the same address, then adds
// (name, addr) and refreshes last_changed.
// NOTE(review): the return statement is not visible in this view.
710 bool MonmapMonitor::prepare_join(MonOpRequestRef op
)
712 MMonJoin
*join
= static_cast<MMonJoin
*>(op
->get_req());
713 dout(0) << "adding/updating " << join
->name
<< " at " << join
->addr
<< " to monitor cluster" << dendl
;
714 if (pending_map
.contains(join
->name
))
715 pending_map
.remove(join
->name
);
716 if (pending_map
.contains(join
->addr
))
717 pending_map
.remove(pending_map
.get_name(join
->addr
));
718 pending_map
.add(join
->name
, join
->addr
);
719 pending_map
.last_changed
= ceph_clock_now();
// Decides whether the service should propose; `delay` is an in/out reference.
// NOTE(review): the body is not visible in this view. Comments in
// prepare_command() state that it always treats the proposal delay as 0.0
// seconds -- confirm against the full file.
723 bool MonmapMonitor::should_propose(double& delay
)
// Reports monitors that are out of quorum.
//
// summary: receives one HEALTH_WARN entry "<n> mons down, quorum ...".
// detail:  receives one HEALTH_WARN entry per rank in [0, monmap size) that is
//          absent from the quorum set.
// cct:     not used in the lines visible here.
//
// NOTE(review): the condition guarding the summary entry (presumably
// actual < max) and the declarations of `ss` are not visible in this view.
729 void MonmapMonitor::get_health(list
<pair
<health_status_t
, string
> >& summary
,
730 list
<pair
<health_status_t
, string
> > *detail
,
731 CephContext
*cct
) const
733 int max
= mon
->monmap
->size();
734 int actual
= mon
->get_quorum().size();
737 ss
<< (max
-actual
) << " mons down, quorum " << mon
->get_quorum() << " " << mon
->get_quorum_names();
738 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
// Local copy of the quorum set, used for the per-rank membership test below.
740 set
<int> q
= mon
->get_quorum();
741 for (int i
=0; i
<max
; i
++) {
742 if (q
.count(i
) == 0) {
744 ss
<< "mon." << mon
->monmap
->get_name(i
) << " (rank " << i
745 << ") addr " << mon
->monmap
->get_addr(i
)
746 << " is down (out of quorum)";
// NOTE(review): `detail` is a pointer and is dereferenced here; no null check
// is visible in this view -- confirm that one exists.
747 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
// Loads the latest committed monmap blob into `bl`.
//
// Checks that the store holds a key for (get_service_name(),
// stringify(latest_ver)), then reads it with get_version(). A failure is logged
// with cpp_strerror(err).
// NOTE(review): the return values on each path and the condition guarding the
// error log are not visible in this view.
754 int MonmapMonitor::get_monmap(bufferlist
&bl
)
756 version_t latest_ver
= get_last_committed();
757 dout(10) << __func__
<< " ver " << latest_ver
<< dendl
;
759 if (!mon
->store
->exists(get_service_name(), stringify(latest_ver
)))
762 int err
= get_version(latest_ver
, bl
);
764 dout(1) << __func__
<< " error obtaining monmap: "
765 << cpp_strerror(err
) << dendl
;
// Walks every subscription of type "monmap" in the session map, under
// mon->with_session_map().
// NOTE(review): the loop body is not visible in this view -- presumably it
// calls check_sub(sub). If so, note that check_sub() can call
// session_map.remove_sub(sub) while this range-for is iterating the same list;
// confirm the container tolerates that.
771 void MonmapMonitor::check_subs()
773 const string type
= "monmap";
774 mon
->with_session_map([this, &type
](const MonSessionMap
& session_map
) {
775 auto subs
= session_map
.subs
.find(type
);
776 if (subs
== session_map
.subs
.end())
778 for (auto sub
: *subs
->second
) {
// Services one monmap subscription. When the epoch the subscriber wants next
// (sub->next) is not ahead of the current monmap epoch, the latest monmap is
// sent over the subscriber's connection. After that, two outcomes are visible:
// the subscription is removed from the session map, or sub->next is advanced
// to epoch + 1.
// NOTE(review): the condition choosing between removal and advancing is not
// visible in this view (presumably a one-time subscription flag); confirm.
784 void MonmapMonitor::check_sub(Subscription
*sub
)
786 const auto epoch
= mon
->monmap
->get_epoch();
788 << " monmap next " << sub
->next
789 << " have " << epoch
<< dendl
;
790 if (sub
->next
<= epoch
) {
791 mon
->send_latest_monmap(sub
->session
->con
.get());
793 mon
->with_session_map([this, sub
](MonSessionMap
& session_map
) {
794 session_map
.remove_sub(sub
);
797 sub
->next
= epoch
+ 1;