1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2009 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "MonmapMonitor.h"
17 #include "OSDMonitor.h"
18 #include "messages/MMonCommand.h"
19 #include "messages/MMonJoin.h"
21 #include "common/ceph_argparse.h"
22 #include "common/errno.h"
24 #include "common/config.h"
25 #include "common/cmdparse.h"
27 #include "include/ceph_assert.h"
28 #include "include/stringify.h"
30 #define dout_subsys ceph_subsys_mon
32 #define dout_prefix _prefix(_dout, mon)
33 using namespace TOPNSPC::common
;
42 using std::ostringstream
;
47 using std::stringstream
;
50 using std::unique_ptr
;
52 using ceph::bufferlist
;
55 using ceph::Formatter
;
56 using ceph::JSONFormatter
;
57 using ceph::make_message
;
58 using ceph::mono_clock
;
59 using ceph::mono_time
;
60 using ceph::timespan_str
;
// Log-line prefix helper for this file; it is what the dout_prefix macro
// defined above expands to. Streams
//   "mon.<name>@<rank>(<state name>).monmap v<monmap epoch> "
// into *_dout and returns that stream.
61 static ostream
& _prefix(std::ostream
*_dout
, Monitor
&mon
) {
62 return *_dout
<< "mon." << mon
.name
<< "@" << mon
.rank
63 << "(" << mon
.get_state_name()
64 << ").monmap v" << mon
.monmap
->epoch
<< " ";
// Builds the very first pending monmap: copies the monitor's current monmap
// into pending_map and forces its epoch to 1.
// The mon_debug_no_initial_persistent_features option is consulted; when it
// is set a derr message is emitted.
// NOTE(review): the lines between the derr statement and the feature
// assignments below are not present in this listing, so the exact branch
// structure (presumably an else-branch) should be confirmed.
67 void MonmapMonitor::create_initial()
69 dout(10) << __func__
<< " using current monmap" << dendl
;
70 pending_map
= *mon
.monmap
;
71 pending_map
.epoch
= 1;
73 if (g_conf()->mon_debug_no_initial_persistent_features
) {
74 derr
<< __func__
<< " mon_debug_no_initial_persistent_features=true"
77 // initialize with default persistent features for new clusters
78 pending_map
.persistent_features
= ceph::features::mon::get_persistent();
79 pending_map
.min_mon_release
= ceph_release();
// Refreshes mon.monmap from the last committed monmap version.
//
// need_bootstrap: optional out-flag (may be null); set to true when the
//                 committed version differs from the in-memory monmap epoch.
//
// Visible steps: compare get_last_committed() against the current monmap
// epoch, fetch the committed blob with get_version(), assert it was read and
// is non-empty, decode it into mon.monmap, drop the "mkfs"/"monmap" key if it
// still exists in the store, make sure the "min_mon_release" meta value
// matches ceph_release(), and finally notify the monitor of the new monmap.
// NOTE(review): several lines (including the action taken when the version
// is not newer, and the declarations of monmap_bl and val) are missing from
// this listing.
83 void MonmapMonitor::update_from_paxos(bool *need_bootstrap
)
85 version_t version
= get_last_committed();
86 if (version
<= mon
.monmap
->get_epoch())
89 dout(10) << __func__
<< " version " << version
90 << ", my v " << mon
.monmap
->epoch
<< dendl
;
// Only touch the out-flag when the caller supplied one.
92 if (need_bootstrap
&& version
!= mon
.monmap
->get_epoch()) {
93 dout(10) << " signaling that we need a bootstrap" << dendl
;
94 *need_bootstrap
= true;
// A committed version must always be readable and non-empty.
99 int ret
= get_version(version
, monmap_bl
);
100 ceph_assert(ret
== 0);
101 ceph_assert(monmap_bl
.length());
103 dout(10) << __func__
<< " got " << version
<< dendl
;
104 mon
.monmap
->decode(monmap_bl
);
// Remove the mkfs-time monmap key once a committed map has been decoded.
106 if (mon
.store
->exists("mkfs", "monmap")) {
107 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
108 t
->erase("mkfs", "monmap");
109 mon
.store
->apply_transaction(t
);
114 // make sure we've recorded min_mon_release
116 if (mon
.store
->read_meta("min_mon_release", &val
) < 0 ||
118 atoi(val
.c_str()) != (int)ceph_release()) {
119 dout(10) << __func__
<< " updating min_mon_release meta" << dendl
;
120 mon
.store
->write_meta("min_mon_release",
121 stringify(ceph_release()));
124 mon
.notify_new_monmap(true);
// Starts a new pending monmap from the current committed one: copies
// *mon.monmap into pending_map, stamps last_changed with the current time
// and logs the pending epoch.
// NOTE(review): original line 130 is not present in this listing; given the
// epoch+1 assertion in encode_pending() it presumably advances
// pending_map.epoch -- confirm against the full file.
127 void MonmapMonitor::create_pending()
129 pending_map
= *mon
.monmap
;
131 pending_map
.last_changed
= ceph_clock_now();
132 dout(10) << __func__
<< " monmap epoch " << pending_map
.epoch
<< dendl
;
// Serializes pending_map into transaction t.
//
// t: store transaction that receives the encoded map, the new
//    last-committed marker, (for epoch 1) a new cluster fingerprint, and the
//    encoded health checks.
//
// The pending epoch must be exactly one past the current monmap epoch, or 1
// for the initial (mkfs) map; anything else trips the assertion.
// NOTE(review): the declaration of `bl` is not visible in this listing.
135 void MonmapMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
137 dout(10) << __func__
<< " epoch " << pending_map
.epoch
<< dendl
;
139 ceph_assert(mon
.monmap
->epoch
+ 1 == pending_map
.epoch
||
140 pending_map
.epoch
== 1); // special case mkfs!
// Encode with the quorum's connection features.
142 pending_map
.encode(bl
, mon
.get_quorum_con_features());
144 put_version(t
, pending_map
.epoch
, bl
);
145 put_last_committed(t
, pending_map
.epoch
);
147 // generate a cluster fingerprint, too?
148 if (pending_map
.epoch
== 1) {
149 mon
.prepare_new_fingerprint(t
);
// Health checks derived from the pending map go into the same transaction.
153 health_check_map_t next
;
154 pending_map
.check_health(&next
);
155 encode_health(next
, t
);
// Completion context that retries MonmapMonitor::apply_mon_features() with
// the feature set and min_mon_release captured at construction time.
// apply_mon_features() queues one of these via wait_for_writeable_ctx() when
// the service is not yet writeable.
//
// finish(r): one branch re-applies the saved features; -EAGAIN / -ECANCELED
// means the features are discarded; any other value reaches ceph_abort_msg().
// NOTE(review): the `svc` member declaration and the condition guarding the
// first branch are not present in this listing.
158 class C_ApplyFeatures
: public Context
{
160 mon_feature_t features
;
161 ceph_release_t min_mon_release
;
163 C_ApplyFeatures(MonmapMonitor
*s
, const mon_feature_t
& f
, ceph_release_t mmr
) :
164 svc(s
), features(f
), min_mon_release(mmr
) { }
165 void finish(int r
) override
{
167 svc
->apply_mon_features(features
, min_mon_release
);
168 } else if (r
== -EAGAIN
|| r
== -ECANCELED
) {
169 // discard features if we're no longer on the quorum that
170 // established them in the first place.
173 ceph_abort_msg("bad C_ApplyFeatures return value");
// Folds the quorum's monitor features and minimum release into pending_map.
//
// features:        feature set to apply; asserted to contain all of
//                  pending_map.persistent_features and to be contained in
//                  ceph::features::mon::get_supported().
// min_mon_release: release to raise pending_map.min_mon_release to when it
//                  is greater than the pending value.
//
// When the service is not writeable the call is deferred through a
// C_ApplyFeatures context. The quorum size is compared against the monmap
// size before any change is considered.
// NOTE(review): the early returns and the final proposal step are not
// present in this listing.
178 void MonmapMonitor::apply_mon_features(const mon_feature_t
& features
,
179 ceph_release_t min_mon_release
)
181 if (!is_writeable()) {
182 dout(5) << __func__
<< " wait for service to be writeable" << dendl
;
183 wait_for_writeable_ctx(new C_ApplyFeatures(this, features
, min_mon_release
));
187 // do nothing here unless we have a full quorum
188 if (mon
.get_quorum().size() < mon
.monmap
->size()) {
192 ceph_assert(is_writeable());
193 ceph_assert(features
.contains_all(pending_map
.persistent_features
));
194 // we should never hit this because `features` should be the result
195 // of the quorum's supported features. But if it happens, die.
196 ceph_assert(ceph::features::mon::get_supported().contains_all(features
));
// Difference between what is already pending and the persistent subset of
// the offered features.
198 mon_feature_t new_features
=
199 (pending_map
.persistent_features
^
200 (features
& ceph::features::mon::get_persistent()));
// Nothing new and the release already matches: just log it.
202 if (new_features
.empty() &&
203 pending_map
.min_mon_release
== min_mon_release
) {
204 dout(10) << __func__
<< " min_mon_release (" << (int)min_mon_release
205 << ") and features (" << features
<< ") match" << dendl
;
209 if (!new_features
.empty()) {
210 dout(1) << __func__
<< " applying new features "
211 << new_features
<< ", had " << pending_map
.persistent_features
213 << (new_features
| pending_map
.persistent_features
)
215 pending_map
.persistent_features
|= new_features
;
// min_mon_release is only ever raised here, never lowered.
217 if (min_mon_release
> pending_map
.min_mon_release
) {
218 dout(1) << __func__
<< " increasing min_mon_release to "
219 << to_integer
<int>(min_mon_release
) << " (" << min_mon_release
221 pending_map
.min_mon_release
= min_mon_release
;
// Hook run when the service becomes active.
//
// Once at least one map has been committed and the monitor has not yet
// recorded having joined, a "joined" key is written under
// Monitor::MONITOR_NAME and mon.has_ever_joined is set.
// On the leader, the current monmap is sent to the cluster log at debug
// level.
// Finally apply_mon_features() is called with the quorum's monitor features
// and mon.quorum_min_mon_release.
// NOTE(review): the closing braces are missing from this listing, so whether
// that last call is leader-only cannot be determined here.
227 void MonmapMonitor::on_active()
229 if (get_last_committed() >= 1 && !mon
.has_ever_joined
) {
230 // make note of the fact that i was, once, part of the quorum.
231 dout(10) << "noting that i was, once, part of an active quorum." << dendl
;
233 /* This is some form of nasty in-breeding we have between the MonmapMonitor
234 and the Monitor itself. We should find a way to get rid of it given our
235 new architecture. Until then, stick with it since we are a
236 single-threaded process and, truth be told, no one else relies on this
239 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
240 t
->put(Monitor::MONITOR_NAME
, "joined", 1);
241 mon
.store
->apply_transaction(t
);
242 mon
.has_ever_joined
= true;
245 if (mon
.is_leader()) {
246 mon
.clog
->debug() << "monmap " << *mon
.monmap
;
249 apply_mon_features(mon
.get_quorum_mon_features(),
250 mon
.quorum_min_mon_release
);
// Read-only dispatch for an incoming request, keyed on the message type.
//
// MSG_MON_COMMAND is forwarded to preprocess_command(); a bad_cmd_get
// exception is caught and answered with -EINVAL plus the exception text.
// Another path forwards to preprocess_join().
// NOTE(review): the case label for the join path, the default case and the
// `bl` declaration are not present in this listing.
253 bool MonmapMonitor::preprocess_query(MonOpRequestRef op
)
255 auto m
= op
->get_req
<PaxosServiceMessage
>();
256 switch (m
->get_type()) {
258 case MSG_MON_COMMAND
:
260 return preprocess_command(op
);
262 catch (const bad_cmd_get
& e
) {
264 mon
.reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
268 return preprocess_join(op
);
// Dumps monmap service state into formatter f: the first and last committed
// versions, a "monmap" object section, and a "quorum" array holding one
// "mon" integer per quorum member.
// NOTE(review): the statements that fill and close the sections are not
// present in this listing.
275 void MonmapMonitor::dump_info(Formatter
*f
)
277 f
->dump_unsigned("monmap_first_committed", get_first_committed());
278 f
->dump_unsigned("monmap_last_committed", get_last_committed());
279 f
->open_object_section("monmap");
282 f
->open_array_section("quorum");
283 for (set
<int>::iterator q
= mon
.get_quorum().begin(); q
!= mon
.get_quorum().end(); ++q
)
284 f
->dump_int("mon", *q
);
// Handles the read-only "mon ..." commands visible below:
//   "mon stat", "mon getmap" / "mon dump", "mon feature ls".
//
// The command JSON is parsed into cmdmap (a parse failure is answered with
// -EINVAL), one path answers -EACCES "access denied", and the output format
// defaults to "plain". The reply is sent through mon.reply_command() with
// the collected status string and rdata.
// NOTE(review): many declarations (cmdmap, ss, rdata, prefix, format, r, ds,
// epoch, ...), the session/capability check and the return statements are
// not present in this listing.
288 bool MonmapMonitor::preprocess_command(MonOpRequestRef op
)
290 auto m
= op
->get_req
<MMonCommand
>();
296 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
297 string rs
= ss
.str();
298 mon
.reply_command(op
, -EINVAL
, rs
, rdata
, get_last_committed());
303 cmd_getval(cmdmap
, "prefix", prefix
);
305 MonSession
*session
= op
->get_session();
307 mon
.reply_command(op
, -EACCES
, "access denied", get_last_committed());
312 cmd_getval(cmdmap
, "format", format
, string("plain"));
313 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
// "mon stat": monmap summary plus leader and quorum membership, either
// through the formatter or as a one-line text summary in ss.
315 if (prefix
== "mon stat") {
317 f
->open_object_section("monmap");
318 mon
.monmap
->dump_summary(f
.get());
319 f
->dump_string("leader", mon
.get_leader_name());
320 f
->open_array_section("quorum");
321 for (auto rank
: mon
.get_quorum()) {
322 std::string name
= mon
.monmap
->get_name(rank
);
323 f
->open_object_section("mon");
324 f
->dump_int("rank", rank
);
325 f
->dump_string("name", name
);
326 f
->close_section(); // mon
328 f
->close_section(); // quorum
329 f
->close_section(); // monmap
332 mon
.monmap
->print_summary(ss
);
333 ss
<< ", election epoch " << mon
.get_epoch() << ", leader "
334 << mon
.get_leader() << " " << mon
.get_leader_name()
335 << ", quorum " << mon
.get_quorum()
336 << " " << mon
.get_quorum_names();
// "mon getmap" / "mon dump": optional "epoch" argument (default 0); p starts
// out pointing at the current monmap and is compared against it again at the
// end of the branch.
343 } else if (prefix
== "mon getmap" ||
344 prefix
== "mon dump") {
348 cmd_getval(cmdmap
, "epoch", epochnum
, (int64_t)0);
351 MonMap
*p
= mon
.monmap
;
354 r
= get_version(epoch
, bl
);
356 ss
<< "there is no map for epoch " << epoch
;
360 ceph_assert(bl
.length() > 0);
// getmap returns the encoded map, using the requesting connection's features.
367 if (prefix
== "mon getmap") {
368 p
->encode(rdata
, m
->get_connection()->get_features());
370 ss
<< "got monmap epoch " << p
->get_epoch();
371 } else if (prefix
== "mon dump") {
374 f
->open_object_section("monmap");
376 f
->open_array_section("quorum");
377 for (set
<int>::iterator q
= mon
.get_quorum().begin();
378 q
!= mon
.get_quorum().end(); ++q
) {
379 f
->dump_int("mon", *q
);
390 ss
<< "dumped monmap epoch " << p
->get_epoch();
392 if (p
!= mon
.monmap
) {
// "mon feature ls": lists supported/persistent features and the features
// recorded in the current monmap; "--with-value" switches to the
// *_with_value printers.
397 } else if (prefix
== "mon feature ls") {
399 bool list_with_value
= false;
401 if (cmd_getval(cmdmap
, "with_value", with_value
) &&
402 with_value
== "--with-value") {
403 list_with_value
= true;
406 MonMap
*p
= mon
.monmap
;
409 mon_feature_t supported
= ceph::features::mon::get_supported();
410 mon_feature_t persistent
= ceph::features::mon::get_persistent();
411 mon_feature_t required
= p
->get_required_features();
// Local helper: emits one feature set either into the formatter (under
// m_str) or into the plain-text stream ds.
414 auto print_feature
= [&](mon_feature_t
& m_features
, const char* m_str
) {
417 m_features
.dump_with_value(f
.get(), m_str
);
419 m_features
.dump(f
.get(), m_str
);
422 m_features
.print_with_value(ds
);
424 m_features
.print(ds
);
429 f
->open_object_section("features");
431 f
->open_object_section("all");
432 print_feature(supported
, "supported");
433 print_feature(persistent
, "persistent");
434 f
->close_section(); // all
436 f
->open_object_section("monmap");
437 print_feature(p
->persistent_features
, "persistent");
438 print_feature(p
->optional_features
, "optional");
439 print_feature(required
, "required");
440 f
->close_section(); // monmap
442 f
->close_section(); // features
446 ds
<< "all features" << std::endl
448 print_feature(supported
, nullptr);
451 print_feature(persistent
, nullptr);
455 ds
<< "on current monmap (epoch "
456 << p
->get_epoch() << ")" << std::endl
458 print_feature(p
->persistent_features
, nullptr);
460 // omit optional features in plain-text
461 // makes it easier to read, and they're, currently, empty.
463 print_feature(required
, nullptr);
475 mon
.reply_command(op
, r
, rs
, rdata
, get_last_committed());
// Write-path dispatch for an incoming request, keyed on the message type.
//
// Logs the message and its origin, then forwards MSG_MON_COMMAND to
// prepare_command(); a bad_cmd_get exception is caught and answered with
// -EINVAL plus the exception text. Another path forwards to prepare_join().
// NOTE(review): the case label for the join path, the default case and the
// `bl` declaration are not present in this listing.
482 bool MonmapMonitor::prepare_update(MonOpRequestRef op
)
484 auto m
= op
->get_req
<PaxosServiceMessage
>();
485 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
487 switch (m
->get_type()) {
488 case MSG_MON_COMMAND
:
490 return prepare_command(op
);
491 } catch (const bad_cmd_get
& e
) {
493 mon
.reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
497 return prepare_join(op
);
// Handles the state-changing "mon ..." commands. The branches visible below
// are:
//   "mon add", "mon remove" / "mon rm", "mon feature set", "mon set-rank",
//   "mon set-addrs", "mon set-weight", "mon enable-msgr2",
//   "mon set election_strategy", "mon add disallowed_leader",
//   "mon rm disallowed_leader", "mon set_location",
//   "mon enable_stretch_mode"; anything else yields "unknown command".
//
// op: the request carrying an MMonCommand. The command JSON is parsed into
//     cmdmap (a parse failure is answered with -EINVAL) and one path answers
//     -EACCES "access denied".
//
// Checks against committed state use `monmap` (a reference to *mon.monmap);
// modifications go to pending_map, as laid out in the rules comment below.
// User-visible status text is accumulated in ss and sent back through
// mon.reply_command().
// NOTE(review): many declarations (cmdmap, ss, prefix, err, rs, name, ...),
// error-code assignments, gotos/returns and closing braces are not present
// in this listing; only the visible tokens are documented here.
505 bool MonmapMonitor::prepare_command(MonOpRequestRef op
)
507 auto m
= op
->get_req
<MMonCommand
>();
513 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
514 string rs
= ss
.str();
515 mon
.reply_command(op
, -EINVAL
, rs
, get_last_committed());
520 cmd_getval(cmdmap
, "prefix", prefix
);
522 MonSession
*session
= op
->get_session();
524 mon
.reply_command(op
, -EACCES
, "access denied", get_last_committed());
528 /* We should follow the following rules:
530 * - 'monmap' is the current, consistent version of the monmap
531 * - 'pending_map' is the uncommitted version of the monmap
533 * All checks for the current state must be made against 'monmap'.
534 * All changes are made against 'pending_map'.
536 * If there are concurrent operations modifying 'pending_map', please
537 * follow the following rules.
539 * - if pending_map has already been changed, the second operation must
540 * wait for the proposal to finish and be run again; This is the easiest
541 * path to guarantee correctness but may impact performance (i.e., it
542 * will take longer for the user to get a reply).
544 * - if the result of the second operation can be guaranteed to be
545 * idempotent, the operation may reply to the user once the proposal
546 * finishes; still needs to wait for the proposal to finish.
548 * - An operation _NEVER_ returns to the user based on pending state.
550 * If an operation does not modify current stable monmap, it may be
551 * serialized before current pending map, regardless of any change that
552 * has been made to the pending map -- remember, pending is uncommitted
553 * state, thus we are not bound by it.
556 ceph_assert(mon
.monmap
);
557 MonMap
&monmap
= *mon
.monmap
;
562 * Adding or removing monitors may lead to loss of quorum.
564 * Because quorum may be lost, it's important to reply something
565 * to the user, lest she end up waiting forever for a reply. And
566 * no reply will ever be sent until quorum is formed again.
568 * On the other hand, this means we're leaking uncommitted state
569 * to the user. As such, please be mindful of the reply message.
571 * e.g., 'adding monitor mon.foo' is okay ('adding' is an on-going
572 * operation and conveys its not-yet-permanent nature); whereas
573 * 'added monitor mon.foo' presumes the action has successfully
574 * completed and state has been committed, which may not be true.
578 bool propose
= false;
// "mon add": parse name/addr/location, build the address vector, then add
// the monitor to pending_map unless committed state already has it.
579 if (prefix
== "mon add") {
581 cmd_getval(cmdmap
, "name", name
);
583 cmd_getval(cmdmap
, "addr", addrstr
);
587 if (!addr
.parse(addrstr
.c_str())) {
// Keep a space between the offending address and the rest of the message.
589 ss
<< "addr " << addrstr
<< " does not parse";
593 vector
<string
> locationvec
;
594 map
<string
, string
> loc
;
595 cmd_getval(cmdmap
, "location", locationvec
);
596 CrushWrapper::parse_loc_map(locationvec
, &loc
);
597 if (locationvec
.size() &&
598 !mon
.get_quorum_mon_features().contains_all(
599 ceph::features::mon::FEATURE_PINGING
)) {
601 ss
<< "Not all monitors support adding monitors with a location; please upgrade first!";
604 if (locationvec
.size() && !loc
.size()) {
605 ss
<< "We could not parse your input location to anything real; " << locationvec
606 << " turned into an empty map!";
611 dout(10) << "mon add setting location for " << name
<< " to " << loc
<< dendl
;
613 // TODO: validate location in crush map
614 if (monmap
.stretch_mode_enabled
&& !loc
.size()) {
615 ss
<< "We are in stretch mode and new monitors must have a location, but "
616 << "could not parse your input location to anything real; " << locationvec
617 << " turned into an empty map!";
621 // TODO: validate location against any existing stretch config
// Address selection depends on whether the committed map has
// FEATURE_NAUTILUS among its persistent features.
623 entity_addrvec_t addrs
;
624 if (monmap
.persistent_features
.contains_all(
625 ceph::features::mon::FEATURE_NAUTILUS
)) {
626 if (addr
.get_port() == CEPH_MON_PORT_IANA
) {
627 addr
.set_type(entity_addr_t::TYPE_MSGR2
);
629 if (addr
.get_port() == CEPH_MON_PORT_LEGACY
) {
630 // if they specified the *old* default they probably don't care
633 if (addr
.get_port()) {
634 addrs
.v
.push_back(addr
);
636 addr
.set_type(entity_addr_t::TYPE_MSGR2
);
637 addr
.set_port(CEPH_MON_PORT_IANA
);
638 addrs
.v
.push_back(addr
);
639 addr
.set_type(entity_addr_t::TYPE_LEGACY
);
640 addr
.set_port(CEPH_MON_PORT_LEGACY
);
641 addrs
.v
.push_back(addr
);
644 if (addr
.get_port() == 0) {
645 addr
.set_port(CEPH_MON_PORT_LEGACY
);
647 addr
.set_type(entity_addr_t::TYPE_LEGACY
);
648 addrs
.v
.push_back(addr
);
650 dout(20) << __func__
<< " addr " << addr
<< " -> addrs " << addrs
<< dendl
;
653 * If we have a monitor with the same name and different addr, then EEXIST
654 * If we have a monitor with the same addr and different name, then EEXIST
655 * If we have a monitor with the same addr and same name, then wait for
656 * the proposal to finish and return success.
657 * If we don't have the monitor, add it.
661 if (!ss
.str().empty())
665 if (monmap
.contains(name
)) {
666 if (monmap
.get_addrs(name
) == addrs
) {
667 // stable map contains monitor with the same name at the same address.
668 // serialize before current pending map.
669 err
= 0; // for clarity; this has already been set above.
670 ss
<< "mon." << name
<< " at " << addrs
<< " already exists";
674 << " already exists at address " << monmap
.get_addrs(name
);
676 } else if (monmap
.contains(addrs
)) {
677 // we established on the previous branch that name is different
678 ss
<< "mon." << monmap
.get_name(addrs
)
679 << " already exists at address " << addr
;
688 if (pending_map
.stretch_mode_enabled
) {
692 /* Given there's no delay between proposals on the MonmapMonitor (see
693 * MonmapMonitor::should_propose()), there is no point in checking for
694 * a mismatch between name and addr on pending_map.
696 * Once we established the monitor does not exist in the committed state,
697 * we can simply go ahead and add the monitor.
700 pending_map
.add(name
, addrs
);
701 pending_map
.mon_info
[name
].crush_loc
= loc
;
702 pending_map
.last_changed
= ceph_clock_now();
703 ss
<< "adding mon." << name
<< " at " << addrs
;
705 dout(0) << __func__
<< " proposing new mon." << name
<< dendl
;
// "mon remove" / "mon rm": refuse unknown names and removal of the last
// monitor, otherwise drop the monitor from pending_map.
707 } else if (prefix
== "mon remove" ||
708 prefix
== "mon rm") {
710 cmd_getval(cmdmap
, "name", name
);
711 if (!monmap
.contains(name
)) {
713 ss
<< "mon." << name
<< " does not exist or has already been removed";
717 if (monmap
.size() == 1) {
719 ss
<< "error: refusing removal of last monitor " << name
;
723 /* At the time of writing, there is no risk of races when multiple clients
724 * attempt to use the same name. The reason is simple but may not be
727 * In a nutshell, we do not collate proposals on the MonmapMonitor. As
728 * soon as we return 'true' below, PaxosService::dispatch() will check if
729 * the service should propose, and - if so - the service will be marked as
730 * 'proposing' and a proposal will be triggered. The PaxosService class
731 * guarantees that once a service is marked 'proposing' no further writes
734 * The decision on whether the service should propose or not is, in this
735 * case, made by MonmapMonitor::should_propose(), which always considers
736 * the proposal delay being 0.0 seconds. This is key for PaxosService to
737 * trigger the proposal immediately.
738 * 0.0 seconds of delay.
740 * From the above, there's no point in performing further checks on the
741 * pending_map, as we don't ever have multiple proposals in-flight in
742 * this service. As we've established the committed state contains the
743 * monitor, we can simply go ahead and remove it.
745 * Please note that the code hinges on all of the above to be true. It
746 * has been true since time immemorial and we don't see a good reason
747 * to make it sturdier at this time - mainly because we don't think it's
748 * going to change any time soon, lest for any bug that may be unwillingly
752 entity_addrvec_t addrs
= pending_map
.get_addrs(name
);
753 pending_map
.remove(name
);
754 pending_map
.last_changed
= ceph_clock_now();
755 ss
<< "removing mon." << name
<< " at " << addrs
756 << ", there will be " << pending_map
.size() << " monitors" ;
// "mon feature set": set one persistent feature, by name, in pending_map.
760 } else if (prefix
== "mon feature set") {
764 * We currently only support setting/unsetting persistent features.
765 * This is by design, given at the moment we still don't have optional
766 * features, and, as such, there is no point introducing an interface
767 * to manipulate them. This allows us to provide a cleaner, more
768 * intuitive interface to the user, modifying solely persistent
771 * In the future we should consider adding another interface to handle
772 * optional features/flags; e.g., 'mon feature flag set/unset', or
773 * 'mon flag set/unset'.
776 if (!cmd_getval(cmdmap
, "feature_name", feature_name
)) {
777 ss
<< "missing required feature name";
782 mon_feature_t feature
;
783 feature
= ceph::features::mon::get_feature_by_name(feature_name
);
784 if (feature
== ceph::features::mon::FEATURE_NONE
) {
785 ss
<< "unknown feature '" << feature_name
<< "'";
791 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
793 ss
<< "please specify '--yes-i-really-mean-it' if you "
794 << "really, **really** want to set feature '"
795 << feature
<< "' in the monmap.";
800 if (!mon
.get_quorum_mon_features().contains_all(feature
)) {
801 ss
<< "current quorum does not support feature '" << feature
802 << "'; supported features: "
803 << mon
.get_quorum_mon_features();
808 ss
<< "setting feature '" << feature
<< "'";
811 if (monmap
.persistent_features
.contains_all(feature
)) {
812 dout(10) << __func__
<< " feature '" << feature
813 << "' already set on monmap; no-op." << dendl
;
817 pending_map
.persistent_features
.set_feature(feature
);
818 pending_map
.last_changed
= ceph_clock_now();
821 dout(1) << __func__
<< " " << ss
.str() << "; new features will be: "
822 << "persistent = " << pending_map
.persistent_features
823 // output optional nevertheless, for auditing purposes.
824 << ", optional = " << pending_map
.optional_features
<< dendl
;
// "mon set-rank": change the rank of a named monitor in pending_map.
826 } else if (prefix
== "mon set-rank") {
829 if (!cmd_getval(cmdmap
, "name", name
) ||
830 !cmd_getval(cmdmap
, "rank", rank
)) {
834 int oldrank
= pending_map
.get_rank(name
);
836 ss
<< "mon." << name
<< " does not exist in monmap";
841 pending_map
.set_rank(name
, rank
);
842 pending_map
.last_changed
= ceph_clock_now();
// "mon set-addrs": replace the address vector of a named monitor.
844 } else if (prefix
== "mon set-addrs") {
847 if (!cmd_getval(cmdmap
, "name", name
) ||
848 !cmd_getval(cmdmap
, "addrs", addrs
)) {
852 if (!pending_map
.contains(name
)) {
853 ss
<< "mon." << name
<< " does not exist";
858 if (!av
.parse(addrs
.c_str(), nullptr)) {
859 ss
<< "failed to parse addrs '" << addrs
<< "'";
863 for (auto& a
: av
.v
) {
866 ss
<< "monitor must bind to a non-zero port, not " << a
;
872 pending_map
.set_addrvec(name
, av
);
873 pending_map
.last_changed
= ceph_clock_now();
// "mon set-weight": change the weight of a named monitor.
875 } else if (prefix
== "mon set-weight") {
878 if (!cmd_getval(cmdmap
, "name", name
) ||
879 !cmd_getval(cmdmap
, "weight", weight
)) {
883 if (!pending_map
.contains(name
)) {
884 ss
<< "mon." << name
<< " does not exist";
889 pending_map
.set_weight(name
, weight
);
890 pending_map
.last_changed
= ceph_clock_now();
// "mon enable-msgr2": for monitors with a single legacy address on the
// legacy default port, build a new address vector from a msgr2 copy on the
// IANA port and the existing legacy address.
892 } else if (prefix
== "mon enable-msgr2") {
893 if (!monmap
.get_required_features().contains_all(
894 ceph::features::mon::FEATURE_NAUTILUS
)) {
896 ss
<< "all monitors must be running nautilus to enable v2";
899 for (auto& i
: pending_map
.mon_info
) {
900 if (i
.second
.public_addrs
.v
.size() == 1 &&
901 i
.second
.public_addrs
.front().is_legacy() &&
902 i
.second
.public_addrs
.front().get_port() == CEPH_MON_PORT_LEGACY
) {
904 entity_addr_t a
= i
.second
.public_addrs
.front();
905 a
.set_type(entity_addr_t::TYPE_MSGR2
);
906 a
.set_port(CEPH_MON_PORT_IANA
);
908 av
.v
.push_back(i
.second
.public_addrs
.front());
909 dout(10) << " setting mon." << i
.first
910 << " addrs " << i
.second
.public_addrs
911 << " -> " << av
<< dendl
;
912 pending_map
.set_addrvec(i
.first
, av
);
914 pending_map
.last_changed
= ceph_clock_now();
// "mon set election_strategy": classic / disallow / connectivity.
918 } else if (prefix
== "mon set election_strategy") {
919 if (!mon
.get_quorum_mon_features().contains_all(
920 ceph::features::mon::FEATURE_PINGING
)) {
922 ss
<< "Not all monitors support changing election strategies; please upgrade first!";
926 MonMap::election_strategy strategy
;
927 if (!cmd_getval(cmdmap
, "strategy", strat
)) {
931 if (strat
== "classic") {
932 strategy
= MonMap::CLASSIC
;
933 } else if (strat
== "disallow") {
934 strategy
= MonMap::DISALLOW
;
935 } else if (strat
== "connectivity") {
936 strategy
= MonMap::CONNECTIVITY
;
942 pending_map
.strategy
= strategy
;
// "mon add disallowed_leader": only valid under the disallow or connectivity
// strategies, and never for the last remaining allowed leader.
944 } else if (prefix
== "mon add disallowed_leader") {
945 if (!mon
.get_quorum_mon_features().contains_all(
946 ceph::features::mon::FEATURE_PINGING
)) {
948 ss
<< "Not all monitors support changing election strategies; please upgrade first!";
952 if (!cmd_getval(cmdmap
, "name", name
)) {
956 if (pending_map
.strategy
!= MonMap::DISALLOW
&&
957 pending_map
.strategy
!= MonMap::CONNECTIVITY
) {
958 ss
<< "You cannot disallow monitors in your current election mode";
962 if (!pending_map
.contains(name
)) {
963 ss
<< "mon." << name
<< " does not exist";
967 if (pending_map
.disallowed_leaders
.count(name
)) {
968 ss
<< "mon." << name
<< " is already disallowed";
972 if (pending_map
.disallowed_leaders
.size() == pending_map
.size() - 1) {
973 ss
<< "mon." << name
<< " is the only remaining allowed leader!";
977 pending_map
.disallowed_leaders
.insert(name
);
// "mon rm disallowed_leader": the inverse of the branch above.
980 } else if (prefix
== "mon rm disallowed_leader") {
981 if (!mon
.get_quorum_mon_features().contains_all(
982 ceph::features::mon::FEATURE_PINGING
)) {
984 ss
<< "Not all monitors support changing election strategies; please upgrade first!";
988 if (!cmd_getval(cmdmap
, "name", name
)) {
992 if (pending_map
.strategy
!= MonMap::DISALLOW
&&
993 pending_map
.strategy
!= MonMap::CONNECTIVITY
) {
994 ss
<< "You cannot disallow monitors in your current election mode";
998 if (!pending_map
.contains(name
)) {
999 ss
<< "mon." << name
<< " does not exist";
1003 if (!pending_map
.disallowed_leaders
.count(name
)) {
1004 ss
<< "mon." << name
<< " is already allowed";
1008 pending_map
.disallowed_leaders
.erase(name
);
// "mon set_location": store a parsed crush location for a named monitor.
1011 } else if (prefix
== "mon set_location") {
1012 if (!mon
.get_quorum_mon_features().contains_all(
1013 ceph::features::mon::FEATURE_PINGING
)) {
1015 ss
<< "Not all monitors support monitor locations; please upgrade first!";
1019 if (!cmd_getval(cmdmap
, "name", name
)) {
1023 if (!pending_map
.contains(name
)) {
1024 ss
<< "mon." << name
<< " does not exist";
// Retry the whole message once the OSD monitor is readable.
1029 if (!mon
.osdmon()->is_readable()) {
1030 mon
.osdmon()->wait_for_readable(op
, new Monitor::C_RetryMessage(&mon
, op
));
1032 vector
<string
> argvec
;
1033 map
<string
, string
> loc
;
1034 cmd_getval(cmdmap
, "args", argvec
);
1035 CrushWrapper::parse_loc_map(argvec
, &loc
);
1037 dout(10) << "mon set_location for " << name
<< " to " << loc
<< dendl
;
1039 // TODO: validate location in crush map
1041 ss
<< "We could not parse your input location to anything real; " << argvec
1042 << " turned into an empty map!";
1046 // TODO: validate location against any existing stretch config
1047 pending_map
.mon_info
[name
].crush_loc
= loc
;
// "mon enable_stretch_mode": validation calls are made first with commit ==
// false on both this service and the OSD monitor, and only afterwards with
// commit == true.
1050 } else if (prefix
== "mon enable_stretch_mode") {
1051 if (!mon
.osdmon()->is_writeable()) {
1053 << ": waiting for osdmon writeable for stretch mode" << dendl
;
1054 mon
.osdmon()->wait_for_writeable(op
, new Monitor::C_RetryMessage(&mon
, op
));
1058 if (monmap
.stretch_mode_enabled
) {
1059 ss
<< "stretch mode is already engaged";
1063 if (pending_map
.stretch_mode_enabled
) {
1064 ss
<< "stretch mode currently committing";
1068 string tiebreaker_mon
;
1069 if (!cmd_getval(cmdmap
, "tiebreaker_mon", tiebreaker_mon
)) {
1070 ss
<< "must specify a tiebreaker monitor";
1074 string new_crush_rule
;
1075 if (!cmd_getval(cmdmap
, "new_crush_rule", new_crush_rule
)) {
1076 ss
<< "must specify a new crush rule that spreads out copies over multiple sites";
1080 string dividing_bucket
;
1081 if (!cmd_getval(cmdmap
, "dividing_bucket", dividing_bucket
)) {
1082 ss
<< "must specify a dividing bucket";
1086 //okay, initial arguments make sense, check pools and cluster state
1087 err
= mon
.osdmon()->check_cluster_features(CEPH_FEATUREMASK_STRETCH_MODE
, ss
);
// Scope guard: plugs Paxos on construction and unplugs it on destruction.
1092 Plugger(Paxos
&p
) : p(p
) { p
.plug(); }
1093 ~Plugger() { p
.unplug(); }
1096 set
<pg_pool_t
*> pools
;
1100 mon
.osdmon()->try_enable_stretch_mode_pools(ss
, &okay
, &errcode
,
1101 &pools
, new_crush_rule
);
1106 try_enable_stretch_mode(ss
, &okay
, &errcode
, false,
1107 tiebreaker_mon
, dividing_bucket
);
1112 mon
.osdmon()->try_enable_stretch_mode(ss
, &okay
, &errcode
, false,
1113 dividing_bucket
, 2, pools
, new_crush_rule
);
1118 // everything looks good, actually commit the changes!
1119 try_enable_stretch_mode(ss
, &okay
, &errcode
, true,
1120 tiebreaker_mon
, dividing_bucket
);
1121 mon
.osdmon()->try_enable_stretch_mode(ss
, &okay
, &errcode
, true,
1123 2, // right now we only support 2 sites
1124 pools
, new_crush_rule
);
1125 ceph_assert(okay
== true);
1127 request_proposal(mon
.osdmon());
1131 ss
<< "unknown command " << prefix
;
1137 mon
.reply_command(op
, err
, rs
, get_last_committed());
1138 // we are returning to the user; do not propose.
// Validates (and, on the commit pass, applies) the monitor-side
// prerequisites for stretch mode.
//
// ss:              receives a human-readable reason when a check fails.
// okay, errcode:   out-parameters for the result. NOTE(review): their
//                  assignments are not present in this listing.
// commit:          every failure path asserts !commit, i.e. a commit call is
//                  expected to follow a successful validation-only call.
// tiebreaker_mon:  name of the tiebreaker monitor; must be in pending_map.
// dividing_bucket: crush location key that every monitor must carry.
//
// Checks visible below: the election strategy must be CONNECTIVITY; the
// tiebreaker must exist; each monitor in the committed monmap must have a
// crush_loc entry for dividing_bucket; the monitors must not span more than
// two buckets (the tiebreaker's own bucket is tracked separately); at least
// two buckets must be populated; the tiebreaker must not share a bucket with
// the others.
// The final statements add the tiebreaker to disallowed_leaders, record it
// and set stretch_mode_enabled in pending_map. NOTE(review): the guard
// around them (presumably `commit`) is not present in this listing.
1142 void MonmapMonitor::try_enable_stretch_mode(stringstream
& ss
, bool *okay
,
1143 int *errcode
, bool commit
,
1144 const string
& tiebreaker_mon
,
1145 const string
& dividing_bucket
)
1147 dout(20) << __func__
<< dendl
;
1149 if (pending_map
.strategy
!= MonMap::CONNECTIVITY
) {
1150 ss
<< "Monitors must use the connectivity strategy to enable stretch mode";
1152 ceph_assert(!commit
);
1155 if (!pending_map
.contains(tiebreaker_mon
)) {
// Keep a space between the monitor name and the rest of the message.
1156 ss
<< "mon " << tiebreaker_mon
<< " does not seem to exist";
1158 ceph_assert(!commit
);
// Map each monitor name to its value for dividing_bucket.
1161 map
<string
,string
> buckets
;
1162 for (const auto&mii
: mon
.monmap
->mon_info
) {
1163 const auto& mi
= mii
.second
;
1164 const auto& bi
= mi
.crush_loc
.find(dividing_bucket
);
1165 if (bi
== mi
.crush_loc
.end()) {
1166 ss
<< "Could not find location entry for " << dividing_bucket
1167 << " on monitor " << mi
.name
;
1169 ceph_assert(!commit
);
1172 buckets
[mii
.first
] = bi
->second
;
// Collect the distinct buckets of the monitors.
1174 string bucket1
, bucket2
, tiebreaker_bucket
;
1175 for (auto& i
: buckets
) {
1176 if (i
.first
== tiebreaker_mon
) {
1177 tiebreaker_bucket
= i
.second
;
1180 if (bucket1
.empty()) {
1183 if (bucket1
!= i
.second
&&
1187 if (bucket1
!= i
.second
&&
1188 bucket2
!= i
.second
) {
1189 ss
<< "There are too many monitor buckets for stretch mode, found "
1190 << bucket1
<< "," << bucket2
<< "," << i
.second
;
1192 ceph_assert(!commit
);
1196 if (bucket1
.empty() || bucket2
.empty()) {
// Report whichever bucket was actually found; selecting the empty one would
// always print nothing here.
1197 ss
<< "There are not enough monitor buckets for stretch mode;"
1198 << " must have at least 2 plus the tiebreaker but only found "
1199 << (bucket1
.empty() ? bucket2
: bucket1
);
1201 ceph_assert(!commit
);
1204 if (tiebreaker_bucket
== bucket1
||
1205 tiebreaker_bucket
== bucket2
) {
1206 ss
<< "The named tiebreaker monitor " << tiebreaker_mon
1207 << " is in the same CRUSH bucket " << tiebreaker_bucket
1208 << " as other monitors";
1210 ceph_assert(!commit
);
// Record the stretch-mode configuration in the pending map.
1214 pending_map
.disallowed_leaders
.insert(tiebreaker_mon
);
1215 pending_map
.tiebreaker_mon
= tiebreaker_mon
;
1216 pending_map
.stretch_mode_enabled
= true;
// Records the given monitors as marked down for stretch mode by inserting
// every name from dead_mons into pending_map.stretch_marked_down_mons.
// NOTE(review): any follow-up step (e.g. proposing the change) is not
// present in this listing.
1221 void MonmapMonitor::trigger_degraded_stretch_mode(const set
<string
>& dead_mons
)
1223 dout(20) << __func__
<< dendl
;
1224 pending_map
.stretch_marked_down_mons
.insert(dead_mons
.begin(), dead_mons
.end());
// Clears pending_map.stretch_marked_down_mons, undoing what
// trigger_degraded_stretch_mode() recorded.
// NOTE(review): any follow-up step (e.g. proposing the change) is not
// present in this listing.
1228 void MonmapMonitor::trigger_healthy_stretch_mode()
1230 dout(20) << __func__
<< dendl
;
1231 pending_map
.stretch_marked_down_mons
.clear();
// Read-only screening of an MMonJoin request.
//
// Visible checks, in order:
//  - the session must be capable of MON_CAP_W | MON_CAP_X on "mon",
//    otherwise " insufficient caps" is logged;
//  - a monitor with the same name, a non-blank first public address and
//    (unless force_loc is set) any crush_loc, or an identical one, counts as
//    already present;
//  - a monitor already registered at the same addresses under the same name
//    (with a compatible crush_loc) counts as already present;
//  - in stretch mode, a join with no crush_loc from either the request or
//    the existing entry is logged to the cluster log.
// NOTE(review): the return statements and the addr_name declaration are not
// present in this listing.
1235 bool MonmapMonitor::preprocess_join(MonOpRequestRef op
)
1237 auto join
= op
->get_req
<MMonJoin
>();
1238 dout(10) << __func__
<< " " << join
->name
<< " at " << join
->addrs
<< dendl
;
1240 MonSession
*session
= op
->get_session();
1242 !session
->is_capable("mon", MON_CAP_W
| MON_CAP_X
)) {
1243 dout(10) << " insufficient caps" << dendl
;
// Lookup by name in the pending map.
1247 const auto name_info_i
= pending_map
.mon_info
.find(join
->name
);
1248 if (name_info_i
!= pending_map
.mon_info
.end() &&
1249 !name_info_i
->second
.public_addrs
.front().is_blank_ip() &&
1250 (!join
->force_loc
|| join
->crush_loc
== name_info_i
->second
.crush_loc
)) {
1251 dout(10) << " already have " << join
->name
<< dendl
;
// Lookup by address in the pending map.
1255 if (pending_map
.contains(join
->addrs
)) {
1256 addr_name
= pending_map
.get_name(join
->addrs
);
1258 if (!addr_name
.empty() &&
1259 addr_name
== join
->name
&&
1260 (!join
->force_loc
|| join
->crush_loc
.empty() ||
1261 pending_map
.mon_info
[addr_name
].crush_loc
== join
->crush_loc
)) {
1262 dout(10) << " already have " << join
->addrs
<< dendl
;
1265 if (pending_map
.stretch_mode_enabled
&&
1266 join
->crush_loc
.empty() &&
1267 (addr_name
.empty() ||
1268 pending_map
.mon_info
[addr_name
].crush_loc
.empty())) {
1269 dout(10) << "stretch mode engaged but no source of crush_loc" << dendl
;
// NOTE(review): join->name is streamed both before and after "attempted to
// join from" -- confirm whether the second occurrence is intended.
1270 mon
.clog
->info() << join
->name
<< " attempted to join from " << join
->name
1271 << ' ' << join
->addrs
1272 << "; but lacks a crush_location for stretch mode";
// Applies an MMonJoin request to pending_map.
//
// Any entry already registered at the joining addresses is removed (its
// crush_loc is remembered first), any entry with the joining name is
// removed, and the monitor is then added under join->name / join->addrs.
// The new entry's crush_loc is the one from the request when force_loc is
// set or no previous location existed; otherwise the remembered location is
// kept. last_changed is stamped with the current time.
// NOTE(review): the return statement is not present in this listing.
1278 bool MonmapMonitor::prepare_join(MonOpRequestRef op
)
1280 auto join
= op
->get_req
<MMonJoin
>();
1281 dout(0) << "adding/updating " << join
->name
1282 << " at " << join
->addrs
<< " to monitor cluster" << dendl
;
1283 map
<string
,string
> existing_loc
;
// Same addresses already registered (possibly under another name).
1284 if (pending_map
.contains(join
->addrs
)) {
1285 string name
= pending_map
.get_name(join
->addrs
);
1286 existing_loc
= pending_map
.mon_info
[name
].crush_loc
;
1287 pending_map
.remove(name
);
1289 if (pending_map
.contains(join
->name
))
1290 pending_map
.remove(join
->name
);
1291 pending_map
.add(join
->name
, join
->addrs
);
1292 pending_map
.mon_info
[join
->name
].crush_loc
=
1293 ((join
->force_loc
|| existing_loc
.empty()) ?
1294 join
->crush_loc
: existing_loc
);
1295 pending_map
.last_changed
= ceph_clock_now();
// Decides whether the service should propose, reporting the proposal delay
// through `delay`.
// NOTE(review): the body is not present in this listing. A comment in
// prepare_command() states that this function "always considers the proposal
// delay being 0.0 seconds" -- confirm against the full file.
1299 bool MonmapMonitor::should_propose(double& delay
)
// Reads the last committed monmap blob into bl.
//
// bl: output buffer filled by get_version().
//
// The store is first checked for a key named after the last committed
// version under this service's name; the blob is then fetched with
// get_version() and a failure is logged with cpp_strerror().
// NOTE(review): the return statements (and thus the exact error codes) are
// not present in this listing.
1305 int MonmapMonitor::get_monmap(bufferlist
&bl
)
1307 version_t latest_ver
= get_last_committed();
1308 dout(10) << __func__
<< " ver " << latest_ver
<< dendl
;
1310 if (!mon
.store
->exists(get_service_name(), stringify(latest_ver
)))
1313 int err
= get_version(latest_ver
, bl
);
1315 dout(1) << __func__
<< " error obtaining monmap: "
1316 << cpp_strerror(err
) << dendl
;
// Walks every subscription registered under the "monmap" type in the
// monitor's session map; nothing is iterated when no such entry exists.
// NOTE(review): the loop body is not present in this listing; presumably it
// calls check_sub() on each subscription -- confirm.
1322 void MonmapMonitor::check_subs()
1324 const string type
= "monmap";
1325 mon
.with_session_map([this, &type
](const MonSessionMap
& session_map
) {
1326 auto subs
= session_map
.subs
.find(type
);
1327 if (subs
== session_map
.subs
.end())
1329 for (auto sub
: *subs
->second
) {
// Services one monmap subscription.
//
// sub: the subscription to check; when sub->next is not past the current
//      monmap epoch, the latest monmap is sent over the subscriber's
//      connection.
//
// Two follow-ups are visible: removing the subscription from the session
// map, and advancing sub->next to epoch + 1.
// NOTE(review): the condition choosing between them (presumably one-time vs.
// persistent subscriptions) is not present in this listing.
1335 void MonmapMonitor::check_sub(Subscription
*sub
)
1337 const auto epoch
= mon
.monmap
->get_epoch();
1338 dout(10) << __func__
1339 << " monmap next " << sub
->next
1340 << " have " << epoch
<< dendl
;
1341 if (sub
->next
<= epoch
) {
1342 mon
.send_latest_monmap(sub
->session
->con
.get());
1344 mon
.with_session_map([sub
](MonSessionMap
& session_map
) {
1345 session_map
.remove_sub(sub
);
1348 sub
->next
= epoch
+ 1;
// Periodic maintenance: back-fills a missing `created` timestamp.
//
// When the committed monmap has a zero `created` stamp, committed versions
// are scanned from 1 up to get_last_committed(); the last_changed of the
// first map that has a non-zero one is taken as the creation time, falling
// back to the current time when none is found. The result is written to
// pending_map.created.
// NOTE(review): the leading guard lines, the declarations of bl, m and
// ctime, the decode call, the loop exit and everything after the final
// assignment are not present in this listing.
1353 void MonmapMonitor::tick()
1360 if (mon
.monmap
->created
.is_zero()) {
1361 dout(10) << __func__
<< " detected empty created stamp" << dendl
;
1363 for (version_t v
= 1; v
<= get_last_committed(); v
++) {
1365 int r
= get_version(v
, bl
);
1370 auto p
= bl
.cbegin();
1372 if (!m
.last_changed
.is_zero()) {
1373 dout(10) << __func__
<< " first monmap with last_changed is "
1374 << v
<< " with " << m
.last_changed
<< dendl
;
1375 ctime
= m
.last_changed
;
// No historical timestamp found: fall back to the current time.
1379 if (ctime
.is_zero()) {
1380 ctime
= ceph_clock_now();
1382 dout(10) << __func__
<< " updating created stamp to " << ctime
<< dendl
;
1383 pending_map
.created
= ctime
;