1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
12 #include <seastar/core/fstream.hh>
13 #include <seastar/core/reactor.hh>
14 #include <seastar/net/dns.hh>
15 #include "crimson/common/config_proxy.h"
18 #include "common/Formatter.h"
20 #include "include/ceph_features.h"
21 #include "include/addr_parsing.h"
22 #include "common/ceph_argparse.h"
23 #include "common/dns_resolve.h"
24 #include "common/errno.h"
25 #include "common/dout.h"
26 #include "common/Clock.h"
28 using ceph::Formatter
;
// Serialize this mon_info_t into bl for a peer advertising `features`.
// The visible code tests the SERVER_NAUTILUS feature bit, opens a versioned
// section with ENCODE_START(v, 1, bl), and contains two alternative encodings
// of the address: the legacy address only, or the whole public_addrs value.
// NOTE(review): where v is assigned, and the condition that selects between
// the two address encodings, are not visible in this excerpt -- confirm
// against the full source before relying on this summary.
30 void mon_info_t::encode(bufferlist
& bl
, uint64_t features
) const
33 if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
36 ENCODE_START(v
, 1, bl
);
// address form A: only the legacy address extracted from public_addrs
39 encode(public_addrs
.legacy_addr(), bl
, features
);
// address form B: the complete public_addrs value
41 encode(public_addrs
, bl
, features
);
// Deserialize a mon_info_t from the buffer iterator p.
// Only the step that decodes public_addrs is visible in this excerpt.
// NOTE(review): the other decoded fields and any version handling are not
// visible here -- confirm against the full source.
47 void mon_info_t::decode(bufferlist::const_iterator
& p
)
51 decode(public_addrs
, p
);
// Stream a textual description of this entry to out.
// The visible part of the expression appends the labelled fields
// addrs (public_addrs) and priority, in that order.
// NOTE(review): the beginning of the output expression is not visible here.
58 void mon_info_t::print(ostream
& out
) const
61 << " addrs " << public_addrs
62 << " priority " << priority
;
// Comparison functor body for two mon_info_t values.
// Entries are ordered by public_addrs.legacy_or_front_addr(); when both
// entries report the same address, the order is decided by name instead.
// NOTE(review): presumably this is the rank_cmp functor used by
// calc_legacy_ranks -- the enclosing struct header is not visible here.
67 bool operator()(const mon_info_t
&a
, const mon_info_t
&b
) const {
// same address on both sides: fall back to comparing the names
68 if (a
.public_addrs
.legacy_or_front_addr() == b
.public_addrs
.legacy_or_front_addr())
69 return a
.name
< b
.name
;
// different addresses: the address comparison alone decides
70 return a
.public_addrs
.legacy_or_front_addr() < b
.public_addrs
.legacy_or_front_addr();
// Recompute the ranks vector from mon_info.
// Visible steps: ranks is resized to mon_info.size(), a temporary set ordered
// by rank_cmp is declared, mon_info is iterated, and then the set is iterated.
// NOTE(review): both loop bodies are missing from this excerpt, so how tmp is
// filled and how ranks is written is not shown here.
75 void MonMap::calc_legacy_ranks()
77 ranks
.resize(mon_info
.size());
79 // Used to order entries according to public_addr, because that's
80 // how the ranks are expected to be ordered. We may expand this
81 // later on, according to some other criteria, by specifying a
82 // different comparator.
84 // Please note that we use a 'set' here instead of resorting to
85 // std::sort() because we need more info than what is available in
86 // the vector. The vector will thus be ordered by, e.g., public_addr
87 // while only containing the names of each individual monitor.
88 // The only way of achieving this with std::sort() would be to first
89 // insert every mon_info_t entry into a vector 'foo', std::sort() 'foo'
90 // with custom comparison functions, and then copy each individual entry
91 // to a new vector. Unless there's a simpler way, we don't think the
92 // added complexity makes up for the additional memory usage of a 'set'.
93 set
<mon_info_t
, rank_cmp
> tmp
;
95 for (map
<string
,mon_info_t
>::iterator p
= mon_info
.begin();
98 mon_info_t
&m
= p
->second
;
102 // map the set to the actual ranks etc
// NOTE(review): tmp is declared as set<mon_info_t, rank_cmp> but the iterator
// below is spelled set<mon_info_t>::iterator (default comparator). Whether
// those two iterator types are interchangeable depends on the standard
// library implementation; consider auto or the exact set type -- confirm.
104 for (set
<mon_info_t
>::iterator p
= tmp
.begin();
111 void MonMap::encode(bufferlist
& blist
, uint64_t con_features
) const
113 if ((con_features
& CEPH_FEATURE_MONNAMES
) == 0) {
117 encode_raw(fsid
, blist
);
118 encode(epoch
, blist
);
119 vector
<entity_inst_t
> mon_inst(ranks
.size());
120 for (unsigned n
= 0; n
< ranks
.size(); n
++) {
121 mon_inst
[n
].name
= entity_name_t::MON(n
);
122 mon_inst
[n
].addr
= get_addrs(n
).legacy_addr();
124 encode(mon_inst
, blist
, con_features
);
125 encode(last_changed
, blist
);
126 encode(created
, blist
);
130 map
<string
,entity_addr_t
> legacy_mon_addr
;
131 if (!HAVE_FEATURE(con_features
, MONENC
) ||
132 !HAVE_FEATURE(con_features
, SERVER_NAUTILUS
)) {
133 for (auto& [name
, info
] : mon_info
) {
134 legacy_mon_addr
[name
] = info
.public_addrs
.legacy_addr();
138 if (!HAVE_FEATURE(con_features
, MONENC
)) {
139 /* we keep the mon_addr map when encoding to ensure compatibility
140 * with clients and other monitors that do not yet support the 'mons'
141 * map. This map keeps its original behavior, containing a mapping of
142 * monitor id (i.e., 'foo' in 'mon.foo') to the monitor's public
143 * address -- which is obtained from the public address of each entry
149 encode_raw(fsid
, blist
);
150 encode(epoch
, blist
);
151 encode(legacy_mon_addr
, blist
, con_features
);
152 encode(last_changed
, blist
);
153 encode(created
, blist
);
157 if (!HAVE_FEATURE(con_features
, SERVER_NAUTILUS
)) {
158 ENCODE_START(5, 3, blist
);
159 encode_raw(fsid
, blist
);
160 encode(epoch
, blist
);
161 encode(legacy_mon_addr
, blist
, con_features
);
162 encode(last_changed
, blist
);
163 encode(created
, blist
);
164 encode(persistent_features
, blist
);
165 encode(optional_features
, blist
);
166 encode(mon_info
, blist
, con_features
);
167 ENCODE_FINISH(blist
);
171 ENCODE_START(7, 6, blist
);
172 encode_raw(fsid
, blist
);
173 encode(epoch
, blist
);
174 encode(last_changed
, blist
);
175 encode(created
, blist
);
176 encode(persistent_features
, blist
);
177 encode(optional_features
, blist
);
178 encode(mon_info
, blist
, con_features
);
179 encode(ranks
, blist
);
180 encode(min_mon_release
, blist
);
181 ENCODE_FINISH(blist
);
184 void MonMap::decode(bufferlist::const_iterator
& p
)
186 map
<string
,entity_addr_t
> mon_addr
;
187 DECODE_START_LEGACY_COMPAT_LEN_16(7, 3, 3, p
);
191 vector
<entity_inst_t
> mon_inst
;
193 for (unsigned i
= 0; i
< mon_inst
.size(); i
++) {
198 mon_addr
[name
] = mon_inst
[i
].addr
;
200 } else if (struct_v
< 6) {
203 decode(last_changed
, p
);
206 decode(persistent_features
, p
);
207 decode(optional_features
, p
);
210 // generate mon_info from legacy mon_addr
211 for (auto& [name
, addr
] : mon_addr
) {
212 mon_info_t
&m
= mon_info
[name
];
214 m
.public_addrs
= entity_addrvec_t(addr
);
225 decode(min_mon_release
, p
);
227 min_mon_release
= infer_ceph_release_from_mon_features(persistent_features
);
233 void MonMap::generate_test_instances(list
<MonMap
*>& o
)
235 o
.push_back(new MonMap
);
236 o
.push_back(new MonMap
);
238 o
.back()->last_changed
= utime_t(123, 456);
239 o
.back()->created
= utime_t(789, 101112);
240 o
.back()->add("one", entity_addrvec_t());
242 MonMap
*m
= new MonMap
;
245 m
->last_changed
= utime_t(123, 456);
247 entity_addrvec_t empty_addr_one
= entity_addrvec_t(entity_addr_t());
248 empty_addr_one
.v
[0].set_nonce(1);
249 m
->add("empty_addr_one", empty_addr_one
);
250 entity_addrvec_t empty_addr_two
= entity_addrvec_t(entity_addr_t());
251 empty_addr_two
.v
[0].set_nonce(2);
252 m
->add("empty_addr_two", empty_addr_two
);
254 const char *local_pub_addr_s
= "127.0.1.2";
256 const char *end_p
= local_pub_addr_s
+ strlen(local_pub_addr_s
);
257 entity_addrvec_t local_pub_addr
;
258 local_pub_addr
.parse(local_pub_addr_s
, &end_p
);
260 m
->add(mon_info_t("filled_pub_addr", entity_addrvec_t(local_pub_addr
), 1));
262 m
->add("empty_addr_zero", entity_addrvec_t());
267 // read from/write to a file
// Write this map to the file named fn.
// The map is encoded into bl with the CEPH_FEATURES_ALL feature set, and the
// value returned is whatever bl.write_file(fn) returns.
// NOTE(review): the declaration of bl is not visible in this excerpt.
268 int MonMap::write(const char *fn
)
272 encode(bl
, CEPH_FEATURES_ALL
);
274 return bl
.write_file(fn
);
// Load this map from the file named fn.
// The visible step reads the file into bl with read_file, passing the address
// of error, and stores the result in r.
// NOTE(review): the declarations of bl and error, the handling of r, and the
// decode step are not visible in this excerpt.
277 int MonMap::read(const char *fn
)
282 int r
= bl
.read_file(fn
, &error
);
// Stream a compact summary of the map to out.
// The output starts with e<epoch>: <number of mon_info entries> mons at {
// and then, for entries of mon_info, emits <name>=<public_addrs>.
// NOTE(review): the separator logic driven by has_printed and the closing of
// the brace are not visible in this excerpt.
289 void MonMap::print_summary(ostream
& out
) const
291 out
<< "e" << epoch
<< ": "
292 << mon_info
.size() << " mons at {";
293 // the map that we used to print, as it was, no longer
294 // maps strings to the monitor's public address, but to
295 // mon_info_t instead. As such, print the map in a way
296 // that keeps the expected format.
297 bool has_printed
= false;
298 for (map
<string
,mon_info_t
>::const_iterator p
= mon_info
.begin();
// key is the map key (p->first), value is the public_addrs of the entry
303 out
<< p
->first
<< "=" << p
->second
.public_addrs
;
// Stream a multi-line, human-readable listing of the map to out.
// One line each is written for epoch, fsid, last_changed, created and
// min_mon_release (numeric value followed by its ceph_release_name in
// parentheses); then ranks is iterated and each entry is written as
// <i>: <addrs> mon.<name>, with i incremented per line.
// NOTE(review): the declaration and initial value of i are not visible here.
309 void MonMap::print(ostream
& out
) const
311 out
<< "epoch " << epoch
<< "\n";
312 out
<< "fsid " << fsid
<< "\n";
313 out
<< "last_changed " << last_changed
<< "\n";
314 out
<< "created " << created
<< "\n";
315 out
<< "min_mon_release " << (int)min_mon_release
316 << " (" << ceph_release_name(min_mon_release
) << ")\n";
318 for (vector
<string
>::const_iterator p
= ranks
.begin();
// *p is a monitor name taken from ranks; its addresses come from get_addrs
321 out
<< i
++ << ": " << get_addrs(*p
) << " mon." << *p
<< "\n";
325 void MonMap::dump(Formatter
*f
) const
327 f
->dump_unsigned("epoch", epoch
);
328 f
->dump_stream("fsid") << fsid
;
329 f
->dump_stream("modified") << last_changed
;
330 f
->dump_stream("created") << created
;
331 f
->dump_unsigned("min_mon_release", min_mon_release
);
332 f
->dump_string("min_mon_release_name", ceph_release_name(min_mon_release
));
333 f
->open_object_section("features");
334 persistent_features
.dump(f
, "persistent");
335 optional_features
.dump(f
, "optional");
337 f
->open_array_section("mons");
339 for (vector
<string
>::const_iterator p
= ranks
.begin();
342 f
->open_object_section("mon");
343 f
->dump_int("rank", i
);
344 f
->dump_string("name", *p
);
345 f
->dump_object("public_addrs", get_addrs(*p
));
346 // compat: make these look like pre-nautilus entity_addr_t
347 f
->dump_stream("addr") << get_addrs(*p
).get_legacy_str();
348 f
->dump_stream("public_addr") << get_addrs(*p
).get_legacy_str();
354 // an ambiguous mon addr may be legacy or may be msgr2--we aren't sure.
355 // when that happens we need to try them both (unless we can
356 // reasonably infer from the port number which it is).
// Add a monitor whose address type and/or port may be unspecified.
// Visible behaviour:
//  - type already set (not TYPE_ANY): a zero port is replaced by a default
//    chosen from the type, then the address is added under name unless the
//    map already contains it.
//  - type TYPE_ANY: the port decides. CEPH_MON_PORT_LEGACY selects
//    TYPE_LEGACY, CEPH_MON_PORT_IANA selects TYPE_MSGR2, port 0 produces both
//    a msgr2 and a legacy variant on their default ports, and any other port
//    is tried as msgr2 and as legacy on that same port. Legacy variants that
//    accompany a msgr2 entry are added under name + -legacy.
// NOTE(review): the rest of the parameter list and several branch conditions
// are missing from this excerpt. In particular some add calls pass priority
// and others do not; the condition separating them is not visible -- confirm
// against the full source.
357 void MonMap::_add_ambiguous_addr(const string
& name
,
362 if (addr
.get_type() != entity_addr_t::TYPE_ANY
) {
363 // a v1: or v2: prefix was specified
364 if (addr
.get_port() == 0) {
// NOTE(review): the enclosing branch already requires type != TYPE_ANY, so
// this TYPE_ANY test looks unreachable unless code not visible here changes
// the type in between -- confirm.
366 if (addr
.get_type() == entity_addr_t::TYPE_ANY
) {
367 addr
.set_port(CEPH_MON_PORT_IANA
);
368 } else if (addr
.get_type() == entity_addr_t::TYPE_LEGACY
) {
369 addr
.set_port(CEPH_MON_PORT_LEGACY
);
370 } else if (addr
.get_type() == entity_addr_t::TYPE_MSGR2
) {
371 addr
.set_port(CEPH_MON_PORT_IANA
);
// skip addresses the map already contains
376 if (!contains(addr
)) {
377 add(name
, entity_addrvec_t(addr
));
380 if (!contains(addr
)) {
381 add(name
, entity_addrvec_t(addr
), priority
);
385 // no v1: or v2: prefix specified
386 if (addr
.get_port() == CEPH_MON_PORT_LEGACY
) {
387 // legacy port implies legacy addr
388 addr
.set_type(entity_addr_t::TYPE_LEGACY
);
389 if (!contains(addr
)) {
391 add(name
+ "-legacy", entity_addrvec_t(addr
));
393 add(name
, entity_addrvec_t(addr
));
396 } else if (addr
.get_port() == CEPH_MON_PORT_IANA
) {
397 // iana port implies msgr2 addr
398 addr
.set_type(entity_addr_t::TYPE_MSGR2
);
399 if (!contains(addr
)) {
400 add(name
, entity_addrvec_t(addr
));
402 } else if (addr
.get_port() == 0) {
403 // no port; include both msgr2 and legacy ports
// variant 1: two separate entries, msgr2 under name and legacy under
// name + -legacy
405 addr
.set_type(entity_addr_t::TYPE_MSGR2
);
406 addr
.set_port(CEPH_MON_PORT_IANA
);
407 if (!contains(addr
)) {
408 add(name
, entity_addrvec_t(addr
));
410 addr
.set_type(entity_addr_t::TYPE_LEGACY
);
411 addr
.set_port(CEPH_MON_PORT_LEGACY
);
412 if (!contains(addr
)) {
413 add(name
+ "-legacy", entity_addrvec_t(addr
));
// variant 2: both addresses are collected into one vector av (msgr2 first,
// then legacy). NOTE(review): the declaration of av and what is done with it
// afterwards are not visible here.
417 addr
.set_type(entity_addr_t::TYPE_MSGR2
);
418 addr
.set_port(CEPH_MON_PORT_IANA
);
419 av
.v
.push_back(addr
);
420 addr
.set_type(entity_addr_t::TYPE_LEGACY
);
421 addr
.set_port(CEPH_MON_PORT_LEGACY
);
422 av
.v
.push_back(addr
);
// remaining case: keep the given port and try it as msgr2 first
428 addr
.set_type(entity_addr_t::TYPE_MSGR2
);
429 if (!contains(addr
)) {
430 add(name
, entity_addrvec_t(addr
), priority
);
433 // try legacy on same port too
434 addr
.set_type(entity_addr_t::TYPE_LEGACY
);
435 if (!contains(addr
)) {
436 add(name
+ "-legacy", entity_addrvec_t(addr
), priority
);
443 int MonMap::init_with_ips(const std::string
& ips
,
445 const std::string
&prefix
)
447 vector
<entity_addrvec_t
> addrs
;
448 if (!parse_ip_port_vec(
450 entity_addr_t::TYPE_ANY
)) {
455 for (unsigned i
=0; i
<addrs
.size(); i
++) {
462 if (addrs
[i
].v
.size() == 1) {
463 _add_ambiguous_addr(name
, addrs
[i
].front(), 0, for_mkfs
);
465 // they specified an addrvec, so let's assume they also specified
466 // the addr *type* and *port*. (we could possibly improve this?)
467 add(name
, addrs
[i
], 0);
473 int MonMap::init_with_hosts(const std::string
& hostlist
,
475 const std::string
& prefix
)
477 // maybe they passed us a DNS-resolvable name
478 char *hosts
= resolve_addrs(hostlist
.c_str());
482 vector
<entity_addrvec_t
> addrs
;
483 bool success
= parse_ip_port_vec(
485 for_mkfs
? entity_addr_t::TYPE_MSGR2
: entity_addr_t::TYPE_ANY
);
491 for (unsigned i
=0; i
<addrs
.size(); i
++) {
495 string name
= prefix
;
497 if (addrs
[i
].v
.size() == 1) {
498 _add_ambiguous_addr(name
, addrs
[i
].front(), 0);
500 add(name
, addrs
[i
], 0);
507 void MonMap::set_initial_members(CephContext
*cct
,
508 list
<std::string
>& initial_members
,
510 const entity_addrvec_t
& my_addrs
,
511 set
<entity_addrvec_t
> *removed
)
513 // remove non-initial members
516 string n
= get_name(i
);
517 if (std::find(initial_members
.begin(), initial_members
.end(), n
)
518 != initial_members
.end()) {
519 lgeneric_dout(cct
, 1) << " keeping " << n
<< " " << get_addrs(i
) << dendl
;
524 lgeneric_dout(cct
, 1) << " removing " << get_name(i
) << " " << get_addrs(i
)
527 removed
->insert(get_addrs(i
));
530 ceph_assert(!contains(n
));
533 // add missing initial members
534 for (auto& p
: initial_members
) {
537 lgeneric_dout(cct
, 1) << " adding self " << p
<< " " << my_addrs
542 a
.set_type(entity_addr_t::TYPE_LEGACY
);
543 a
.set_family(AF_INET
);
544 for (int n
=1; ; n
++) {
549 lgeneric_dout(cct
, 1) << " adding " << p
<< " " << a
<< dendl
;
550 add(p
, entity_addrvec_t(a
));
552 ceph_assert(contains(p
));
558 int MonMap::init_with_config_file(const ConfigProxy
& conf
,
559 std::ostream
& errout
)
561 std::vector
<std::string
> sections
;
562 int ret
= conf
.get_all_sections(sections
);
564 errout
<< "Unable to find any monitors in the configuration "
565 << "file, because there was an error listing the sections. error "
569 std::vector
<std::string
> mon_names
;
570 for (const auto& section
: sections
) {
571 if (section
.substr(0, 4) == "mon." && section
.size() > 4) {
572 mon_names
.push_back(section
.substr(4));
576 // Find an address for each monitor in the config file.
577 for (const auto& mon_name
: mon_names
) {
578 std::vector
<std::string
> sections
;
579 std::string
m_name("mon");
582 sections
.push_back(m_name
);
583 sections
.push_back("mon");
584 sections
.push_back("global");
586 int res
= conf
.get_val_from_conf_file(sections
, "mon addr", val
, true);
588 errout
<< "failed to get an address for mon." << mon_name
589 << ": error " << res
<< std::endl
;
592 // the 'mon addr' field is a legacy field, so assume anything
593 // there on a weird port is a v1 address, and do not handle
596 if (!addr
.parse(val
.c_str(), nullptr, entity_addr_t::TYPE_LEGACY
)) {
597 errout
<< "unable to parse address for mon." << mon_name
598 << ": addr='" << val
<< "'" << std::endl
;
601 if (addr
.get_port() == 0) {
602 addr
.set_port(CEPH_MON_PORT_LEGACY
);
604 uint16_t priority
= 0;
605 if (!conf
.get_val_from_conf_file(sections
, "mon priority", val
, false)) {
607 priority
= std::stoul(val
);
608 } catch (std::logic_error
&) {
609 errout
<< "unable to parse priority for mon." << mon_name
610 << ": priority='" << val
<< "'" << std::endl
;
615 // make sure this mon isn't already in the map
617 remove(get_name(addr
));
618 if (contains(mon_name
))
620 _add_ambiguous_addr(mon_name
, addr
, priority
);
627 using namespace seastar
;
629 future
<> MonMap::read_monmap(const std::string
& monmap
)
631 return open_file_dma(monmap
, open_flags::ro
).then([this] (file f
) {
632 return f
.size().then([this, f
= std::move(f
)](size_t s
) {
633 return do_with(make_file_input_stream(f
), [this, s
](input_stream
<char>& in
) {
634 return in
.read_exactly(s
).then([this](temporary_buffer
<char> buf
) {
636 bl
.append(buffer::create(std::move(buf
)));
644 future
<> MonMap::init_with_dns_srv(bool for_mkfs
, const std::string
& name
)
647 string service
= name
;
648 // check if domain is also provided and extract it from srv_name
649 size_t idx
= name
.find("_");
650 if (idx
!= name
.npos
) {
651 domain
= name
.substr(idx
+ 1);
652 service
= name
.substr(0, idx
);
654 return net::dns::get_srv_records(
655 net::dns_resolver::srv_proto::tcp
,
656 service
, domain
).then([this](net::dns_resolver::srv_records records
) {
657 return parallel_for_each(records
, [this](auto record
) {
658 return net::dns::resolve_name(record
.target
).then(
659 [record
,this](net::inet_address a
) {
660 // the resolved address does not contain ceph specific info like nonce
661 // or msgr proto (legacy, msgr2), so set entity_addr_t manually
663 addr
.set_type(entity_addr_t::TYPE_ANY
);
664 addr
.set_family(int(a
.in_family()));
665 addr
.set_port(record
.port
);
666 switch (a
.in_family()) {
667 case net::inet_address::family::INET
:
668 addr
.in4_addr().sin_addr
= a
;
670 case net::inet_address::family::INET6
:
671 addr
.in6_addr().sin6_addr
= a
;
674 _add_ambiguous_addr(record
.target
, addr
, record
.priority
);
677 }).handle_exception_type([](const std::system_error
& e
) {
678 // ignore DNS failures
679 return seastar::make_ready_future
<>();
683 seastar::future
<> MonMap::build_monmap(const ceph::common::ConfigProxy
& conf
,
687 if (const auto mon_host
= conf
.get_val
<std::string
>("mon_host");
689 if (auto ret
= init_with_ips(mon_host
, for_mkfs
, "noname-"); ret
== 0) {
690 return make_ready_future
<>();
692 // TODO: resolve_addrs() is a blocking call
693 if (auto ret
= init_with_hosts(mon_host
, for_mkfs
, "noname-"); ret
== 0) {
694 return make_ready_future
<>();
696 throw std::runtime_error(cpp_strerror(ret
));
700 // What monitors are in the config file?
701 ostringstream errout
;
702 if (auto ret
= init_with_config_file(conf
, errout
); ret
< 0) {
703 throw std::runtime_error(errout
.str());
706 return make_ready_future
<>();
708 // no info found from conf options; let's try to use DNS SRV records
709 const string srv_name
= conf
.get_val
<std::string
>("mon_dns_srv_name");
710 return init_with_dns_srv(for_mkfs
, srv_name
).then([this] {
712 throw std::runtime_error("no monitors specified to connect to.");
717 future
<> MonMap::build_initial(const ceph::common::ConfigProxy
& conf
, bool for_mkfs
)
720 if (const auto monmap
= conf
.get_val
<std::string
>("monmap");
722 return read_monmap(monmap
);
725 if (const auto new_fsid
= conf
.get_val
<uuid_d
>("fsid");
726 !new_fsid
.is_zero()) {
729 return build_monmap(conf
, for_mkfs
).then([this] {
730 created
= ceph_clock_now();
731 last_changed
= created
;
737 #else // WITH_SEASTAR
// Initialise this map from the monmap file whose path is given in monmap.
// The file is loaded through read(); a buffer::error thrown by that call is
// caught. On failure a message naming the file and cpp_strerror(-r) is
// written to errout.
// NOTE(review): the declaration of r, what the catch handler assigns, the
// failure test and the return statements are not visible in this excerpt.
739 int MonMap::init_with_monmap(const std::string
& monmap
, std::ostream
& errout
)
743 r
= read(monmap
.c_str());
744 } catch (buffer::error
&) {
749 errout
<< "unable to read/decode monmap from " << monmap
750 << ": " << cpp_strerror(-r
) << std::endl
;
// Populate the map from DNS SRV records (non-seastar build).
// srv_name is taken by value because it is rewritten below: when it contains
// an underscore, the text after the first underscore becomes the domain and
// the text before it becomes the service name. The records are then resolved
// over TCP via DNSResolver, and every returned record is added through
// _add_ambiguous_addr with its type reset to TYPE_ANY and its own priority.
// NOTE(review): one parameter between srv_name and errout, the declaration of
// domain, and the return statements are not visible in this excerpt.
754 int MonMap::init_with_dns_srv(CephContext
* cct
,
755 std::string srv_name
,
757 std::ostream
& errout
)
760 // check if domain is also provided and extract it from srv_name
761 size_t idx
= srv_name
.find("_");
762 if (idx
!= string::npos
) {
763 domain
= srv_name
.substr(idx
+ 1);
764 srv_name
= srv_name
.substr(0, idx
);
// records maps a string key to the resolver Record for that host
767 map
<string
, DNSResolver::Record
> records
;
768 if (DNSResolver::get_instance()->resolve_srv_hosts(cct
, srv_name
,
769 DNSResolver::SRV_Protocol::TCP
, domain
, &records
) != 0) {
// NOTE(review): the lookup above uses srv_name, but the message below prints
// a fixed service name literal; with a non-default service name the message
// would be misleading -- consider printing srv_name instead.
771 errout
<< "unable to get monitor info from DNS SRV with service name: "
772 << "ceph-mon" << std::endl
;
775 for (auto& record
: records
) {
// the type is reset so that _add_ambiguous_addr decides between v1 and v2
776 record
.second
.addr
.set_type(entity_addr_t::TYPE_ANY
);
777 _add_ambiguous_addr(record
.first
, record
.second
.addr
,
778 record
.second
.priority
);
784 int MonMap::build_initial(CephContext
*cct
, bool for_mkfs
, ostream
& errout
)
786 const auto& conf
= cct
->_conf
;
788 if (const auto monmap
= conf
.get_val
<std::string
>("monmap");
790 return init_with_monmap(monmap
, errout
);
794 if (const auto new_fsid
= conf
.get_val
<uuid_d
>("fsid");
795 !new_fsid
.is_zero()) {
799 if (const auto mon_host
= conf
.get_val
<std::string
>("mon_host");
801 auto ret
= init_with_ips(mon_host
, for_mkfs
, "noname-");
802 if (ret
== -EINVAL
) {
803 ret
= init_with_hosts(mon_host
, for_mkfs
, "noname-");
806 errout
<< "unable to parse addrs in '" << mon_host
<< "'"
812 // What monitors are in the config file?
813 if (auto ret
= init_with_config_file(conf
, errout
); ret
< 0) {
818 // no info found from conf options; let's try to use DNS SRV records
819 string srv_name
= conf
.get_val
<std::string
>("mon_dns_srv_name");
820 if (auto ret
= init_with_dns_srv(cct
, srv_name
, for_mkfs
, errout
); ret
< 0) {
825 errout
<< "no monitors specified to connect to." << std::endl
;
828 created
= ceph_clock_now();
829 last_changed
= created
;
833 #endif // WITH_SEASTAR