1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2012 Inktank
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 */
15 #include "common/pick_address.h"
18 #include <netinet/in.h>
23 #include <boost/algorithm/string/predicate.hpp>
24 #include <fmt/format.h>
26 #include "include/ipaddr.h"
27 #include "include/str_list.h"
28 #include "common/ceph_context.h"
30 #include "common/config.h"
31 #include "common/config_obs.h"
33 #include "common/debug.h"
34 #include "common/errno.h"
35 #include "common/numa.h"
37 #ifndef HAVE_IN_ADDR_T
38 typedef uint32_t in_addr_t
;
41 #ifndef IN_LOOPBACKNET
42 #define IN_LOOPBACKNET 127
45 #define dout_subsys ceph_subsys_
// Report whether an interface entry is named exactly `if_name`.
//
// @param ifa      interface entry; its ifa_name is read as a C string
// @param if_name  interface name to compare against
// @return true when ifa.ifa_name and if_name are equal, character for
//         character (no prefix or alias matching is performed here)
bool matches_with_name(const ifaddrs& ifa, const std::string& if_name)
{
  return if_name == ifa.ifa_name;
}
57 static int is_loopback_addr(sockaddr
* addr
)
59 if (addr
->sa_family
== AF_INET
) {
60 const sockaddr_in
* sin
= (struct sockaddr_in
*)(addr
);
61 const in_addr_t net
= ntohl(sin
->sin_addr
.s_addr
) >> IN_CLASSA_NSHIFT
;
62 return net
== IN_LOOPBACKNET
? 1 : 0;
63 } else if (addr
->sa_family
== AF_INET6
) {
64 sockaddr_in6
* sin6
= (struct sockaddr_in6
*)(addr
);
65 return IN6_IS_ADDR_LOOPBACK(&sin6
->sin6_addr
) ? 1 : 0;
71 static int grade_addr(const ifaddrs
& ifa
)
73 if (ifa
.ifa_addr
== nullptr) {
77 if (ifa
.ifa_flags
& IFF_UP
) {
80 switch (is_loopback_addr(ifa
.ifa_addr
)) {
82 // prefer non-loopback addresses
95 bool matches_with_net(const ifaddrs
& ifa
,
97 unsigned int prefix_len
,
100 switch (net
->sa_family
) {
102 if (ipv
& CEPH_PICK_ADDRESS_IPV4
) {
103 return matches_ipv4_in_subnet(ifa
, (struct sockaddr_in
*)net
, prefix_len
);
107 if (ipv
& CEPH_PICK_ADDRESS_IPV6
) {
108 return matches_ipv6_in_subnet(ifa
, (struct sockaddr_in6
*)net
, prefix_len
);
115 bool matches_with_net(CephContext
*cct
,
117 const std::string
& s
,
120 struct sockaddr_storage net
;
121 unsigned int prefix_len
;
122 if (!parse_network(s
.c_str(), &net
, &prefix_len
)) {
123 lderr(cct
) << "unable to parse network: " << s
<< dendl
;
126 return matches_with_net(ifa
, (sockaddr
*)&net
, prefix_len
, ipv
);
129 int grade_with_numa_node(const ifaddrs
& ifa
, int numa_node
)
131 #if defined(WITH_SEASTAR) || defined(_WIN32)
138 int r
= get_iface_numa_node(ifa
.ifa_name
, &if_node
);
142 return if_node
== numa_node
? 1 : 0;
147 const struct sockaddr
*find_ip_in_subnet_list(
149 const struct ifaddrs
*ifa
,
151 const std::string
&networks
,
152 const std::string
&interfaces
,
155 const auto ifs
= get_str_list(interfaces
);
156 const auto nets
= get_str_list(networks
);
157 if (!ifs
.empty() && nets
.empty()) {
158 lderr(cct
) << "interface names specified but not network names" << dendl
;
163 const sockaddr
* best_addr
= nullptr;
164 for (const auto* addr
= ifa
; addr
!= nullptr; addr
= addr
->ifa_next
) {
166 std::none_of(std::begin(ifs
), std::end(ifs
),
167 [&](const auto& if_name
) {
168 return matches_with_name(*addr
, if_name
);
173 std::none_of(std::begin(nets
), std::end(nets
),
174 [&](const auto& net
) {
175 return matches_with_net(cct
, *addr
, net
, ipv
);
179 int score
= grade_addr(*addr
);
183 score
+= grade_with_numa_node(*addr
, numa_node
);
184 if (score
> best_score
) {
186 best_addr
= addr
->ifa_addr
;
193 // observe this change
194 struct Observer
: public md_config_obs_t
{
196 explicit Observer(const char *c
) {
201 const char** get_tracked_conf_keys() const override
{
202 return (const char **)keys
;
204 void handle_conf_change(const ConfigProxy
& conf
,
205 const std::set
<std::string
> &changed
) override
{
210 static void fill_in_one_address(CephContext
*cct
,
211 const struct ifaddrs
*ifa
,
212 const string
&networks
,
213 const string
&interfaces
,
214 const char *conf_var
,
217 const struct sockaddr
*found
= find_ip_in_subnet_list(
220 CEPH_PICK_ADDRESS_IPV4
|CEPH_PICK_ADDRESS_IPV6
,
225 lderr(cct
) << "unable to find any IP address in networks '" << networks
226 << "' interfaces '" << interfaces
<< "'" << dendl
;
230 char buf
[INET6_ADDRSTRLEN
];
233 err
= getnameinfo(found
,
234 (found
->sa_family
== AF_INET
)
235 ? sizeof(struct sockaddr_in
)
236 : sizeof(struct sockaddr_in6
),
242 lderr(cct
) << "unable to convert chosen address to string: " << gai_strerror(err
) << dendl
;
246 Observer
obs(conf_var
);
248 cct
->_conf
.add_observer(&obs
);
250 cct
->_conf
.set_val_or_die(conf_var
, buf
);
251 cct
->_conf
.apply_changes(nullptr);
253 cct
->_conf
.remove_observer(&obs
);
256 void pick_addresses(CephContext
*cct
, int needs
)
258 auto public_addr
= cct
->_conf
.get_val
<entity_addr_t
>("public_addr");
259 auto public_network
= cct
->_conf
.get_val
<std::string
>("public_network");
260 auto public_network_interface
=
261 cct
->_conf
.get_val
<std::string
>("public_network_interface");
262 auto cluster_addr
= cct
->_conf
.get_val
<entity_addr_t
>("cluster_addr");
263 auto cluster_network
= cct
->_conf
.get_val
<std::string
>("cluster_network");
264 auto cluster_network_interface
=
265 cct
->_conf
.get_val
<std::string
>("cluster_network_interface");
268 int r
= getifaddrs(&ifa
);
270 string err
= cpp_strerror(errno
);
271 lderr(cct
) << "unable to fetch interfaces and addresses: " << err
<< dendl
;
274 auto free_ifa
= make_scope_guard([ifa
] { freeifaddrs(ifa
); });
275 if ((needs
& CEPH_PICK_ADDRESS_PUBLIC
) &&
276 public_addr
.is_blank_ip() && !public_network
.empty()) {
277 fill_in_one_address(cct
, ifa
, public_network
, public_network_interface
,
281 if ((needs
& CEPH_PICK_ADDRESS_CLUSTER
) && cluster_addr
.is_blank_ip()) {
282 if (!cluster_network
.empty()) {
283 fill_in_one_address(cct
, ifa
, cluster_network
, cluster_network_interface
,
286 if (!public_network
.empty()) {
287 lderr(cct
) << "Public network was set, but cluster network was not set " << dendl
;
288 lderr(cct
) << " Using public network also for cluster network" << dendl
;
289 fill_in_one_address(cct
, ifa
, public_network
, public_network_interface
,
295 #endif // !WITH_SEASTAR
297 static std::optional
<entity_addr_t
> get_one_address(
299 const struct ifaddrs
*ifa
,
301 const string
&networks
,
302 const string
&interfaces
,
305 const struct sockaddr
*found
= find_ip_in_subnet_list(cct
, ifa
, ipv
,
310 std::string_view ip_type
;
311 if ((ipv
& CEPH_PICK_ADDRESS_IPV4
) && (ipv
& CEPH_PICK_ADDRESS_IPV6
)) {
312 ip_type
= "IPv4 or IPv6";
313 } else if (ipv
& CEPH_PICK_ADDRESS_IPV4
) {
318 lderr(cct
) << "unable to find any " << ip_type
<< " address in networks '"
319 << networks
<< "' interfaces '" << interfaces
<< "'" << dendl
;
323 char buf
[INET6_ADDRSTRLEN
];
326 err
= getnameinfo(found
,
327 (found
->sa_family
== AF_INET
)
328 ? sizeof(struct sockaddr_in
)
329 : sizeof(struct sockaddr_in6
),
335 lderr(cct
) << "unable to convert chosen address to string: " << gai_strerror(err
) << dendl
;
340 if (addr
.parse(buf
)) {
351 entity_addrvec_t
*addrs
,
352 int preferred_numa_node
)
356 unsigned addrt
= (flags
& (CEPH_PICK_ADDRESS_PUBLIC
|
357 CEPH_PICK_ADDRESS_CLUSTER
));
359 addrt
== (CEPH_PICK_ADDRESS_PUBLIC
|
360 CEPH_PICK_ADDRESS_CLUSTER
)) {
363 unsigned msgrv
= flags
& (CEPH_PICK_ADDRESS_MSGR1
|
364 CEPH_PICK_ADDRESS_MSGR2
);
366 if (cct
->_conf
.get_val
<bool>("ms_bind_msgr1")) {
367 msgrv
|= CEPH_PICK_ADDRESS_MSGR1
;
369 if (cct
->_conf
.get_val
<bool>("ms_bind_msgr2")) {
370 msgrv
|= CEPH_PICK_ADDRESS_MSGR2
;
376 unsigned ipv
= flags
& (CEPH_PICK_ADDRESS_IPV4
|
377 CEPH_PICK_ADDRESS_IPV6
);
379 if (cct
->_conf
.get_val
<bool>("ms_bind_ipv4")) {
380 ipv
|= CEPH_PICK_ADDRESS_IPV4
;
382 if (cct
->_conf
.get_val
<bool>("ms_bind_ipv6")) {
383 ipv
|= CEPH_PICK_ADDRESS_IPV6
;
388 if (cct
->_conf
.get_val
<bool>("ms_bind_prefer_ipv4")) {
389 flags
|= CEPH_PICK_ADDRESS_PREFER_IPV4
;
391 flags
&= ~CEPH_PICK_ADDRESS_PREFER_IPV4
;
398 if (addrt
& CEPH_PICK_ADDRESS_PUBLIC
) {
399 addr
= cct
->_conf
.get_val
<entity_addr_t
>("public_addr");
400 networks
= cct
->_conf
.get_val
<std::string
>("public_network");
402 cct
->_conf
.get_val
<std::string
>("public_network_interface");
404 addr
= cct
->_conf
.get_val
<entity_addr_t
>("cluster_addr");
405 networks
= cct
->_conf
.get_val
<std::string
>("cluster_network");
407 cct
->_conf
.get_val
<std::string
>("cluster_network_interface");
408 if (networks
.empty()) {
409 lderr(cct
) << "Falling back to public interface" << dendl
;
410 // fall back to public_ network and interface if cluster is not set
411 networks
= cct
->_conf
.get_val
<std::string
>("public_network");
413 cct
->_conf
.get_val
<std::string
>("public_network_interface");
416 if (addr
.is_blank_ip() &&
418 // note: pass in ipv to filter the matching addresses
419 for (auto pick_mask
: {CEPH_PICK_ADDRESS_IPV4
, CEPH_PICK_ADDRESS_IPV6
}) {
420 if (ipv
& pick_mask
) {
421 auto ip_addr
= get_one_address(cct
, ifa
, pick_mask
,
422 networks
, interfaces
,
423 preferred_numa_node
);
425 addrs
->v
.push_back(*ip_addr
);
427 // picked but not found
434 // note: we may have a blank addr here
437 if (addrs
->v
.empty()) {
438 addr
.set_type(entity_addr_t::TYPE_MSGR2
);
439 for (auto pick_mask
: {CEPH_PICK_ADDRESS_IPV4
, CEPH_PICK_ADDRESS_IPV6
}) {
440 if (ipv
& pick_mask
) {
441 addr
.set_family(pick_mask
== CEPH_PICK_ADDRESS_IPV4
? AF_INET
: AF_INET6
);
442 addrs
->v
.push_back(addr
);
447 std::sort(addrs
->v
.begin(), addrs
->v
.end(),
448 [flags
] (entity_addr_t
& lhs
, entity_addr_t
& rhs
) {
449 if (flags
& CEPH_PICK_ADDRESS_PREFER_IPV4
) {
450 return lhs
.is_ipv4() && rhs
.is_ipv6();
452 return lhs
.is_ipv6() && rhs
.is_ipv4();
456 // msgr2 or legacy or both?
457 if (msgrv
== (CEPH_PICK_ADDRESS_MSGR1
| CEPH_PICK_ADDRESS_MSGR2
)) {
458 vector
<entity_addr_t
> v
;
461 a
.set_type(entity_addr_t::TYPE_MSGR2
);
462 if (flags
& CEPH_PICK_ADDRESS_DEFAULT_MON_PORTS
) {
463 a
.set_port(CEPH_MON_PORT_IANA
);
465 addrs
->v
.push_back(a
);
466 a
.set_type(entity_addr_t::TYPE_LEGACY
);
467 if (flags
& CEPH_PICK_ADDRESS_DEFAULT_MON_PORTS
) {
468 a
.set_port(CEPH_MON_PORT_LEGACY
);
470 addrs
->v
.push_back(a
);
472 } else if (msgrv
== CEPH_PICK_ADDRESS_MSGR1
) {
473 for (auto& a
: addrs
->v
) {
474 a
.set_type(entity_addr_t::TYPE_LEGACY
);
477 for (auto& a
: addrs
->v
) {
478 a
.set_type(entity_addr_t::TYPE_MSGR2
);
488 entity_addrvec_t
*addrs
,
489 int preferred_numa_node
)
492 int r
= getifaddrs(&ifa
);
495 string err
= cpp_strerror(r
);
496 lderr(cct
) << "unable to fetch interfaces and addresses: "
497 << cpp_strerror(r
) << dendl
;
500 r
= pick_addresses(cct
, flags
, ifa
, addrs
, preferred_numa_node
);
505 std::string
pick_iface(CephContext
*cct
, const struct sockaddr_storage
&network
)
508 int r
= getifaddrs(&ifa
);
510 string err
= cpp_strerror(errno
);
511 lderr(cct
) << "unable to fetch interfaces and addresses: " << err
<< dendl
;
514 auto free_ifa
= make_scope_guard([ifa
] { freeifaddrs(ifa
); });
515 const unsigned int prefix_len
= std::max(sizeof(in_addr::s_addr
), sizeof(in6_addr::s6_addr
)) * CHAR_BIT
;
516 for (auto addr
= ifa
; addr
!= nullptr; addr
= addr
->ifa_next
) {
517 if (matches_with_net(*ifa
, (const struct sockaddr
*) &network
, prefix_len
,
518 CEPH_PICK_ADDRESS_IPV4
| CEPH_PICK_ADDRESS_IPV6
)) {
519 return addr
->ifa_name
;
526 bool have_local_addr(CephContext
*cct
, const std::list
<entity_addr_t
>& ls
, entity_addr_t
*match
)
529 int r
= getifaddrs(&ifa
);
531 lderr(cct
) << "unable to fetch interfaces and addresses: " << cpp_strerror(errno
) << dendl
;
534 auto free_ifa
= make_scope_guard([ifa
] { freeifaddrs(ifa
); });
536 for (struct ifaddrs
*addrs
= ifa
; addrs
!= nullptr; addrs
= addrs
->ifa_next
) {
537 if (addrs
->ifa_addr
) {
539 a
.set_sockaddr(addrs
->ifa_addr
);
541 if (a
.is_same_host(p
)) {
551 int get_iface_numa_node(
552 const std::string
& iface
,
558 } ifatype
= iface_t::PHY_PORT
;
559 std::string_view ifa
{iface
};
560 if (auto pos
= ifa
.find(":"); pos
!= ifa
.npos
) {
561 ifa
.remove_suffix(ifa
.size() - pos
);
563 string fn
= fmt::format("/sys/class/net/{}/device/numa_node", ifa
);
564 int fd
= ::open(fn
.c_str(), O_RDONLY
);
566 fn
= fmt::format("/sys/class/net/{}/bonding/slaves", ifa
);
567 fd
= ::open(fn
.c_str(), O_RDONLY
);
571 ifatype
= iface_t::BOND_PORT
;
577 r
= safe_read(fd
, &buf
, sizeof(buf
));
582 while (r
> 0 && ::isspace(buf
[--r
])) {
587 case iface_t::PHY_PORT
:
588 *node
= strtoll(buf
, &endptr
, 10);
589 if (endptr
!= buf
+ strlen(buf
)) {
595 case iface_t::BOND_PORT
:
597 std::vector
<std::string
> sv
;
598 std::string ifacestr
= buf
;
599 get_str_vec(ifacestr
, " ", sv
);
600 for (auto& iter
: sv
) {
602 r
= get_iface_numa_node(iter
, &bn
);
604 if (bond_node
== -1 || bn
== bond_node
) {