]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | /* |
2 | * This file is open source software, licensed to you under the terms | |
3 | * of the Apache License, Version 2.0 (the "License"). See the NOTICE file | |
4 | * distributed with this work for additional information regarding copyright | |
5 | * ownership. You may not use this file except in compliance with the License. | |
6 | * | |
7 | * You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, | |
12 | * software distributed under the License is distributed on an | |
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | * KIND, either express or implied. See the License for the | |
15 | * specific language governing permissions and limitations | |
16 | * under the License. | |
17 | */ | |
18 | /* | |
19 | * Copyright (C) 2014 Cloudius Systems, Ltd. | |
20 | */ | |
21 | ||
22 | #include <random> | |
9f95a23c | 23 | |
f67539c2 | 24 | #include <sys/socket.h> |
9f95a23c TL |
25 | #include <linux/if.h> |
26 | #include <linux/netlink.h> | |
27 | #include <linux/rtnetlink.h> | |
28 | #include <net/route.h> | |
29 | ||
f67539c2 TL |
30 | #include <seastar/core/loop.hh> |
31 | #include <seastar/core/reactor.hh> | |
11fdf7f2 TL |
32 | #include <seastar/net/posix-stack.hh> |
33 | #include <seastar/net/net.hh> | |
34 | #include <seastar/net/packet.hh> | |
35 | #include <seastar/net/api.hh> | |
9f95a23c TL |
36 | #include <seastar/net/inet_address.hh> |
37 | #include <seastar/util/std-compat.hh> | |
11fdf7f2 TL |
38 | #include <netinet/tcp.h> |
39 | #include <netinet/sctp.h> | |
40 | ||
9f95a23c TL |
41 | namespace std { |
42 | ||
43 | template <> | |
44 | struct hash<seastar::net::posix_ap_server_socket_impl::protocol_and_socket_address> { | |
45 | size_t operator()(const seastar::net::posix_ap_server_socket_impl::protocol_and_socket_address& t_sa) const { | |
46 | auto h1 = std::hash<int>()(std::get<0>(t_sa)); | |
47 | auto h2 = std::hash<seastar::net::socket_address>()(std::get<1>(t_sa)); | |
48 | return h1 ^ h2; | |
49 | } | |
50 | }; | |
51 | ||
52 | } | |
53 | ||
f67539c2 TL |
54 | |
namespace {

// Reads a T out of the bytes at `ptr` by copying them into a local object.
//
// Applying reinterpret_cast<T*>() to a pointer the compiler knows refers to an
// object of a different type violates the strict aliasing rules. Copying the
// bytes with memcpy yields the same value without upsetting the compiler.
//
// Exactly sizeof(T) bytes are read from `ptr`.
template <typename T>
T
copy_reinterpret_cast(const void* ptr) {
    T value;
    std::memcpy(&value, ptr, sizeof(value));
    return value;
}

}
70 | ||
11fdf7f2 TL |
71 | namespace seastar { |
72 | ||
73 | namespace net { | |
74 | ||
75 | using namespace seastar; | |
76 | ||
9f95a23c TL |
77 | class posix_connected_socket_operations { |
78 | public: | |
79 | virtual ~posix_connected_socket_operations() = default; | |
80 | virtual void set_nodelay(file_desc& fd, bool nodelay) const = 0; | |
81 | virtual bool get_nodelay(file_desc& fd) const = 0; | |
82 | virtual void set_keepalive(file_desc& _fd, bool keepalive) const = 0; | |
83 | virtual bool get_keepalive(file_desc& _fd) const = 0; | |
84 | virtual void set_keepalive_parameters(file_desc& _fd, const keepalive_params& params) const = 0; | |
85 | virtual keepalive_params get_keepalive_parameters(file_desc& _fd) const = 0; | |
f67539c2 TL |
86 | virtual void set_sockopt(file_desc& _fd, int level, int optname, const void* data, size_t len) const { |
87 | _fd.setsockopt(level, optname, data, socklen_t(len)); | |
88 | } | |
89 | virtual int get_sockopt(file_desc& _fd, int level, int optname, void* data, size_t len) const { | |
90 | return _fd.getsockopt(level, optname, reinterpret_cast<char*>(data), socklen_t(len)); | |
91 | } | |
20effc67 TL |
92 | virtual socket_address local_address(file_desc& _fd) const { |
93 | return _fd.get_address(); | |
94 | } | |
9f95a23c TL |
95 | }; |
96 | ||
97 | thread_local posix_ap_server_socket_impl::sockets_map_t posix_ap_server_socket_impl::sockets{}; | |
98 | thread_local posix_ap_server_socket_impl::conn_map_t posix_ap_server_socket_impl::conn_q{}; | |
11fdf7f2 | 99 | |
9f95a23c | 100 | class posix_tcp_connected_socket_operations : public posix_connected_socket_operations { |
11fdf7f2 | 101 | public: |
9f95a23c | 102 | virtual void set_nodelay(file_desc& _fd, bool nodelay) const override { |
11fdf7f2 TL |
103 | _fd.setsockopt(IPPROTO_TCP, TCP_NODELAY, int(nodelay)); |
104 | } | |
9f95a23c | 105 | virtual bool get_nodelay(file_desc& _fd) const override { |
11fdf7f2 TL |
106 | return _fd.getsockopt<int>(IPPROTO_TCP, TCP_NODELAY); |
107 | } | |
9f95a23c | 108 | virtual void set_keepalive(file_desc& _fd, bool keepalive) const override { |
11fdf7f2 TL |
109 | _fd.setsockopt(SOL_SOCKET, SO_KEEPALIVE, int(keepalive)); |
110 | } | |
9f95a23c | 111 | virtual bool get_keepalive(file_desc& _fd) const override { |
11fdf7f2 TL |
112 | return _fd.getsockopt<int>(SOL_SOCKET, SO_KEEPALIVE); |
113 | } | |
9f95a23c | 114 | virtual void set_keepalive_parameters(file_desc& _fd, const keepalive_params& params) const override { |
f67539c2 | 115 | const tcp_keepalive_params& pms = std::get<tcp_keepalive_params>(params); |
11fdf7f2 TL |
116 | _fd.setsockopt(IPPROTO_TCP, TCP_KEEPCNT, pms.count); |
117 | _fd.setsockopt(IPPROTO_TCP, TCP_KEEPIDLE, int(pms.idle.count())); | |
118 | _fd.setsockopt(IPPROTO_TCP, TCP_KEEPINTVL, int(pms.interval.count())); | |
119 | } | |
9f95a23c | 120 | virtual keepalive_params get_keepalive_parameters(file_desc& _fd) const override { |
11fdf7f2 TL |
121 | return tcp_keepalive_params { |
122 | std::chrono::seconds(_fd.getsockopt<int>(IPPROTO_TCP, TCP_KEEPIDLE)), | |
123 | std::chrono::seconds(_fd.getsockopt<int>(IPPROTO_TCP, TCP_KEEPINTVL)), | |
124 | _fd.getsockopt<unsigned>(IPPROTO_TCP, TCP_KEEPCNT) | |
125 | }; | |
126 | } | |
127 | }; | |
128 | ||
9f95a23c | 129 | class posix_sctp_connected_socket_operations : public posix_connected_socket_operations { |
11fdf7f2 | 130 | public: |
9f95a23c | 131 | virtual void set_nodelay(file_desc& _fd, bool nodelay) const override { |
11fdf7f2 TL |
132 | _fd.setsockopt(SOL_SCTP, SCTP_NODELAY, int(nodelay)); |
133 | } | |
9f95a23c | 134 | virtual bool get_nodelay(file_desc& _fd) const override { |
11fdf7f2 TL |
135 | return _fd.getsockopt<int>(SOL_SCTP, SCTP_NODELAY); |
136 | } | |
9f95a23c | 137 | virtual void set_keepalive(file_desc& _fd, bool keepalive) const override { |
11fdf7f2 TL |
138 | auto heartbeat = _fd.getsockopt<sctp_paddrparams>(SOL_SCTP, SCTP_PEER_ADDR_PARAMS); |
139 | if (keepalive) { | |
140 | heartbeat.spp_flags |= SPP_HB_ENABLE; | |
141 | } else { | |
142 | heartbeat.spp_flags &= ~SPP_HB_ENABLE; | |
143 | } | |
144 | _fd.setsockopt(SOL_SCTP, SCTP_PEER_ADDR_PARAMS, heartbeat); | |
145 | } | |
9f95a23c | 146 | virtual bool get_keepalive(file_desc& _fd) const override { |
11fdf7f2 TL |
147 | return _fd.getsockopt<sctp_paddrparams>(SOL_SCTP, SCTP_PEER_ADDR_PARAMS).spp_flags & SPP_HB_ENABLE; |
148 | } | |
9f95a23c | 149 | virtual void set_keepalive_parameters(file_desc& _fd, const keepalive_params& kpms) const override { |
f67539c2 | 150 | const sctp_keepalive_params& pms = std::get<sctp_keepalive_params>(kpms); |
11fdf7f2 TL |
151 | auto params = _fd.getsockopt<sctp_paddrparams>(SOL_SCTP, SCTP_PEER_ADDR_PARAMS); |
152 | params.spp_hbinterval = pms.interval.count() * 1000; // in milliseconds | |
153 | params.spp_pathmaxrxt = pms.count; | |
154 | _fd.setsockopt(SOL_SCTP, SCTP_PEER_ADDR_PARAMS, params); | |
155 | } | |
9f95a23c | 156 | virtual keepalive_params get_keepalive_parameters(file_desc& _fd) const override { |
11fdf7f2 TL |
157 | auto params = _fd.getsockopt<sctp_paddrparams>(SOL_SCTP, SCTP_PEER_ADDR_PARAMS); |
158 | return sctp_keepalive_params { | |
159 | std::chrono::seconds(params.spp_hbinterval/1000), // in seconds | |
160 | params.spp_pathmaxrxt | |
161 | }; | |
162 | } | |
163 | }; | |
164 | ||
9f95a23c TL |
165 | class posix_unix_stream_connected_socket_operations : public posix_connected_socket_operations { |
166 | public: | |
167 | virtual void set_nodelay(file_desc& fd, bool nodelay) const override { | |
168 | assert(nodelay); // make sure nobody actually tries to use this non-existing functionality | |
169 | } | |
170 | virtual bool get_nodelay(file_desc& fd) const override { | |
171 | return true; | |
172 | } | |
173 | virtual void set_keepalive(file_desc& fd, bool keepalive) const override {} | |
174 | virtual bool get_keepalive(file_desc& fd) const override { | |
175 | return false; | |
176 | } | |
177 | virtual void set_keepalive_parameters(file_desc& fd, const keepalive_params& p) const override {} | |
178 | virtual keepalive_params get_keepalive_parameters(file_desc& fd) const override { | |
179 | return keepalive_params{}; | |
180 | } | |
181 | }; | |
182 | ||
183 | static const posix_connected_socket_operations* | |
184 | get_posix_connected_socket_ops(sa_family_t family, int protocol) { | |
185 | static posix_tcp_connected_socket_operations tcp_ops; | |
186 | static posix_sctp_connected_socket_operations sctp_ops; | |
187 | static posix_unix_stream_connected_socket_operations unix_ops; | |
188 | switch (family) { | |
189 | case AF_INET: | |
190 | case AF_INET6: | |
191 | switch (protocol) { | |
192 | case IPPROTO_TCP: return &tcp_ops; | |
193 | case IPPROTO_SCTP: return &sctp_ops; | |
194 | default: abort(); | |
195 | } | |
196 | case AF_UNIX: | |
197 | return &unix_ops; | |
198 | default: | |
199 | abort(); | |
200 | } | |
201 | } | |
202 | ||
1e59de90 TL |
203 | static void shutdown_socket_fd(pollable_fd& fd, int how) noexcept { |
204 | try { | |
205 | // file_desc::shutdown ignores ENOTCONN. Other reasons for exception | |
206 | // EINVAL (wrong "how") -- impossible | |
207 | // ENOTSOCK (not a socket) -- incredible | |
208 | // EBADF (invalid file descriptor) -- irretrievable | |
209 | fd.shutdown(how); | |
210 | } catch (...) { | |
211 | on_internal_error(seastar_logger, format("socket shutdown({}, {}) failed: {}", fd.get_file_desc().fdinfo(), how, std::current_exception())); | |
212 | } | |
213 | } | |
214 | ||
9f95a23c | 215 | class posix_connected_socket_impl final : public connected_socket_impl { |
f67539c2 | 216 | pollable_fd _fd; |
9f95a23c | 217 | const posix_connected_socket_operations* _ops; |
11fdf7f2 | 218 | conntrack::handle _handle; |
f67539c2 | 219 | std::pmr::polymorphic_allocator<char>* _allocator; |
11fdf7f2 | 220 | private: |
f67539c2 | 221 | explicit posix_connected_socket_impl(sa_family_t family, int protocol, pollable_fd fd, std::pmr::polymorphic_allocator<char>* allocator=memory::malloc_allocator) : |
9f95a23c | 222 | _fd(std::move(fd)), _ops(get_posix_connected_socket_ops(family, protocol)), _allocator(allocator) {} |
f67539c2 TL |
223 | explicit posix_connected_socket_impl(sa_family_t family, int protocol, pollable_fd fd, conntrack::handle&& handle, |
224 | std::pmr::polymorphic_allocator<char>* allocator=memory::malloc_allocator) : _fd(std::move(fd)) | |
9f95a23c | 225 | , _ops(get_posix_connected_socket_ops(family, protocol)), _handle(std::move(handle)), _allocator(allocator) {} |
11fdf7f2 TL |
226 | public: |
227 | virtual data_source source() override { | |
f67539c2 TL |
228 | return source(connected_socket_input_stream_config()); |
229 | } | |
230 | virtual data_source source(connected_socket_input_stream_config csisc) override { | |
231 | return data_source(std::make_unique<posix_data_source_impl>(_fd, csisc, _allocator)); | |
11fdf7f2 TL |
232 | } |
233 | virtual data_sink sink() override { | |
234 | return data_sink(std::make_unique< posix_data_sink_impl>(_fd)); | |
235 | } | |
236 | virtual void shutdown_input() override { | |
1e59de90 | 237 | shutdown_socket_fd(_fd, SHUT_RD); |
11fdf7f2 TL |
238 | } |
239 | virtual void shutdown_output() override { | |
1e59de90 | 240 | shutdown_socket_fd(_fd, SHUT_WR); |
11fdf7f2 TL |
241 | } |
242 | virtual void set_nodelay(bool nodelay) override { | |
f67539c2 | 243 | return _ops->set_nodelay(_fd.get_file_desc(), nodelay); |
11fdf7f2 TL |
244 | } |
245 | virtual bool get_nodelay() const override { | |
f67539c2 | 246 | return _ops->get_nodelay(_fd.get_file_desc()); |
11fdf7f2 TL |
247 | } |
248 | void set_keepalive(bool keepalive) override { | |
f67539c2 | 249 | return _ops->set_keepalive(_fd.get_file_desc(), keepalive); |
11fdf7f2 TL |
250 | } |
251 | bool get_keepalive() const override { | |
f67539c2 | 252 | return _ops->get_keepalive(_fd.get_file_desc()); |
11fdf7f2 TL |
253 | } |
254 | void set_keepalive_parameters(const keepalive_params& p) override { | |
f67539c2 | 255 | return _ops->set_keepalive_parameters(_fd.get_file_desc(), p); |
11fdf7f2 TL |
256 | } |
257 | keepalive_params get_keepalive_parameters() const override { | |
f67539c2 TL |
258 | return _ops->get_keepalive_parameters(_fd.get_file_desc()); |
259 | } | |
260 | void set_sockopt(int level, int optname, const void* data, size_t len) override { | |
261 | return _ops->set_sockopt(_fd.get_file_desc(), level, optname, data, len); | |
262 | } | |
263 | int get_sockopt(int level, int optname, void* data, size_t len) const override { | |
264 | return _ops->get_sockopt(_fd.get_file_desc(), level, optname, data, len); | |
11fdf7f2 | 265 | } |
20effc67 TL |
266 | socket_address local_address() const noexcept override { |
267 | return _ops->local_address(_fd.get_file_desc()); | |
268 | } | |
1e59de90 TL |
269 | future<> wait_input_shutdown() override { |
270 | return _fd.poll_rdhup(); | |
271 | } | |
20effc67 | 272 | |
9f95a23c TL |
273 | friend class posix_server_socket_impl; |
274 | friend class posix_ap_server_socket_impl; | |
275 | friend class posix_reuseport_server_socket_impl; | |
11fdf7f2 TL |
276 | friend class posix_network_stack; |
277 | friend class posix_ap_network_stack; | |
278 | friend class posix_socket_impl; | |
279 | }; | |
9f95a23c TL |
280 | |
281 | static void resolve_outgoing_address(socket_address& a) { | |
282 | if (a.family() != AF_INET6 | |
283 | || a.as_posix_sockaddr_in6().sin6_scope_id != inet_address::invalid_scope | |
284 | || !IN6_IS_ADDR_LINKLOCAL(&a.as_posix_sockaddr_in6().sin6_addr) | |
285 | ) { | |
286 | return; | |
287 | } | |
288 | ||
289 | FILE *f; | |
290 | ||
291 | if (!(f = fopen("/proc/net/ipv6_route", "r"))) { | |
292 | throw std::system_error(errno, std::system_category(), "resolve_address"); | |
293 | } | |
294 | ||
295 | auto holder = std::unique_ptr<FILE, int(*)(FILE *)>(f, &::fclose); | |
296 | ||
297 | /** | |
298 | Here all configured IPv6 routes are shown in a special format. The example displays for loopback interface only. The meaning is shown below (see net/ipv6/route.c for more). | |
299 | ||
300 | # cat /proc/net/ipv6_route | |
301 | 00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000001 00200200 lo | |
302 | +------------------------------+ ++ +------------------------------+ ++ +------------------------------+ +------+ +------+ +------+ +------+ ++ | |
303 | | | | | | | | | | | | |
304 | 1 2 3 4 5 6 7 8 9 10 | |
305 | ||
306 | 1: IPv6 destination network displayed in 32 hexadecimal chars without colons as separator | |
307 | ||
308 | 2: IPv6 destination prefix length in hexadecimal | |
309 | ||
310 | 3: IPv6 source network displayed in 32 hexadecimal chars without colons as separator | |
311 | ||
312 | 4: IPv6 source prefix length in hexadecimal | |
313 | ||
314 | 5: IPv6 next hop displayed in 32 hexadecimal chars without colons as separator | |
315 | ||
316 | 6: Metric in hexadecimal | |
317 | ||
318 | 7: Reference counter | |
319 | ||
320 | 8: Use counter | |
321 | ||
322 | 9: Flags | |
323 | ||
324 | 10: Device name | |
325 | ||
326 | */ | |
327 | ||
328 | uint32_t prefix_len, src_prefix_len; | |
329 | unsigned long flags; | |
330 | char device[16]; | |
331 | char dest_str[40]; | |
332 | ||
333 | for (;;) { | |
334 | auto n = fscanf(f, "%4s%4s%4s%4s%4s%4s%4s%4s %02x " | |
335 | "%*4s%*4s%*4s%*4s%*4s%*4s%*4s%*4s %02x " | |
336 | "%*4s%*4s%*4s%*4s%*4s%*4s%*4s%*4s " | |
337 | "%*08x %*08x %*08x %08lx %8s", | |
338 | &dest_str[0], &dest_str[5], &dest_str[10], &dest_str[15], | |
339 | &dest_str[20], &dest_str[25], &dest_str[30], &dest_str[35], | |
340 | &prefix_len, | |
341 | &src_prefix_len, | |
342 | &flags, device); | |
343 | if (n != 12) { | |
344 | break; | |
345 | } | |
346 | ||
347 | if ((prefix_len > 128) || (src_prefix_len != 0) | |
348 | || (flags & (RTF_POLICY | RTF_FLOW)) | |
349 | || ((flags & RTF_REJECT) && prefix_len == 0) /* reject all */) { | |
350 | continue; | |
351 | } | |
352 | ||
353 | dest_str[4] = dest_str[9] = dest_str[14] = dest_str[19] = dest_str[24] = dest_str[29] = dest_str[34] = ':'; | |
354 | dest_str[39] = '\0'; | |
355 | ||
356 | struct in6_addr addr; | |
357 | if (inet_pton(AF_INET6, dest_str, &addr) < 0) { | |
358 | /* not an Ipv6 address */ | |
359 | continue; | |
360 | } | |
361 | ||
362 | auto bytes = prefix_len / 8; | |
363 | auto bits = prefix_len % 8; | |
364 | ||
365 | auto& src = a.as_posix_sockaddr_in6().sin6_addr; | |
366 | ||
367 | if (bytes > 0 && memcmp(&src, &addr, bytes)) { | |
368 | continue; | |
369 | } | |
370 | if (bits > 0) { | |
371 | auto c1 = src.s6_addr[bytes]; | |
372 | auto c2 = addr.s6_addr[bytes]; | |
373 | auto mask = 0xffu << (8 - bits); | |
374 | if ((c1 & mask) != (c2 & mask)) { | |
375 | continue; | |
376 | } | |
377 | } | |
378 | ||
379 | // found the route. | |
380 | for (auto& nif : engine().net().network_interfaces()) { | |
381 | if (nif.name() == device || nif.display_name() == device) { | |
382 | a.as_posix_sockaddr_in6().sin6_scope_id = nif.index(); | |
383 | return; | |
384 | } | |
385 | } | |
386 | } | |
387 | } | |
11fdf7f2 TL |
388 | |
389 | class posix_socket_impl final : public socket_impl { | |
f67539c2 TL |
390 | pollable_fd _fd; |
391 | std::pmr::polymorphic_allocator<char>* _allocator; | |
9f95a23c | 392 | bool _reuseaddr = false; |
11fdf7f2 TL |
393 | |
394 | future<> find_port_and_connect(socket_address sa, socket_address local, transport proto = transport::TCP) { | |
395 | static thread_local std::default_random_engine random_engine{std::random_device{}()}; | |
396 | static thread_local std::uniform_int_distribution<uint16_t> u(49152/smp::count + 1, 65535/smp::count - 1); | |
9f95a23c TL |
397 | // If no explicit local address, set to dest address family wildcard. |
398 | if (local.is_unspecified()) { | |
399 | local = net::inet_address(sa.addr().in_family()); | |
400 | } | |
401 | resolve_outgoing_address(sa); | |
11fdf7f2 | 402 | return repeat([this, sa, local, proto, attempts = 0, requested_port = ntoh(local.as_posix_sockaddr_in().sin_port)] () mutable { |
9f95a23c | 403 | _fd = engine().make_pollable_fd(sa, int(proto)); |
f67539c2 TL |
404 | _fd.get_file_desc().setsockopt(SOL_SOCKET, SO_REUSEADDR, int(_reuseaddr)); |
405 | uint16_t port = attempts++ < 5 && requested_port == 0 && proto == transport::TCP ? u(random_engine) * smp::count + this_shard_id() : requested_port; | |
11fdf7f2 | 406 | local.as_posix_sockaddr_in().sin_port = hton(port); |
f67539c2 | 407 | return futurize_invoke([this, sa, local] { return engine().posix_connect(_fd, sa, local); }).then_wrapped([port, requested_port] (future<> f) { |
11fdf7f2 TL |
408 | try { |
409 | f.get(); | |
410 | return stop_iteration::yes; | |
411 | } catch (std::system_error& err) { | |
9f95a23c | 412 | if (port != requested_port && (err.code().value() == EADDRINUSE || err.code().value() == EADDRNOTAVAIL)) { |
11fdf7f2 TL |
413 | return stop_iteration::no; |
414 | } | |
415 | throw; | |
416 | } | |
417 | }); | |
418 | }); | |
419 | } | |
420 | ||
9f95a23c TL |
421 | /// an aux function to handle unix-domain-specific requests |
422 | future<connected_socket> connect_unix_domain(socket_address sa, socket_address local) { | |
423 | // note that if the 'local' address was not set by the client, it was created as an undefined address | |
424 | if (local.is_unspecified()) { | |
425 | local = socket_address{unix_domain_addr{std::string{}}}; | |
426 | } | |
427 | ||
428 | _fd = engine().make_pollable_fd(sa, 0); | |
429 | return engine().posix_connect(_fd, sa, local).then( | |
430 | [fd = _fd, allocator = _allocator](){ | |
431 | // a problem with 'private' interaction with 'unique_ptr' | |
432 | std::unique_ptr<connected_socket_impl> csi; | |
433 | csi.reset(new posix_connected_socket_impl{AF_UNIX, 0, std::move(fd), allocator}); | |
434 | return make_ready_future<connected_socket>(connected_socket(std::move(csi))); | |
435 | } | |
436 | ); | |
437 | } | |
438 | ||
11fdf7f2 | 439 | public: |
f67539c2 | 440 | explicit posix_socket_impl(std::pmr::polymorphic_allocator<char>* allocator=memory::malloc_allocator) : _allocator(allocator) {} |
11fdf7f2 TL |
441 | |
442 | virtual future<connected_socket> connect(socket_address sa, socket_address local, transport proto = transport::TCP) override { | |
9f95a23c TL |
443 | if (sa.is_af_unix()) { |
444 | return connect_unix_domain(sa, local); | |
445 | } | |
446 | return find_port_and_connect(sa, local, proto).then([this, sa, proto, allocator = _allocator] () mutable { | |
11fdf7f2 | 447 | std::unique_ptr<connected_socket_impl> csi; |
9f95a23c | 448 | csi.reset(new posix_connected_socket_impl(sa.family(), static_cast<int>(proto), _fd, allocator)); |
11fdf7f2 TL |
449 | return make_ready_future<connected_socket>(connected_socket(std::move(csi))); |
450 | }); | |
451 | } | |
452 | ||
9f95a23c TL |
453 | void set_reuseaddr(bool reuseaddr) override { |
454 | _reuseaddr = reuseaddr; | |
455 | if (_fd) { | |
f67539c2 | 456 | _fd.get_file_desc().setsockopt(SOL_SOCKET, SO_REUSEADDR, int(reuseaddr)); |
9f95a23c TL |
457 | } |
458 | } | |
459 | ||
460 | bool get_reuseaddr() const override { | |
461 | if(_fd) { | |
f67539c2 | 462 | return _fd.get_file_desc().getsockopt<int>(SOL_SOCKET, SO_REUSEADDR); |
9f95a23c TL |
463 | } else { |
464 | return _reuseaddr; | |
465 | } | |
466 | } | |
467 | ||
11fdf7f2 TL |
468 | virtual void shutdown() override { |
469 | if (_fd) { | |
470 | try { | |
f67539c2 | 471 | _fd.shutdown(SHUT_RDWR); |
11fdf7f2 TL |
472 | } catch (std::system_error& e) { |
473 | if (e.code().value() != ENOTCONN) { | |
474 | throw; | |
475 | } | |
476 | } | |
477 | } | |
478 | } | |
479 | }; | |
480 | ||
9f95a23c TL |
481 | future<accept_result> |
482 | posix_server_socket_impl::accept() { | |
483 | return _lfd.accept().then([this] (std::tuple<pollable_fd, socket_address> fd_sa) { | |
484 | auto& fd = std::get<0>(fd_sa); | |
485 | auto& sa = std::get<1>(fd_sa); | |
486 | auto cth = [this, &sa] { | |
487 | switch(_lba) { | |
488 | case server_socket::load_balancing_algorithm::connection_distribution: | |
489 | return _conntrack.get_handle(); | |
490 | case server_socket::load_balancing_algorithm::port: | |
491 | return _conntrack.get_handle(ntoh(sa.as_posix_sockaddr_in().sin_port) % smp::count); | |
492 | case server_socket::load_balancing_algorithm::fixed: | |
493 | return _conntrack.get_handle(_fixed_cpu); | |
494 | default: abort(); | |
495 | } | |
496 | } (); | |
11fdf7f2 | 497 | auto cpu = cth.cpu(); |
f67539c2 | 498 | if (cpu == this_shard_id()) { |
11fdf7f2 | 499 | std::unique_ptr<connected_socket_impl> csi( |
f67539c2 | 500 | new posix_connected_socket_impl(sa.family(), _protocol, std::move(fd), std::move(cth), _allocator)); |
9f95a23c TL |
501 | return make_ready_future<accept_result>( |
502 | accept_result{connected_socket(std::move(csi)), sa}); | |
11fdf7f2 | 503 | } else { |
9f95a23c TL |
504 | // FIXME: future is discarded |
505 | (void)smp::submit_to(cpu, [protocol = _protocol, ssa = _sa, fd = std::move(fd.get_file_desc()), sa, cth = std::move(cth), allocator = _allocator] () mutable { | |
506 | posix_ap_server_socket_impl::move_connected_socket(protocol, ssa, pollable_fd(std::move(fd)), sa, std::move(cth), allocator); | |
11fdf7f2 TL |
507 | }); |
508 | return accept(); | |
509 | } | |
510 | }); | |
511 | } | |
512 | ||
11fdf7f2 | 513 | void |
9f95a23c | 514 | posix_server_socket_impl::abort_accept() { |
1e59de90 | 515 | _lfd.shutdown(SHUT_RD, pollable_fd::shutdown_kernel_only::no); |
11fdf7f2 TL |
516 | } |
517 | ||
9f95a23c TL |
518 | socket_address posix_server_socket_impl::local_address() const { |
519 | return _lfd.get_file_desc().get_address(); | |
520 | } | |
521 | ||
522 | future<accept_result> posix_ap_server_socket_impl::accept() { | |
523 | auto t_sa = std::make_tuple(_protocol, _sa); | |
524 | auto conni = conn_q.find(t_sa); | |
11fdf7f2 TL |
525 | if (conni != conn_q.end()) { |
526 | connection c = std::move(conni->second); | |
527 | conn_q.erase(conni); | |
528 | try { | |
529 | std::unique_ptr<connected_socket_impl> csi( | |
f67539c2 | 530 | new posix_connected_socket_impl(_sa.family(), _protocol, std::move(c.fd), std::move(c.connection_tracking_handle), _allocator)); |
9f95a23c | 531 | return make_ready_future<accept_result>(accept_result{connected_socket(std::move(csi)), std::move(c.addr)}); |
11fdf7f2 | 532 | } catch (...) { |
9f95a23c | 533 | return make_exception_future<accept_result>(std::current_exception()); |
11fdf7f2 TL |
534 | } |
535 | } else { | |
536 | try { | |
9f95a23c | 537 | auto i = sockets.emplace(std::piecewise_construct, std::make_tuple(t_sa), std::make_tuple()); |
11fdf7f2 TL |
538 | assert(i.second); |
539 | return i.first->second.get_future(); | |
540 | } catch (...) { | |
9f95a23c | 541 | return make_exception_future<accept_result>(std::current_exception()); |
11fdf7f2 TL |
542 | } |
543 | } | |
544 | } | |
545 | ||
11fdf7f2 | 546 | void |
9f95a23c TL |
547 | posix_ap_server_socket_impl::abort_accept() { |
548 | auto t_sa = std::make_tuple(_protocol, _sa); | |
549 | conn_q.erase(t_sa); | |
550 | auto i = sockets.find(t_sa); | |
11fdf7f2 TL |
551 | if (i != sockets.end()) { |
552 | i->second.set_exception(std::system_error(ECONNABORTED, std::system_category())); | |
553 | sockets.erase(i); | |
554 | } | |
555 | } | |
556 | ||
9f95a23c TL |
557 | future<accept_result> |
558 | posix_reuseport_server_socket_impl::accept() { | |
559 | return _lfd.accept().then([allocator = _allocator, protocol = _protocol] (std::tuple<pollable_fd, socket_address> fd_sa) { | |
560 | auto& fd = std::get<0>(fd_sa); | |
561 | auto& sa = std::get<1>(fd_sa); | |
11fdf7f2 | 562 | std::unique_ptr<connected_socket_impl> csi( |
f67539c2 | 563 | new posix_connected_socket_impl(sa.family(), protocol, std::move(fd), allocator)); |
9f95a23c TL |
564 | return make_ready_future<accept_result>( |
565 | accept_result{connected_socket(std::move(csi)), sa}); | |
11fdf7f2 TL |
566 | }); |
567 | } | |
568 | ||
11fdf7f2 | 569 | void |
9f95a23c | 570 | posix_reuseport_server_socket_impl::abort_accept() { |
1e59de90 | 571 | _lfd.shutdown(SHUT_RD, pollable_fd::shutdown_kernel_only::no); |
11fdf7f2 TL |
572 | } |
573 | ||
9f95a23c TL |
574 | socket_address posix_reuseport_server_socket_impl::local_address() const { |
575 | return _lfd.get_file_desc().get_address(); | |
576 | } | |
577 | ||
11fdf7f2 | 578 | void |
f67539c2 | 579 | posix_ap_server_socket_impl::move_connected_socket(int protocol, socket_address sa, pollable_fd fd, socket_address addr, conntrack::handle cth, std::pmr::polymorphic_allocator<char>* allocator) { |
9f95a23c TL |
580 | auto t_sa = std::make_tuple(protocol, sa); |
581 | auto i = sockets.find(t_sa); | |
11fdf7f2 TL |
582 | if (i != sockets.end()) { |
583 | try { | |
f67539c2 | 584 | std::unique_ptr<connected_socket_impl> csi(new posix_connected_socket_impl(sa.family(), protocol, std::move(fd), std::move(cth), allocator)); |
9f95a23c | 585 | i->second.set_value(accept_result{connected_socket(std::move(csi)), std::move(addr)}); |
11fdf7f2 TL |
586 | } catch (...) { |
587 | i->second.set_exception(std::current_exception()); | |
588 | } | |
589 | sockets.erase(i); | |
590 | } else { | |
9f95a23c | 591 | conn_q.emplace(std::piecewise_construct, std::make_tuple(t_sa), std::make_tuple(std::move(fd), std::move(addr), std::move(cth))); |
11fdf7f2 TL |
592 | } |
593 | } | |
594 | ||
595 | future<temporary_buffer<char>> | |
596 | posix_data_source_impl::get() { | |
1e59de90 | 597 | return _fd.recv_some(static_cast<internal::buffer_allocator*>(this)).then([this] (temporary_buffer<char> b) { |
f67539c2 TL |
598 | if (b.size() >= _config.buffer_size) { |
599 | _config.buffer_size *= 2; | |
600 | _config.buffer_size = std::min(_config.buffer_size, _config.max_buffer_size); | |
601 | } else if (b.size() <= _config.buffer_size / 4) { | |
602 | _config.buffer_size /= 2; | |
603 | _config.buffer_size = std::max(_config.buffer_size, _config.min_buffer_size); | |
604 | } | |
605 | return b; | |
11fdf7f2 TL |
606 | }); |
607 | } | |
608 | ||
f67539c2 TL |
609 | temporary_buffer<char> |
610 | posix_data_source_impl::allocate_buffer() { | |
611 | return make_temporary_buffer<char>(_buffer_allocator, _config.buffer_size); | |
612 | } | |
613 | ||
11fdf7f2 | 614 | future<> posix_data_source_impl::close() { |
f67539c2 | 615 | _fd.shutdown(SHUT_RD); |
11fdf7f2 TL |
616 | return make_ready_future<>(); |
617 | } | |
618 | ||
619 | std::vector<struct iovec> to_iovec(const packet& p) { | |
620 | std::vector<struct iovec> v; | |
621 | v.reserve(p.nr_frags()); | |
622 | for (auto&& f : p.fragments()) { | |
623 | v.push_back({.iov_base = f.base, .iov_len = f.size}); | |
624 | } | |
625 | return v; | |
626 | } | |
627 | ||
628 | std::vector<iovec> to_iovec(std::vector<temporary_buffer<char>>& buf_vec) { | |
629 | std::vector<iovec> v; | |
630 | v.reserve(buf_vec.size()); | |
631 | for (auto& buf : buf_vec) { | |
632 | v.push_back({.iov_base = buf.get_write(), .iov_len = buf.size()}); | |
633 | } | |
634 | return v; | |
635 | } | |
636 | ||
637 | future<> | |
638 | posix_data_sink_impl::put(temporary_buffer<char> buf) { | |
f67539c2 | 639 | return _fd.write_all(buf.get(), buf.size()).then([d = buf.release()] {}); |
11fdf7f2 TL |
640 | } |
641 | ||
642 | future<> | |
643 | posix_data_sink_impl::put(packet p) { | |
644 | _p = std::move(p); | |
f67539c2 | 645 | return _fd.write_all(_p).then([this] { _p.reset(); }); |
11fdf7f2 TL |
646 | } |
647 | ||
648 | future<> | |
649 | posix_data_sink_impl::close() { | |
f67539c2 | 650 | _fd.shutdown(SHUT_WR); |
11fdf7f2 TL |
651 | return make_ready_future<>(); |
652 | } | |
653 | ||
20effc67 | 654 | posix_network_stack::posix_network_stack(const program_options::option_group& opts, std::pmr::polymorphic_allocator<char>* allocator) |
f67539c2 TL |
655 | : _reuseport(engine().posix_reuseport_available()), _allocator(allocator) { |
656 | } | |
657 | ||
11fdf7f2 TL |
658 | server_socket |
659 | posix_network_stack::listen(socket_address sa, listen_options opt) { | |
f67539c2 | 660 | using server_socket = seastar::server_socket; |
9f95a23c TL |
661 | // allow unspecified bind address -> default to ipv4 wildcard |
662 | if (sa.is_unspecified()) { | |
663 | sa = inet_address(inet_address::family::INET); | |
664 | } | |
665 | if (sa.is_af_unix()) { | |
666 | return server_socket(std::make_unique<posix_server_socket_impl>(0, sa, engine().posix_listen(sa, opt), opt.lba, opt.fixed_cpu, _allocator)); | |
11fdf7f2 | 667 | } |
9f95a23c TL |
668 | auto protocol = static_cast<int>(opt.proto); |
669 | return _reuseport ? | |
670 | server_socket(std::make_unique<posix_reuseport_server_socket_impl>(protocol, sa, engine().posix_listen(sa, opt), _allocator)) | |
671 | : | |
672 | server_socket(std::make_unique<posix_server_socket_impl>(protocol, sa, engine().posix_listen(sa, opt), opt.lba, opt.fixed_cpu, _allocator)); | |
11fdf7f2 TL |
673 | } |
674 | ||
675 | ::seastar::socket posix_network_stack::socket() { | |
9f95a23c | 676 | return ::seastar::socket(std::make_unique<posix_socket_impl>(_allocator)); |
11fdf7f2 TL |
677 | } |
678 | ||
20effc67 TL |
679 | posix_ap_network_stack::posix_ap_network_stack(const program_options::option_group& opts, std::pmr::polymorphic_allocator<char>* allocator) |
680 | : posix_network_stack(opts, allocator), _reuseport(engine().posix_reuseport_available()) { | |
f67539c2 TL |
681 | } |
682 | ||
11fdf7f2 TL |
683 | server_socket |
684 | posix_ap_network_stack::listen(socket_address sa, listen_options opt) { | |
f67539c2 | 685 | using server_socket = seastar::server_socket; |
9f95a23c TL |
686 | // allow unspecified bind address -> default to ipv4 wildcard |
687 | if (sa.is_unspecified()) { | |
688 | sa = inet_address(inet_address::family::INET); | |
689 | } | |
690 | if (sa.is_af_unix()) { | |
691 | return server_socket(std::make_unique<posix_ap_server_socket_impl>(0, sa, _allocator)); | |
11fdf7f2 | 692 | } |
9f95a23c TL |
693 | auto protocol = static_cast<int>(opt.proto); |
694 | return _reuseport ? | |
695 | server_socket(std::make_unique<posix_reuseport_server_socket_impl>(protocol, sa, engine().posix_listen(sa, opt), _allocator)) | |
696 | : | |
697 | server_socket(std::make_unique<posix_ap_server_socket_impl>(protocol, sa, _allocator)); | |
11fdf7f2 TL |
698 | } |
699 | ||
700 | struct cmsg_with_pktinfo { | |
701 | struct cmsghdrcmh; | |
9f95a23c TL |
702 | union { |
703 | struct in_pktinfo pktinfo; | |
704 | struct in6_pktinfo pkt6info; | |
705 | }; | |
11fdf7f2 TL |
706 | }; |
707 | ||
708 | class posix_udp_channel : public udp_channel_impl { | |
709 | private: | |
710 | static constexpr int MAX_DATAGRAM_SIZE = 65507; | |
711 | struct recv_ctx { | |
712 | struct msghdr _hdr; | |
713 | struct iovec _iov; | |
714 | socket_address _src_addr; | |
715 | char* _buffer; | |
716 | cmsg_with_pktinfo _cmsg; | |
717 | ||
718 | recv_ctx() { | |
719 | memset(&_hdr, 0, sizeof(_hdr)); | |
720 | _hdr.msg_iov = &_iov; | |
721 | _hdr.msg_iovlen = 1; | |
722 | _hdr.msg_name = &_src_addr.u.sa; | |
723 | _hdr.msg_namelen = sizeof(_src_addr.u.sas); | |
724 | memset(&_cmsg, 0, sizeof(_cmsg)); | |
725 | _hdr.msg_control = &_cmsg; | |
726 | _hdr.msg_controllen = sizeof(_cmsg); | |
727 | } | |
728 | ||
729 | void prepare() { | |
730 | _buffer = new char[MAX_DATAGRAM_SIZE]; | |
731 | _iov.iov_base = _buffer; | |
732 | _iov.iov_len = MAX_DATAGRAM_SIZE; | |
733 | } | |
734 | }; | |
735 | struct send_ctx { | |
736 | struct msghdr _hdr; | |
737 | std::vector<struct iovec> _iovecs; | |
738 | socket_address _dst; | |
739 | packet _p; | |
740 | ||
741 | send_ctx() { | |
742 | memset(&_hdr, 0, sizeof(_hdr)); | |
743 | _hdr.msg_name = &_dst.u.sa; | |
f67539c2 | 744 | _hdr.msg_namelen = _dst.addr_length; |
11fdf7f2 TL |
745 | } |
746 | ||
9f95a23c TL |
747 | void prepare(const socket_address& dst, packet p) { |
748 | _dst = dst; | |
f67539c2 | 749 | _hdr.msg_namelen = _dst.addr_length; |
11fdf7f2 TL |
750 | _p = std::move(p); |
751 | _iovecs = to_iovec(_p); | |
752 | _hdr.msg_iov = _iovecs.data(); | |
753 | _hdr.msg_iovlen = _iovecs.size(); | |
9f95a23c | 754 | resolve_outgoing_address(_dst); |
11fdf7f2 TL |
755 | } |
756 | }; | |
f67539c2 | 757 | pollable_fd _fd; |
9f95a23c | 758 | socket_address _address; |
11fdf7f2 TL |
759 | recv_ctx _recv; |
760 | send_ctx _send; | |
761 | bool _closed; | |
762 | public: | |
9f95a23c | 763 | posix_udp_channel(const socket_address& bind_address) |
11fdf7f2 | 764 | : _closed(false) { |
9f95a23c | 765 | auto sa = bind_address.is_unspecified() ? socket_address(inet_address(inet_address::family::INET)) : bind_address; |
11fdf7f2 TL |
766 | file_desc fd = file_desc::socket(sa.u.sa.sa_family, SOCK_DGRAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0); |
767 | fd.setsockopt(SOL_IP, IP_PKTINFO, true); | |
768 | if (engine().posix_reuseport_available()) { | |
769 | fd.setsockopt(SOL_SOCKET, SO_REUSEPORT, 1); | |
770 | } | |
771 | fd.bind(sa.u.sa, sizeof(sa.u.sas)); | |
9f95a23c | 772 | _address = fd.get_address(); |
f67539c2 | 773 | _fd = std::move(fd); |
11fdf7f2 TL |
774 | } |
775 | virtual ~posix_udp_channel() { if (!_closed) close(); }; | |
776 | virtual future<udp_datagram> receive() override; | |
9f95a23c TL |
777 | virtual future<> send(const socket_address& dst, const char *msg) override; |
778 | virtual future<> send(const socket_address& dst, packet p) override; | |
11fdf7f2 | 779 | virtual void shutdown_input() override { |
1e59de90 | 780 | _fd.shutdown(SHUT_RD, pollable_fd::shutdown_kernel_only::no); |
11fdf7f2 TL |
781 | } |
782 | virtual void shutdown_output() override { | |
1e59de90 | 783 | _fd.shutdown(SHUT_WR, pollable_fd::shutdown_kernel_only::no); |
11fdf7f2 TL |
784 | } |
785 | virtual void close() override { | |
786 | _closed = true; | |
f67539c2 | 787 | _fd = {}; |
11fdf7f2 TL |
788 | } |
789 | virtual bool is_closed() const override { return _closed; } | |
9f95a23c TL |
790 | socket_address local_address() const override { |
791 | assert(_address.u.sas.ss_family != AF_INET6 || (_address.addr_length > 20)); | |
792 | return _address; | |
793 | } | |
11fdf7f2 TL |
794 | }; |
795 | ||
9f95a23c | 796 | future<> posix_udp_channel::send(const socket_address& dst, const char *message) { |
11fdf7f2 | 797 | auto len = strlen(message); |
9f95a23c TL |
798 | auto a = dst; |
799 | resolve_outgoing_address(a); | |
f67539c2 | 800 | return _fd.sendto(a, message, len) |
11fdf7f2 TL |
801 | .then([len] (size_t size) { assert(size == len); }); |
802 | } | |
803 | ||
9f95a23c | 804 | future<> posix_udp_channel::send(const socket_address& dst, packet p) { |
11fdf7f2 TL |
805 | auto len = p.len(); |
806 | _send.prepare(dst, std::move(p)); | |
f67539c2 | 807 | return _fd.sendmsg(&_send._hdr) |
11fdf7f2 TL |
808 | .then([len] (size_t size) { assert(size == len); }); |
809 | } | |
810 | ||
811 | udp_channel | |
9f95a23c | 812 | posix_network_stack::make_udp_channel(const socket_address& addr) { |
11fdf7f2 TL |
813 | return udp_channel(std::make_unique<posix_udp_channel>(addr)); |
814 | } | |
815 | ||
9f95a23c TL |
816 | bool |
817 | posix_network_stack::supports_ipv6() const { | |
818 | static bool has_ipv6 = [] { | |
819 | try { | |
820 | posix_udp_channel c(ipv6_addr{"::1"}); | |
821 | c.close(); | |
822 | return true; | |
823 | } catch (...) {} | |
824 | return false; | |
825 | }(); | |
826 | ||
827 | return has_ipv6; | |
828 | } | |
829 | ||
11fdf7f2 TL |
830 | class posix_datagram : public udp_datagram_impl { |
831 | private: | |
9f95a23c TL |
832 | socket_address _src; |
833 | socket_address _dst; | |
11fdf7f2 TL |
834 | packet _p; |
835 | public: | |
9f95a23c TL |
836 | posix_datagram(const socket_address& src, const socket_address& dst, packet p) : _src(src), _dst(dst), _p(std::move(p)) {} |
837 | virtual socket_address get_src() override { return _src; } | |
838 | virtual socket_address get_dst() override { return _dst; } | |
839 | virtual uint16_t get_dst_port() override { return _dst.port(); } | |
11fdf7f2 TL |
840 | virtual packet& get_data() override { return _p; } |
841 | }; | |
842 | ||
843 | future<udp_datagram> | |
844 | posix_udp_channel::receive() { | |
845 | _recv.prepare(); | |
f67539c2 | 846 | return _fd.recvmsg(&_recv._hdr).then([this] (size_t size) { |
9f95a23c TL |
847 | socket_address dst; |
848 | for (auto* cmsg = CMSG_FIRSTHDR(&_recv._hdr); cmsg != nullptr; cmsg = CMSG_NXTHDR(&_recv._hdr, cmsg)) { | |
849 | if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_PKTINFO) { | |
f67539c2 | 850 | dst = ipv4_addr(copy_reinterpret_cast<in_pktinfo>(CMSG_DATA(cmsg)).ipi_addr, _address.port()); |
9f95a23c TL |
851 | break; |
852 | } else if (cmsg->cmsg_level == IPPROTO_IPV6 && cmsg->cmsg_type == IPV6_PKTINFO) { | |
f67539c2 | 853 | dst = ipv6_addr(copy_reinterpret_cast<in6_pktinfo>(CMSG_DATA(cmsg)).ipi6_addr, _address.port()); |
9f95a23c TL |
854 | break; |
855 | } | |
856 | } | |
11fdf7f2 TL |
857 | return make_ready_future<udp_datagram>(udp_datagram(std::make_unique<posix_datagram>( |
858 | _recv._src_addr, dst, packet(fragment{_recv._buffer, size}, make_deleter([buf = _recv._buffer] { delete[] buf; }))))); | |
859 | }).handle_exception([p = _recv._buffer](auto ep) { | |
860 | delete[] p; | |
861 | return make_exception_future<udp_datagram>(std::move(ep)); | |
862 | }); | |
863 | } | |
864 | ||
20effc67 TL |
865 | network_stack_entry register_posix_stack() { |
866 | return network_stack_entry{ | |
867 | "posix", std::make_unique<program_options::option_group>(nullptr, "Posix"), | |
868 | [](const program_options::option_group& ops) { | |
11fdf7f2 TL |
869 | return smp::main_thread() ? posix_network_stack::create(ops) |
870 | : posix_ap_network_stack::create(ops); | |
871 | }, | |
20effc67 | 872 | true}; |
11fdf7f2 | 873 | } |
9f95a23c TL |
874 | |
875 | // nw interface stuff | |
876 | ||
877 | std::vector<network_interface> posix_network_stack::network_interfaces() { | |
878 | class posix_network_interface_impl final : public network_interface_impl { | |
879 | public: | |
880 | uint32_t _index = 0, _mtu = 0; | |
881 | sstring _name, _display_name; | |
882 | std::vector<net::inet_address> _addresses; | |
883 | std::vector<uint8_t> _hardware_address; | |
884 | bool _loopback = false, _virtual = false, _up = false; | |
885 | ||
886 | uint32_t index() const override { | |
887 | return _index; | |
888 | } | |
889 | uint32_t mtu() const override { | |
890 | return _mtu; | |
891 | } | |
892 | const sstring& name() const override { | |
893 | return _name; | |
894 | } | |
895 | const sstring& display_name() const override { | |
896 | return _display_name.empty() ? name() : _display_name; | |
897 | } | |
898 | const std::vector<net::inet_address>& addresses() const override { | |
899 | return _addresses; | |
900 | } | |
901 | const std::vector<uint8_t> hardware_address() const override { | |
902 | return _hardware_address; | |
903 | } | |
904 | bool is_loopback() const override { | |
905 | return _loopback; | |
906 | } | |
907 | bool is_virtual() const override { | |
908 | return _virtual; | |
909 | } | |
910 | bool is_up() const override { | |
911 | // TODO: should be checked on query? | |
912 | return _up; | |
913 | } | |
914 | bool supports_ipv6() const override { | |
915 | // TODO: this is not 100% correct. | |
916 | return std::any_of(_addresses.begin(), _addresses.end(), std::mem_fn(&inet_address::is_ipv6)); | |
917 | } | |
918 | }; | |
919 | ||
920 | // For now, keep an immutable set of interfaces created on start, shared across | |
921 | // shards | |
922 | static const std::vector<posix_network_interface_impl> global_interfaces = [] { | |
923 | auto fd = ::socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); | |
924 | throw_system_error_on(fd < 0, "could not open netlink socket"); | |
925 | ||
926 | std::unique_ptr<int, void(*)(int*)> fd_guard(&fd, [](int* p) { ::close(*p); }); | |
927 | ||
928 | auto pid = ::getpid(); | |
929 | ||
930 | sockaddr_nl local = { 0, }; | |
931 | local.nl_family = AF_NETLINK; | |
932 | local.nl_pid = pid; | |
933 | local.nl_groups = RTMGRP_IPV6_IFADDR|RTMGRP_IPV4_IFADDR; | |
934 | ||
935 | throw_system_error_on(bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0, "could not bind netlink socket"); | |
936 | ||
937 | /* RTNL socket is ready for use, prepare and send requests */ | |
938 | ||
939 | std::vector<posix_network_interface_impl> res; | |
940 | ||
941 | for (auto msg : { RTM_GETLINK, RTM_GETADDR}) { | |
942 | struct nl_req { | |
943 | nlmsghdr hdr; | |
944 | union { | |
945 | rtgenmsg gen; | |
946 | ifaddrmsg addr; | |
947 | }; | |
948 | } req = { {0}, }; | |
949 | ||
950 | sockaddr_nl kernel = { 0, }; | |
951 | msghdr rtnl_msg = { 0, }; | |
952 | ||
953 | kernel.nl_family = AF_NETLINK; /* fill-in kernel address (destination of our message) */ | |
954 | ||
955 | req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtgenmsg)); | |
956 | req.hdr.nlmsg_type = msg; | |
957 | req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ROOT; | |
958 | req.hdr.nlmsg_seq = 1; | |
959 | req.hdr.nlmsg_pid = pid; | |
960 | ||
961 | if (msg == RTM_GETLINK) { | |
962 | req.gen.rtgen_family = AF_PACKET; /* no preferred AF, we will get *all* interfaces */ | |
963 | } else { | |
964 | req.addr.ifa_family = AF_UNSPEC; | |
965 | } | |
966 | ||
967 | iovec io; | |
968 | ||
969 | io.iov_base = &req; | |
970 | io.iov_len = req.hdr.nlmsg_len; | |
971 | ||
972 | rtnl_msg.msg_iov = &io; | |
973 | rtnl_msg.msg_iovlen = 1; | |
974 | rtnl_msg.msg_name = &kernel; | |
975 | rtnl_msg.msg_namelen = sizeof(kernel); | |
976 | ||
977 | throw_system_error_on(::sendmsg(fd, (struct msghdr *) &rtnl_msg, 0) < 0, "could not send netlink request"); | |
978 | /* parse reply */ | |
979 | ||
980 | constexpr size_t reply_buffer_size = 8192; | |
981 | char reply[reply_buffer_size]; | |
982 | ||
983 | bool done = false; | |
984 | ||
985 | while (!done) { | |
986 | msghdr rtnl_reply = { 0, }; | |
987 | iovec io_reply = { 0, }; | |
988 | ||
989 | io_reply.iov_base = reply; | |
990 | io_reply.iov_len = reply_buffer_size; | |
991 | rtnl_reply.msg_iov = &io_reply; | |
992 | rtnl_reply.msg_iovlen = 1; | |
993 | rtnl_reply.msg_name = &kernel; | |
994 | rtnl_reply.msg_namelen = sizeof(kernel); | |
995 | ||
996 | auto len = ::recvmsg(fd, &rtnl_reply, 0); /* read as much data as fits in the receive buffer */ | |
997 | if (len <= 0) { | |
998 | return res; | |
999 | } | |
1000 | ||
1001 | for (auto* msg_ptr = (struct nlmsghdr *) reply; NLMSG_OK(msg_ptr, len); msg_ptr = NLMSG_NEXT(msg_ptr, len)) { | |
1002 | switch(msg_ptr->nlmsg_type) { | |
1003 | case NLMSG_DONE: // that is all | |
1004 | done = true; | |
1005 | break; | |
1006 | case RTM_NEWLINK: | |
1007 | { | |
1008 | auto* iface = reinterpret_cast<const ifinfomsg*>(NLMSG_DATA(msg_ptr)); | |
1009 | auto ilen = msg_ptr->nlmsg_len - NLMSG_LENGTH(sizeof(ifinfomsg)); | |
1010 | ||
1011 | // todo: filter any non-network interfaces (family) | |
1012 | ||
1013 | posix_network_interface_impl nwif; | |
1014 | ||
1015 | nwif._index = iface->ifi_index; | |
1016 | nwif._loopback = (iface->ifi_flags & IFF_LOOPBACK) != 0; | |
1017 | nwif._up = (iface->ifi_flags & IFF_UP) != 0; | |
1018 | #if defined(IFF_802_1Q_VLAN) && defined(IFF_EBRIDGE) && defined(IFF_SLAVE_INACTIVE) | |
1019 | nwif._virtual = (iface->ifi_flags & (IFF_802_1Q_VLAN|IFF_EBRIDGE|IFF_SLAVE_INACTIVE)) != 0; | |
1020 | #endif | |
1021 | for (auto* attribute = IFLA_RTA(iface); RTA_OK(attribute, ilen); attribute = RTA_NEXT(attribute, ilen)) { | |
1022 | switch(attribute->rta_type) { | |
1023 | case IFLA_IFNAME: | |
1024 | nwif._name = reinterpret_cast<const char *>(RTA_DATA(attribute)); | |
1025 | break; | |
1026 | case IFLA_MTU: | |
1027 | nwif._mtu = *reinterpret_cast<const uint32_t *>(RTA_DATA(attribute)); | |
1028 | break; | |
1029 | case IFLA_ADDRESS: | |
1030 | nwif._hardware_address.assign(reinterpret_cast<const uint8_t *>(RTA_DATA(attribute)), reinterpret_cast<const uint8_t *>(RTA_DATA(attribute)) + RTA_PAYLOAD(attribute)); | |
1031 | break; | |
1032 | default: | |
1033 | break; | |
1034 | } | |
1035 | } | |
1036 | ||
1037 | res.emplace_back(std::move(nwif)); | |
1038 | ||
1039 | break; | |
1040 | } | |
1041 | case RTM_NEWADDR: | |
1042 | { | |
1043 | auto* addr = reinterpret_cast<const ifaddrmsg*>(NLMSG_DATA(msg_ptr)); | |
1044 | auto ilen = msg_ptr->nlmsg_len - NLMSG_LENGTH(sizeof(ifaddrmsg)); | |
1045 | ||
1046 | for (auto& nwif : res) { | |
1047 | if (nwif._index == addr->ifa_index) { | |
1048 | for (auto* attribute = IFA_RTA(addr); RTA_OK(attribute, ilen); attribute = RTA_NEXT(attribute, ilen)) { | |
f67539c2 | 1049 | std::optional<inet_address> ia; |
9f95a23c TL |
1050 | |
1051 | switch(attribute->rta_type) { | |
1052 | case IFA_LOCAL: | |
1053 | case IFA_ADDRESS: // ipv6 addresses are reported only as "ADDRESS" | |
1054 | ||
1055 | if (RTA_PAYLOAD(attribute) == sizeof(::in_addr)) { | |
1056 | ia.emplace(*reinterpret_cast<const ::in_addr *>(RTA_DATA(attribute))); | |
1057 | } else if (RTA_PAYLOAD(attribute) == sizeof(::in6_addr)) { | |
1058 | ia.emplace(*reinterpret_cast<const ::in6_addr *>(RTA_DATA(attribute)), nwif.index()); | |
1059 | } | |
1060 | ||
1061 | if (ia && std::find(nwif._addresses.begin(), nwif._addresses.end(), *ia) == nwif._addresses.end()) { | |
1062 | nwif._addresses.emplace_back(*ia); | |
1063 | } | |
1064 | ||
1065 | break; | |
1066 | default: | |
1067 | break; | |
1068 | } | |
1069 | } | |
1070 | ||
1071 | break; | |
1072 | } | |
1073 | } | |
1074 | } | |
1075 | default: | |
1076 | break; | |
1077 | } | |
1078 | } | |
1079 | } | |
1080 | } | |
1081 | ||
1082 | return res; | |
1083 | }(); | |
1084 | ||
1085 | // And a similarly immutable set of shared_ptr to network_interface_impl per shard, ready | |
1086 | // to be handed out to callers with minimal overhead | |
1087 | static const thread_local std::vector<shared_ptr<posix_network_interface_impl>> thread_local_interfaces = [] { | |
1088 | std::vector<shared_ptr<posix_network_interface_impl>> res; | |
1089 | res.reserve(global_interfaces.size()); | |
1090 | std::transform(global_interfaces.begin(), global_interfaces.end(), std::back_inserter(res), [](const posix_network_interface_impl& impl) { | |
1091 | return make_shared<posix_network_interface_impl>(impl); | |
1092 | }); | |
1093 | return res; | |
1094 | }(); | |
1095 | ||
1096 | return std::vector<network_interface>(thread_local_interfaces.begin(), thread_local_interfaces.end()); | |
1097 | } | |
1098 | ||
11fdf7f2 TL |
1099 | } |
1100 | ||
1101 | } |