]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | /* |
2 | * This file is open source software, licensed to you under the terms | |
3 | * of the Apache License, Version 2.0 (the "License"). See the NOTICE file | |
4 | * distributed with this work for additional information regarding copyright | |
5 | * ownership. You may not use this file except in compliance with the License. | |
6 | * | |
7 | * You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, | |
12 | * software distributed under the License is distributed on an | |
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | * KIND, either express or implied. See the License for the | |
15 | * specific language governing permissions and limitations | |
16 | * under the License. | |
17 | */ | |
18 | /* | |
19 | * Copyright (C) 2014 Cloudius Systems, Ltd. | |
20 | */ | |
21 | ||
22 | #pragma once | |
23 | ||
24 | #include <memory> | |
25 | #include <vector> | |
26 | #include <cstring> | |
27 | #include <seastar/core/future.hh> | |
28 | #include <seastar/net/byteorder.hh> | |
29 | #include <seastar/net/socket_defs.hh> | |
30 | #include <seastar/net/packet.hh> | |
11fdf7f2 TL |
31 | #include <seastar/core/temporary_buffer.hh> |
32 | #include <seastar/core/iostream.hh> | |
33 | #include <seastar/util/std-compat.hh> | |
20effc67 | 34 | #include <seastar/util/program-options.hh> |
9f95a23c | 35 | #include "../core/internal/api-level.hh" |
11fdf7f2 TL |
36 | #include <sys/types.h> |
37 | ||
38 | namespace seastar { | |
39 | ||
/// Free-function form of ipv4_addr::is_ip_unspecified(): forwards to the member and returns its result. | |
9f95a23c | 40 | inline |
f67539c2 | 41 | bool is_ip_unspecified(const ipv4_addr& addr) noexcept { |
9f95a23c | 42 | return addr.is_ip_unspecified(); |
11fdf7f2 TL |
43 | } |
44 | ||
/// Free-function form of ipv4_addr::is_port_unspecified(): forwards to the member and returns its result. | |
9f95a23c | 45 | inline |
f67539c2 | 46 | bool is_port_unspecified(const ipv4_addr& addr) noexcept { |
9f95a23c | 47 | return addr.is_port_unspecified(); |
11fdf7f2 TL |
48 | } |
49 | ||
/// Builds a socket_address from an ipv4_addr by calling the socket_address(addr) constructor. | |
9f95a23c | 50 | inline |
f67539c2 | 51 | socket_address make_ipv4_address(const ipv4_addr& addr) noexcept { |
9f95a23c | 52 | return socket_address(addr); |
11fdf7f2 TL |
53 | } |
54 | ||
/// Convenience overload: constructs ipv4_addr(ip, port) and forwards to make_ipv4_address(const ipv4_addr&). | |
/// NOTE(review): the byte order expected for \p ip and \p port is not visible here - confirm against ipv4_addr. | |
55 | inline | |
f67539c2 | 56 | socket_address make_ipv4_address(uint32_t ip, uint16_t port) noexcept { |
9f95a23c | 57 | return make_ipv4_address(ipv4_addr(ip, port)); |
11fdf7f2 TL |
58 | } |
59 | ||
60 | namespace net { | |
61 | ||
62 | // TCP keepalive settings; the option each field corresponds to is named beside it (see linux tcp(7) for parameter explanation) | |
63 | struct tcp_keepalive_params { | |
64 | std::chrono::seconds idle; // TCP_KEEPIDLE, expressed in whole seconds | |
65 | std::chrono::seconds interval; // TCP_KEEPINTVL, expressed in whole seconds | |
66 | unsigned count; // TCP_KEEPCNT | |
67 | }; | |
68 | ||
69 | // SCTP keepalive settings; the field each member corresponds to is named beside it (see linux sctp(7) for parameter explanation) | |
70 | struct sctp_keepalive_params { | |
71 | std::chrono::seconds interval; // spp_hbinterval, expressed in whole seconds | |
72 | unsigned count; // spp_pathmaxrt | |
73 | }; | |
74 | ||
/// Keepalive settings for one transport: holds either a tcp_keepalive_params or an sctp_keepalive_params. | |
f67539c2 | 75 | using keepalive_params = std::variant<tcp_keepalive_params, sctp_keepalive_params>; |
11fdf7f2 TL |
76 | |
77 | /// \cond internal | |
78 | class connected_socket_impl; | |
79 | class socket_impl; | |
9f95a23c | 80 | |
f67539c2 | 81 | class server_socket_impl; |
11fdf7f2 TL |
82 | class udp_channel_impl; |
83 | class get_impl; | |
84 | /// \endcond | |
85 | ||
/// Abstract interface behind udp_datagram: source address, destination address, destination port | |
/// and payload accessors, all pure virtual. | |
86 | class udp_datagram_impl { | |
87 | public: | |
88 | virtual ~udp_datagram_impl() {}; | |
9f95a23c TL |
89 | virtual socket_address get_src() = 0; |
90 | virtual socket_address get_dst() = 0; | |
11fdf7f2 TL |
91 | virtual uint16_t get_dst_port() = 0; |
92 | virtual packet& get_data() = 0; | |
93 | }; | |
94 | ||
/// Owning wrapper around a udp_datagram_impl; each accessor forwards to the implementation object. | |
/// The accessors dereference the stored pointer without a null check. | |
95 | class udp_datagram final { | |
96 | private: | |
97 | std::unique_ptr<udp_datagram_impl> _impl; | |
98 | public: | |
/// Takes ownership of \p impl by moving it into the wrapper. | |
f67539c2 | 99 | udp_datagram(std::unique_ptr<udp_datagram_impl>&& impl) noexcept : _impl(std::move(impl)) {}; |
9f95a23c TL |
100 | socket_address get_src() { return _impl->get_src(); } |
101 | socket_address get_dst() { return _impl->get_dst(); } | |
11fdf7f2 TL |
102 | uint16_t get_dst_port() { return _impl->get_dst_port(); } |
103 | packet& get_data() { return _impl->get_data(); } | |
104 | }; | |
105 | ||
/// Handle that owns a udp_channel_impl through a std::unique_ptr and declares move construction | |
/// and move assignment. | |
106 | class udp_channel { | |
107 | private: | |
108 | std::unique_ptr<udp_channel_impl> _impl; | |
109 | public: | |
f67539c2 TL |
110 | udp_channel() noexcept; |
111 | udp_channel(std::unique_ptr<udp_channel_impl>) noexcept; | |
11fdf7f2 TL |
112 | ~udp_channel(); |
113 | ||
f67539c2 TL |
114 | udp_channel(udp_channel&&) noexcept; |
115 | udp_channel& operator=(udp_channel&&) noexcept; | |
11fdf7f2 | 116 |
9f95a23c TL |
117 | socket_address local_address() const; |
118 | ||
/// \return a future that resolves to a udp_datagram | |
11fdf7f2 | 119 | future<udp_datagram> receive(); |
9f95a23c TL |
120 | future<> send(const socket_address& dst, const char* msg); |
121 | future<> send(const socket_address& dst, packet p); | |
11fdf7f2 TL |
122 | bool is_closed() const; |
123 | /// Causes a pending receive() to complete (possibly with an exception) | |
124 | void shutdown_input(); | |
125 | /// Causes a pending send() to complete (possibly with an exception) | |
126 | void shutdown_output(); | |
127 | /// Closes the channel and releases all resources. | |
128 | /// | |
129 | /// Must be called only when there are no unfinished send() or receive() calls. You | |
130 | /// can force pending calls to complete soon by calling shutdown_input() and | |
131 | /// shutdown_output(). | |
132 | void close(); | |
133 | }; | |
134 | ||
9f95a23c TL |
135 | class network_interface_impl; |
136 | ||
11fdf7f2 TL |
137 | } /* namespace net */ |
138 | ||
139 | /// \addtogroup networking-module | |
140 | /// @{ | |
141 | ||
f67539c2 TL |
142 | /// Configuration for buffered connected_socket input operations |
143 | /// | |
144 | /// This structure allows tuning of buffered input operations done via | |
145 | /// connected_socket. It is a hint to the implementation and may be | |
146 | /// ignored (e.g. the zero-copy native stack does not allocate buffers, | |
147 | /// so it ignores buffer-size parameters). | |
148 | struct connected_socket_input_stream_config final { | |
149 | /// Initial buffer size to use for input buffering (default 8192) | |
150 | unsigned buffer_size = 8192; | |
151 | /// Minimum buffer size to use for input buffering. The system will decrease | |
152 | /// buffer sizes if it sees a tendency towards small requests, but will not go | |
153 | /// below this buffer size (default 512). | |
154 | unsigned min_buffer_size = 512; | |
155 | /// Maximum buffer size to use for input buffering. The system will increase | |
156 | /// buffer sizes if it sees a tendency towards large requests, but will not go | |
157 | /// above this buffer size (default 128 * 1024). | |
158 | unsigned max_buffer_size = 128 * 1024; | |
159 | }; | |
160 | ||
11fdf7f2 TL |
161 | /// A TCP (or other stream-based protocol) connection. |
162 | /// | |
163 | /// A \c connected_socket represents a full-duplex stream between | |
164 | /// two endpoints, a local endpoint and a remote endpoint. | |
165 | class connected_socket { | |
166 | friend class net::get_impl; | |
167 | std::unique_ptr<net::connected_socket_impl> _csi; | |
168 | public: | |
169 | /// Constructs a \c connected_socket not corresponding to a connection | |
f67539c2 | 170 | connected_socket() noexcept; |
11fdf7f2 TL |
171 | ~connected_socket(); |
172 | ||
173 | /// \cond internal | |
f67539c2 | 174 | explicit connected_socket(std::unique_ptr<net::connected_socket_impl> csi) noexcept; |
11fdf7f2 TL |
175 | /// \endcond |
176 | /// Moves a \c connected_socket object. | |
177 | connected_socket(connected_socket&& cs) noexcept; | |
178 | /// Move-assigns a \c connected_socket object. | |
179 | connected_socket& operator=(connected_socket&& cs) noexcept; | |
180 | /// Gets the input stream. | |
181 | /// | |
f67539c2 TL |
182 | /// \param csisc Configuration for the input_stream returned (defaults to a default-constructed config) |
183 | /// | |
11fdf7f2 | 184 | /// Gets an object returning data sent from the remote endpoint. |
f67539c2 | 185 | input_stream<char> input(connected_socket_input_stream_config csisc = {}); |
11fdf7f2 TL |
186 | /// Gets the output stream. |
187 | /// | |
188 | /// Gets an object that sends data to the remote endpoint. | |
189 | /// \param buffer_size how much data to buffer (defaults to 8192) | |
190 | output_stream<char> output(size_t buffer_size = 8192); | |
191 | /// Sets the TCP_NODELAY option (\c true disables Nagle's algorithm) | |
192 | void set_nodelay(bool nodelay); | |
193 | /// Gets the TCP_NODELAY option (Nagle's algorithm) | |
194 | /// | |
195 | /// \return whether the nodelay option is enabled or not | |
196 | bool get_nodelay() const; | |
197 | /// Sets the SO_KEEPALIVE option (enables the keepalive timer on a socket) | |
198 | void set_keepalive(bool keepalive); | |
199 | /// Gets the SO_KEEPALIVE option | |
200 | /// \return whether the keepalive option is enabled or not | |
201 | bool get_keepalive() const; | |
202 | /// Sets keepalive parameters; \p p holds either the TCP or the SCTP parameter set | |
203 | void set_keepalive_parameters(const net::keepalive_params& p); | |
204 | /// Gets keepalive parameters as a net::keepalive_params (TCP or SCTP parameter set) | |
205 | net::keepalive_params get_keepalive_parameters() const; | |
f67539c2 TL |
206 | /// Sets custom socket options. Based on setsockopt function. |
207 | /// Linux users should refer to protocol-specific manuals | |
208 | /// to see available options, e.g. tcp(7), ip(7), etc. | |
209 | void set_sockopt(int level, int optname, const void* data, size_t len); | |
210 | /// Gets custom socket options. Based on getsockopt function. | |
211 | /// Linux users should refer to protocol-specific manuals | |
212 | /// to see available options, e.g. tcp(7), ip(7), etc. | |
213 | int get_sockopt(int level, int optname, void* data, size_t len) const; | |
20effc67 TL |
214 | /// Local address of the socket |
215 | socket_address local_address() const noexcept; | |
11fdf7f2 TL |
216 | |
217 | /// Disables output to the socket. | |
218 | /// | |
219 | /// Current or future writes that have not been successfully flushed | |
220 | /// will immediately fail with an error. This is useful to abort | |
221 | /// operations on a socket that is not making progress due to a | |
222 | /// peer failure. | |
223 | void shutdown_output(); | |
224 | /// Disables input from the socket. | |
225 | /// | |
226 | /// Current or future reads will immediately fail with an error. | |
227 | /// This is useful to abort operations on a socket that is not making | |
228 | /// progress due to a peer failure. | |
229 | void shutdown_input(); | |
230 | }; | |
231 | /// @} | |
232 | ||
233 | /// \addtogroup networking-module | |
234 | /// @{ | |
235 | ||
236 | /// The seastar socket. | |
237 | /// | |
238 | /// A \c socket that allows a connection to be established between | |
239 | /// two endpoints. | |
240 | class socket { | |
241 | std::unique_ptr<net::socket_impl> _si; | |
242 | public: | |
f67539c2 | 243 | socket() noexcept = default; |
11fdf7f2 TL |
244 | ~socket(); |
245 | ||
246 | /// \cond internal | |
f67539c2 | 247 | explicit socket(std::unique_ptr<net::socket_impl> si) noexcept; |
11fdf7f2 TL |
248 | /// \endcond |
249 | /// Moves a \c seastar::socket object. | |
250 | socket(socket&&) noexcept; | |
251 | /// Move-assigns a \c seastar::socket object. | |
252 | socket& operator=(socket&&) noexcept; | |
253 | ||
254 | /// Attempts to establish the connection. | |
255 | /// | |
/// \param proto transport to use; defaults to transport::TCP | |
256 | /// \return a future resolving to a \ref connected_socket representing the connection. | |
9f95a23c TL |
257 | future<connected_socket> connect(socket_address sa, socket_address local = {}, transport proto = transport::TCP); |
258 | ||
259 | /// Sets the SO_REUSEADDR option (enables the reuseaddr option on a socket) | |
260 | void set_reuseaddr(bool reuseaddr); | |
261 | /// Gets the SO_REUSEADDR option | |
262 | /// \return whether the reuseaddr option is enabled or not | |
263 | bool get_reuseaddr() const; | |
11fdf7f2 TL |
264 | /// Stops any in-flight connection attempt. |
265 | /// | |
266 | /// Cancels the connection attempt if it's still in progress, and | |
267 | /// terminates the connection if it has already been established. | |
268 | void shutdown(); | |
269 | }; | |
270 | ||
271 | /// @} | |
272 | ||
273 | /// \addtogroup networking-module | |
274 | /// @{ | |
275 | ||
9f95a23c TL |
276 | /// The result of a server_socket::accept() call |
277 | struct accept_result { | |
278 | connected_socket connection; ///< The newly-accepted connection | |
279 | socket_address remote_address; ///< The address of the peer that connected to us | |
280 | }; | |
281 | ||
11fdf7f2 TL |
282 | /// A listening socket, waiting to accept incoming network connections. |
283 | class server_socket { | |
f67539c2 | 284 | std::unique_ptr<net::server_socket_impl> _ssi; |
11fdf7f2 TL |
285 | bool _aborted = false; |
286 | public: | |
287 | enum class load_balancing_algorithm { | |
288 | // This algorithm tries to distribute all connections equally between all shards. | |
289 | // It does this by sending new connections to the shard with the smallest number of connections. | |
290 | connection_distribution, | |
291 | // This algorithm distributes new connections based on the peer's tcp port. The destination shard | |
292 | // is calculated as the port number modulo the number of shards. This allows a client to connect | |
293 | // to a specific shard in a server, given it knows how many shards the server has, by choosing | |
294 | // the src port number accordingly. | |
295 | port, | |
9f95a23c TL |
296 | // This algorithm distributes all new connections to the listen_options::fixed_cpu shard only. |
297 | fixed, | |
// Alias of connection_distribution; listen_options::lba is initialized to this value. | |
11fdf7f2 TL |
298 | default_ = connection_distribution |
299 | }; | |
300 | /// Constructs a \c server_socket not corresponding to a connection | |
f67539c2 | 301 | server_socket() noexcept; |
11fdf7f2 | 302 | /// \cond internal |
f67539c2 | 303 | explicit server_socket(std::unique_ptr<net::server_socket_impl> ssi) noexcept; |
11fdf7f2 TL |
304 | /// \endcond |
305 | /// Moves a \c server_socket object. | |
306 | server_socket(server_socket&& ss) noexcept; | |
307 | ~server_socket(); | |
308 | /// Move-assigns a \c server_socket object. | |
309 | server_socket& operator=(server_socket&& cs) noexcept; | |
310 | ||
311 | /// Accepts the next connection to successfully connect to this socket. | |
312 | /// | |
9f95a23c TL |
313 | /// \return a future resolving to an accept_result representing the connection and |
314 | /// the socket_address of the remote endpoint. | |
11fdf7f2 TL |
315 | /// |
316 | /// \see listen(socket_address sa) | |
317 | /// \see listen(socket_address sa, listen_options opts) | |
9f95a23c | 318 | future<accept_result> accept(); |
11fdf7f2 TL |
319 | |
320 | /// Stops any \ref accept() in progress. | |
321 | /// | |
322 | /// Current and future \ref accept() calls will terminate immediately | |
323 | /// with an error. | |
324 | void abort_accept(); | |
9f95a23c TL |
325 | |
326 | /// Local bound address | |
f67539c2 | 327 | socket_address local_address() const noexcept; |
11fdf7f2 | 328 | }; |
9f95a23c | 329 | |
11fdf7f2 TL |
330 | /// @} |
331 | ||
/// Options accepted by network_stack::listen(); every member has an in-class default. | |
332 | struct listen_options { | |
333 | bool reuse_address = false; | |
/// Load-balancing algorithm for new connections; starts as load_balancing_algorithm::default_. | |
334 | server_socket::load_balancing_algorithm lba = server_socket::load_balancing_algorithm::default_; | |
335 | transport proto = transport::TCP; | |
9f95a23c TL |
336 | int listen_backlog = 100; |
/// Shard that receives all new connections when \c lba is load_balancing_algorithm::fixed. | |
337 | unsigned fixed_cpu = 0u; | |
/// Switches \c lba to load_balancing_algorithm::fixed and stores \p cpu in \c fixed_cpu. | |
338 | void set_fixed_cpu(unsigned cpu) { | |
339 | lba = server_socket::load_balancing_algorithm::fixed; | |
340 | fixed_cpu = cpu; | |
341 | } | |
342 | }; | |
343 | ||
/// Handle to a net::network_interface_impl held through a shared_ptr. Default construction is | |
/// deleted, so an instance is always created from an implementation pointer or by moving. | |
344 | class network_interface { | |
345 | private: | |
346 | shared_ptr<net::network_interface_impl> _impl; | |
347 | public: | |
f67539c2 TL |
348 | network_interface() = delete; |
349 | network_interface(shared_ptr<net::network_interface_impl>) noexcept; | |
350 | network_interface(network_interface&&) noexcept; | |
9f95a23c | 351 |
f67539c2 | 352 | network_interface& operator=(network_interface&&) noexcept; |
9f95a23c TL |
353 | |
354 | uint32_t index() const; | |
355 | uint32_t mtu() const; | |
356 | ||
357 | const sstring& name() const; | |
358 | const sstring& display_name() const; | |
359 | const std::vector<net::inet_address>& addresses() const; | |
/// NOTE(review): unlike name()/addresses(), this returns by const value rather than by reference; | |
/// the top-level const prevents callers from moving out of the result - consider dropping it. | |
360 | const std::vector<uint8_t> hardware_address() const; | |
361 | ||
362 | bool is_loopback() const; | |
363 | bool is_virtual() const; | |
364 | bool is_up() const; | |
365 | bool supports_ipv6() const; | |
11fdf7f2 TL |
366 | }; |
367 | ||
/// Abstract base class for a network stack. listen(), socket(), make_udp_channel() and | |
/// has_per_core_namespace() are pure virtual; initialize() and supports_ipv6() have inline | |
/// defaults; connect() is declared non-virtual. | |
368 | class network_stack { | |
369 | public: | |
370 | virtual ~network_stack() {} | |
371 | virtual server_socket listen(socket_address sa, listen_options opts) = 0; | |
372 | // FIXME: local parameter assumes ipv4 for now, fix when adding other AF | |
9f95a23c | 373 | future<connected_socket> connect(socket_address sa, socket_address = {}, transport proto = transport::TCP); |
11fdf7f2 | 374 | virtual ::seastar::socket socket() = 0; |
9f95a23c | 375 | virtual net::udp_channel make_udp_channel(const socket_address& = {}) = 0; |
// Default implementation does nothing and returns an already-resolved future. | |
11fdf7f2 TL |
376 | virtual future<> initialize() { |
377 | return make_ready_future(); | |
378 | } | |
379 | virtual bool has_per_core_namespace() = 0; | |
9f95a23c TL |
380 | // NOTE: this is not a correct query approach. |
381 | // This question should be per NIC, but we have no such | |
382 | // abstraction, so for now this is "stack-wide" | |
// The default implementation reports false. | |
383 | virtual bool supports_ipv6() const { | |
384 | return false; | |
385 | } | |
386 | ||
387 | /** | |
388 | * Returns available network interfaces. This represents a | |
389 | * snapshot of interfaces available at call time, hence the | |
390 | * return by value. | |
391 | */ | |
392 | virtual std::vector<network_interface> network_interfaces(); | |
11fdf7f2 TL |
393 | }; |
394 | ||
/// Describes one network stack: its name, its option group, a factory and a default flag. | |
20effc67 TL |
395 | struct network_stack_entry { |
/// Callable taking a program_options::option_group and returning a future that resolves to | |
/// a std::unique_ptr<network_stack>. | |
396 | using factory_func = noncopyable_function<future<std::unique_ptr<network_stack>> (const program_options::option_group&)>; | |
397 | ||
398 | sstring name; | |
399 | std::unique_ptr<program_options::option_group> opts; | |
400 | factory_func factory; | |
/// No in-class initializer: the creator of the entry must set it. | |
401 | bool is_default; | |
402 | }; | |
403 | ||
11fdf7f2 | 404 | } |