2fe27d5a | 1 | /* |
9c8ad495 | 2 | * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc. |
2fe27d5a BP |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | * you may not use this file except in compliance with the License. | |
6 | * You may obtain a copy of the License at: | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | ||
17 | #ifndef NETLINK_SOCKET_H | |
18 | #define NETLINK_SOCKET_H 1 | |
19 | ||
20 | /* Netlink socket definitions. | |
022ad2b9 BP |
21 | * |
22 | * This header file defines functions for working with Netlink sockets. Only | |
23 | * Linux natively supports Netlink sockets, but Netlink is well suited as a | |
24 | * basis for extensible low-level protocols, so it can make sense to implement | |
25 | * a Netlink layer on other systems. This doesn't have to be done in exactly | |
26 | * the same way as on Linux, as long as the implementation can support the | |
27 | * semantics that are important to Open vSwitch. See "Usage concepts" below | |
28 | * for more information. | |
29 | * | |
30 | * For Netlink protocol definitions, see netlink-protocol.h. For helper | |
31 | * functions for working with Netlink messages, see netlink.h. | |
32 | * | |
33 | * | |
34 | * Usage concepts | |
35 | * ============== | |
2fe27d5a BP |
36 | * |
37 | * Netlink is a datagram-based network protocol primarily for communication | |
022ad2b9 BP |
38 | * between user processes and the kernel. Netlink is specified in RFC 3549, |
39 | * "Linux Netlink as an IP Services Protocol". | |
2fe27d5a BP |
40 | * |
41 | * Netlink is not suitable for use in physical networks of heterogeneous | |
42 | * machines because host byte order is used throughout. | |
43 | * | |
022ad2b9 BP |
44 | * The AF_NETLINK socket namespace is subdivided into statically numbered |
45 | * protocols, e.g. NETLINK_ROUTE, NETLINK_NETFILTER, provided as the third | |
46 | * argument to the socket() function. Maintaining the assigned numbers became | |
47 | * a bit of a problem, so the "Generic Netlink" NETLINK_GENERIC protocol was | |
48 | * introduced to map between human-readable names and dynamically assigned | |
49 | * numbers. All recently introduced Netlink protocol messages in Linux | |
50 | * (including all of the Open vSwitch specific messages) fall under | |
51 | * NETLINK_GENERIC. The Netlink library provides the nl_lookup_genl_family() | |
52 | * function for translating a Generic Netlink name to a number. On Linux, this | |
53 | * queries the kernel Generic Netlink implementation, but on other systems it | |
54 | * might be easier to statically assign each of the names used by Open vSwitch | |
55 | * and then implement this function entirely in userspace. | |
56 | * | |
57 | * Each Netlink socket is distinguished by its Netlink PID, a 32-bit integer | |
58 | * that is analogous to a TCP or UDP port number. The kernel has PID 0. | |
59 | * | |
60 | * Most Netlink messages manage a kernel table of some kind, e.g. the kernel | |
61 | * routing table, ARP table, etc. Open vSwitch specific messages manage tables | |
62 | * of datapaths, ports within datapaths ("vports"), and flows within | |
63 | * datapaths. Open vSwitch also has messages related to network packets | |
64 | * received on vports, which aren't really a table. | |
65 | * | |
66 | * Datagram protocols over a physical network are typically unreliable: in UDP, | |
67 | * for example, messages can be dropped, delivered more than once, or delivered | |
68 | * out of order. In Linux, Netlink does not deliver messages out of order or | |
69 | * multiple times. In some cases it can drop messages, but the kernel | |
70 | * indicates when a message has been dropped. The description below of each | |
71 | * way Open vSwitch uses Netlink also explains how to work around dropped | |
72 | * messages. | |
73 | * | |
74 | * Open vSwitch uses Netlink in four characteristic ways: | |
75 | * | |
76 | * 1. Transactions. A transaction is analogous to a system call, an ioctl, | |
77 | * or an RPC: userspace sends a request to the kernel, which processes | |
78 | * the request synchronously and returns a reply to userspace. | |
79 | * (Sometimes there is no explicit reply, but even in that case userspace | |
80 | * will receive an immediate reply if there is an error.) | |
81 | * | |
82 | * nl_transact() is the primary interface for transactions over Netlink. | |
83 | * This function doesn't take a socket as a parameter because sockets do | |
84 | * not have any state related to transactions. | |
85 | * | |
86 | * Netlink uses 16-bit "length" fields extensively, which effectively | |
87 | * limits requests and replies to 64 kB. "Dumps" (see below) are one way | |
88 | * to work around this limit for replies. | |
89 | * | |
90 | * In the Linux implementation of Netlink transactions, replies can | |
91 | * sometimes be lost. When this happens, nl_transact() automatically | |
92 | * executes the transaction again. This means that it is important that | |
93 | * transactions be idempotent, or that the client be prepared to tolerate | |
94 | * that a transaction might actually execute more than once. | |
95 | * | |
96 | * The Linux implementation can execute several transactions at the same | |
97 | * time more efficiently than individually. nl_transact_multiple() | |
98 | * allows for this. The semantics are no different from executing each | |
99 | * of the transactions individually with nl_transact(). | |
100 | * | |
101 | * 2. Dumps. A dump asks the kernel to provide all of the information in a | |
102 | * table. It consists of a request and a reply, where the reply consists | |
103 | * of an arbitrary number of messages. Each message in the reply is | |
104 | * limited to 64 kB, as is the request, but the total size of the reply | |
105 | * can be many times larger. | |
106 | * | |
107 | * The reply to a dump is usually generated piece by piece, not | |
108 | * atomically. The reply can represent an inconsistent snapshot of the | |
109 | * table. This is especially likely if entries in the table were being | |
110 | * added, deleted, or changed during the dump. | |
111 | * | |
112 | * nl_dump_start() begins a dump based on the caller-provided request and | |
113 | * initializes a "struct nl_dump" to identify the dump. Subsequent calls | |
114 | * to nl_dump_next() then obtain the reply, one message at a time. | |
115 | * Usually, each message gives information about some entry in a table, | |
116 | * e.g. one flow in the Open vSwitch flow table, or one route in a | |
117 | * routing table. nl_dump_done() ends the dump. | |
118 | * | |
119 | * Linux implements dumps so that messages in a reply do not get lost. | |
120 | * | |
121 | * 3. Multicast subscriptions. Most kernel Netlink implementations allow a | |
122 | * process to monitor changes to its table, by subscribing to a Netlink | |
123 | * multicast group dedicated to that table. Whenever the table's content | |
124 | * changes (e.g. an entry is added or deleted or modified), the Netlink | |
125 | * implementation sends a message to all sockets that subscribe to its | |
126 | * multicast group notifying them of details of the change. (This doesn't | |
127 | * require much extra work by the Netlink implementer because the message | |
128 | * is generally identical to the one sent as a reply to the request that | |
129 | * changed the table.) | |
130 | * | |
131 | * nl_sock_join_mcgroup() subscribes a socket to a multicast group, and | |
132 | * nl_sock_recv() reads notifications. | |
133 | * | |
134 | * If userspace doesn't read messages from a socket subscribed to a | |
135 | * multicast group quickly enough, then notification messages can pile up | |
136 | * in the socket's receive buffer. If this continues long enough, the | |
137 | * receive buffer will fill up and notifications will be lost. In that | |
138 | * case, nl_sock_recv() will return ENOBUFS. The client can then use a | |
139 | * dump to resynchronize with the table state. (A simple implementation | |
140 | * of multicast groups might take advantage of this by simply returning | |
141 | * ENOBUFS whenever a table changes, without implementing actual | |
142 | * notifications. This would cause lots of extra dumps, so it may not be | |
143 | * suitable as a production implementation.) | |
144 | * | |
145 | * 4. Unicast subscriptions (Open vSwitch specific). Userspace can assign | |
146 | * one or more Netlink PIDs to a vport as "upcall PIDs". When a packet | |
147 | * received on the vport does not match any flow in its datapath's flow | |
148 | * table, the kernel hashes some of the packet's headers, uses the hash | |
149 | * to select one of the PIDs, and sends the packet (encapsulated in an | |
150 | * Open vSwitch Netlink message) to the socket with the selected PID. | |
151 | * | |
152 | * nl_sock_recv() reads notifications sent this way. | |
153 | * | |
36791e21 NR |
154 | * Specifically on Windows platform, the datapath needs to allocate a |
155 | * queue for packets, and it does so only when userspace "subscribes" to | |
156 | * packets on that netlink socket. Before closing the netlink socket, | |
157 | * userspace needs to "unsubscribe" packets on that netlink socket. | |
158 | * | |
159 | * nl_sock_subscribe_packets() and nl_sock_unsubscribe_packets() are | |
160 | * Windows specific. | |
161 | * | |
022ad2b9 BP |
162 | * Messages received this way can overflow, just like multicast |
163 | * subscription messages, and they are reported the same way. Because | |
164 | * packet notification messages do not report the state of a table, there | |
165 | * is no way to recover the dropped packets; they are simply lost. | |
166 | * | |
167 | * The main reason to support multiple PIDs per vport is to increase | |
168 | * fairness, that is, to make it harder for a single high-flow-rate | |
169 | * sender to drown out lower rate sources. Multiple PIDs per vport might | |
170 | * also improve packet handling latency or flow setup rate, but that is | |
171 | * not the main goal. | |
172 | * | |
173 | * Old versions of the Linux kernel module supported only one PID per | |
174 | * vport, and userspace still copes with this, so a simple or early | |
175 | * implementation might only support one PID per vport too. | |
0bd01224 BP |
176 | * |
177 | * | |
178 | * Thread-safety | |
179 | * ============= | |
180 | * | |
0672776e JS |
181 | * Most of the netlink functions are not fully thread-safe: Only a single |
182 | * thread may use a given nl_sock or nl_dump at one time. The exceptions are: | |
183 | * | |
184 | * - nl_sock_recv() is conditionally thread-safe: it may be called from | |
185 | * different threads with the same nl_sock, but each caller must provide | |
186 | * an independent receive buffer. | |
187 | * | |
188 | * - nl_dump_next() is conditionally thread-safe: it may be called from | |
189 | * different threads with the same nl_dump, but each caller must provide | |
190 | * independent buffers. | |
2fe27d5a BP |
191 | */ |
192 | ||
193 | #include <stdbool.h> | |
194 | #include <stddef.h> | |
195 | #include <stdint.h> | |
64c96779 | 196 | #include "openvswitch/ofpbuf.h" |
0672776e | 197 | #include "ovs-atomic.h" |
0791315e | 198 | #include "ovs-thread.h" |
2fe27d5a | 199 | |
2fe27d5a BP |
200 | struct nl_sock; |
201 | ||
202 | #ifndef HAVE_NETLINK | |
e214f751 | 203 | #ifndef _WIN32 |
2fe27d5a BP |
204 | #error "netlink-socket.h is only for hosts that support Netlink sockets" |
205 | #endif | |
e214f751 | 206 | #endif |
2fe27d5a BP |
207 | |
208 | /* Netlink sockets. */ | |
cceb11f5 | 209 | int nl_sock_create(int protocol, struct nl_sock **); |
c6eab56d | 210 | int nl_sock_clone(const struct nl_sock *, struct nl_sock **); |
2fe27d5a BP |
211 | void nl_sock_destroy(struct nl_sock *); |
212 | ||
cceb11f5 BP |
213 | int nl_sock_join_mcgroup(struct nl_sock *, unsigned int multicast_group); |
214 | int nl_sock_leave_mcgroup(struct nl_sock *, unsigned int multicast_group); | |
215 | ||
36791e21 NR |
216 | #ifdef _WIN32 |
217 | int nl_sock_subscribe_packets(struct nl_sock *sock); | |
218 | int nl_sock_unsubscribe_packets(struct nl_sock *sock); | |
219 | #endif | |
220 | ||
2fe27d5a | 221 | int nl_sock_send(struct nl_sock *, const struct ofpbuf *, bool wait); |
ff459dd6 BP |
222 | int nl_sock_send_seq(struct nl_sock *, const struct ofpbuf *, |
223 | uint32_t nlmsg_seq, bool wait); | |
72d32ac0 | 224 | int nl_sock_recv(struct nl_sock *, struct ofpbuf *, bool wait); |
2fe27d5a | 225 | |
6b7c12fd BP |
226 | int nl_sock_drain(struct nl_sock *); |
227 | ||
2fe27d5a | 228 | void nl_sock_wait(const struct nl_sock *, short int events); |
9667de98 | 229 | #ifndef _WIN32 |
8522ba09 | 230 | int nl_sock_fd(const struct nl_sock *); |
9667de98 | 231 | #endif |
2fe27d5a | 232 | |
50802adb JG |
233 | uint32_t nl_sock_pid(const struct nl_sock *); |
234 | ||
cc75061a BP |
235 | /* Batching transactions.  One of these describes a single request and its outcome; an array of pointers to them is passed to nl_transact_multiple(). */ |
236 | struct nl_transaction { |
237 | /* Filled in by client. */ |
238 | struct ofpbuf *request; /* Request to send. */ |
239 | ||
72d32ac0 BP |
240 | /* The client must initialize 'reply' to one of: |
241 | * | |
242 | * - NULL, if it does not care to examine the reply. | |
243 | * | |
244 | * - Otherwise, to an ofpbuf with a memory allocation of at least | |
245 | * NLMSG_HDRLEN bytes. | |
246 | */ | |
247 | struct ofpbuf *reply; /* Reply (empty if reply was an error code). */ | |
cc75061a BP |
248 | int error; /* Positive errno value, 0 if no error. */ |
249 | }; | |
250 | ||
a88b4e04 BP |
251 | /* Transactions without an allocated socket. */ |
252 | int nl_transact(int protocol, const struct ofpbuf *request, | |
253 | struct ofpbuf **replyp); | |
254 | void nl_transact_multiple(int protocol, struct nl_transaction **, size_t n); | |
255 | ||
2fe27d5a | 256 | /* Table dumping.  A dump is started with nl_dump_start(), iterated with nl_dump_next(), and finished with nl_dump_done(). */
d57695d7 JS |
257 | #define NL_DUMP_BUFSIZE 4096 /* NOTE(review): presumably the suggested size for the caller-provided nl_dump_next() buffer -- confirm against the implementation. */ |
258 | ||
2fe27d5a | 259 | struct nl_dump { |
93295354 | 260 | /* These members are immutable during the lifetime of the nl_dump. */ |
2fe27d5a | 261 | struct nl_sock *sock; /* Socket being dumped. */ |
9c8ad495 | 262 | uint32_t nl_seq; /* Expected nlmsg_seq for replies. */ |
93295354 BP |
263 | int status OVS_GUARDED; /* 0: dump in progress, |
264 | * positive errno: dump completed with error, | |
265 | * EOF: dump completed successfully. */ | |
0a0b5a72 BB |
266 | |
267 | /* 'mutex' protects 'status' and serializes access to 'sock'. */ |
268 | struct ovs_mutex mutex; /* Protects 'status', synchronizes recv(). */ |
2fe27d5a BP |
269 | }; |
270 | ||
a88b4e04 | 271 | void nl_dump_start(struct nl_dump *, int protocol, |
2fe27d5a | 272 | const struct ofpbuf *request); |
d57695d7 | 273 | bool nl_dump_next(struct nl_dump *, struct ofpbuf *reply, struct ofpbuf *buf); |
2fe27d5a BP |
274 | int nl_dump_done(struct nl_dump *); |
275 | ||
276 | /* Miscellaneous */ | |
277 | int nl_lookup_genl_family(const char *name, int *number); | |
e408762f | 278 | int nl_lookup_genl_mcgroup(const char *family_name, const char *group_name, |
b3dcb73c | 279 | unsigned int *multicast_group); |
2fe27d5a BP |
280 | |
281 | #endif /* netlink-socket.h */ |