]> git.proxmox.com Git - mirror_ovs.git/blame - lib/netlink-socket.h
netlink: provide network namespace id from a msg.
[mirror_ovs.git] / lib / netlink-socket.h
CommitLineData
2fe27d5a 1/*
9c8ad495 2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
2fe27d5a
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef NETLINK_SOCKET_H
18#define NETLINK_SOCKET_H 1
19
20/* Netlink socket definitions.
022ad2b9
BP
21 *
22 * This header file defines functions for working with Netlink sockets. Only
23 * Linux natively supports Netlink sockets, but Netlink is well suited as a
24 * basis for extensible low-level protocols, so it can make sense to implement
25 * a Netlink layer on other systems. This doesn't have to be done in exactly
26 * the same way as on Linux, as long as the implementation can support the
27 * semantics that are important to Open vSwitch. See "Usage concepts" below
28 * for more information.
29 *
30 * For Netlink protocol definitions, see netlink-protocol.h. For helper
31 * functions for working with Netlink messages, see netlink.h.
32 *
33 *
34 * Usage concepts
35 * ==============
2fe27d5a
BP
36 *
37 * Netlink is a datagram-based network protocol primarily for communication
022ad2b9
BP
38 * between user processes and the kernel. Netlink is specified in RFC 3549,
39 * "Linux Netlink as an IP Services Protocol".
2fe27d5a
BP
40 *
41 * Netlink is not suitable for use in physical networks of heterogeneous
42 * machines because host byte order is used throughout.
43 *
022ad2b9
BP
44 * The AF_NETLINK socket namespace is subdivided into statically numbered
45 * protocols, e.g. NETLINK_ROUTE, NETLINK_NETFILTER, provided as the third
46 * argument to the socket() function. Maintaining the assigned numbers became
47 * a bit of a problem, so the "Generic Netlink" NETLINK_GENERIC protocol was
48 * introduced to map between human-readable names and dynamically assigned
49 * numbers. All recently introduced Netlink protocol messages in Linux
50 * (including all of the Open vSwitch specific messages) fall under
51 * NETLINK_GENERIC. The Netlink library provides the nl_lookup_genl_family()
52 * function for translating a Generic Netlink name to a number. On Linux, this
53 * queries the kernel Generic Netlink implementation, but on other systems it
54 * might be easier to statically assign each of the names used by Open vSwitch
55 * and then implement this function entirely in userspace.
56 *
57 * Each Netlink socket is distinguished by its Netlink PID, a 32-bit integer
58 * that is analogous to a TCP or UDP port number. The kernel has PID 0.
59 *
60 * Most Netlink messages manage a kernel table of some kind, e.g. the kernel
61 * routing table, ARP table, etc. Open vSwitch specific messages manage tables
62 * of datapaths, ports within datapaths ("vports"), and flows within
63 * datapaths. Open vSwitch also has messages related to network packets
64 * received on vports, which aren't really a table.
65 *
66 * Datagram protocols over a physical network are typically unreliable: in UDP,
67 * for example, messages can be dropped, delivered more than once, or delivered
68 * out of order. In Linux, Netlink does not deliver messages out of order or
69 * multiple times. In some cases it can drop messages, but the kernel
70 * indicates when a message has been dropped. The description below of each
71 * way Open vSwitch uses Netlink also explains how to work around dropped
72 * messages.
73 *
74 * Open vSwitch uses Netlink in four characteristic ways:
75 *
76 * 1. Transactions. A transaction is analogous to a system call, an ioctl,
77 * or an RPC: userspace sends a request to the kernel, which processes
78 * the request synchronously and returns a reply to userspace.
79 * (Sometimes there is no explicit reply, but even in that case userspace
80 * will receive an immediate reply if there is an error.)
81 *
82 * nl_transact() is the primary interface for transactions over Netlink.
83 * This function doesn't take a socket as a parameter because sockets do
84 * not have any state related to transactions.
85 *
86 * Netlink uses 16-bit "length" fields extensively, which effectively
87 * limits requests and replies to 64 kB. "Dumps" (see below) are one way
88 * to work around this limit for replies.
89 *
90 * In the Linux implementation of Netlink transactions, replies can
91 * sometimes be lost. When this happens, nl_transact() automatically
92 * executes the transaction again. This means that it is important that
93 * transactions be idempotent, or that the client be prepared to tolerate
94 * that a transaction might actually execute more than once.
95 *
96 * The Linux implementation can execute several transactions at the same
97 * time more efficiently than individually. nl_transact_multiple()
98 * allows for this. The semantics are no different from executing each
99 * of the transactions individually with nl_transact().
100 *
101 * 2. Dumps. A dump asks the kernel to provide all of the information in a
102 * table. It consists of a request and a reply, where the reply consists
103 * of an arbitrary number of messages. Each message in the reply is
104 * limited to 64 kB, as is the request, but the total size of the reply
105 * can be many times larger.
106 *
107 * The reply to a dump is usually generated piece by piece, not
108 * atomically. The reply can represent an inconsistent snapshot of the
109 * table. This is especially likely if entries in the table were being
110 * added or deleted or changing during the dump.
111 *
112 * nl_dump_start() begins a dump based on the caller-provided request and
113 * initializes a "struct nl_dump" to identify the dump. Subsequent calls
114 * to nl_dump_next() then obtain the reply, one message at a time.
115 * Usually, each message gives information about some entry in a table,
116 * e.g. one flow in the Open vSwitch flow table, or one route in a
117 * routing table. nl_dump_done() ends the dump.
118 *
119 * Linux implements dumps so that messages in a reply do not get lost.
120 *
121 * 3. Multicast subscriptions. Most kernel Netlink implementations allow a
122 * process to monitor changes to its table, by subscribing to a Netlink
123 * multicast group dedicated to that table. Whenever the table's content
124 * changes (e.g. an entry is added or deleted or modified), the Netlink
125 * implementation sends a message to all sockets that subscribe to its
126 * multicast group, notifying them of the details of the change. (This doesn't
127 * require much extra work by the Netlink implementer because the message
128 * is generally identical to the one sent as a reply to the request that
129 * changed the table.)
130 *
131 * nl_sock_join_mcgroup() subscribes a socket to a multicast group, and
132 * nl_sock_recv() reads notifications.
133 *
134 * If userspace doesn't read messages from a socket subscribed to a
135 * multicast group quickly enough, then notification messages can pile up
136 * in the socket's receive buffer. If this continues long enough, the
137 * receive buffer will fill up and notifications will be lost. In that
138 * case, nl_sock_recv() will return ENOBUFS. The client can then use a
139 * dump to resynchronize with the table state. (A simple implementation
140 * of multicast groups might take advantage of this by simply returning
141 * ENOBUFS whenever a table changes, without implementing actual
142 * notifications. This would cause lots of extra dumps, so it may not be
143 * suitable as a production implementation.)
144 *
145 * 4. Unicast subscriptions (Open vSwitch specific). Userspace can assign
146 * one or more Netlink PIDs to a vport as "upcall PIDs". When a packet
147 * received on the vport does not match any flow in its datapath's flow
148 * table, the kernel hashes some of the packet's headers, uses the hash
149 * to select one of the PIDs, and sends the packet (encapsulated in an
150 * Open vSwitch Netlink message) to the socket with the selected PID.
151 *
152 * nl_sock_recv() reads notifications sent this way.
153 *
36791e21
NR
154 * On Windows specifically, the datapath needs to allocate a queue for
155 * packets, and it does so only when userspace "subscribes" to packets
156 * on that netlink socket. Before closing the netlink socket, userspace
157 * needs to "unsubscribe" from packets on that netlink socket.
158 *
159 * nl_sock_subscribe_packets() and nl_sock_unsubscribe_packets() are
160 * Windows specific.
161 *
022ad2b9
BP
162 * Messages received this way can overflow, just like multicast
163 * subscription messages, and they are reported the same way. Because
164 * packet notification messages do not report the state of a table, there
165 * is no way to recover the dropped packets; they are simply lost.
166 *
167 * The main reason to support multiple PIDs per vport is to increase
168 * fairness, that is, to make it harder for a single high-flow-rate
169 * sender to drown out lower rate sources. Multiple PIDs per vport might
170 * also improve packet handling latency or flow setup rate, but that is
171 * not the main goal.
172 *
173 * Old versions of the Linux kernel module supported only one PID per
174 * vport, and userspace still copes with this, so a simple or early
175 * implementation might only support one PID per vport too.
0bd01224
BP
176 *
177 *
178 * Thread-safety
179 * =============
180 *
0672776e
JS
181 * Most of the netlink functions are not fully thread-safe: Only a single
182 * thread may use a given nl_sock or nl_dump at one time. The exceptions are:
183 *
184 * - nl_sock_recv() is conditionally thread-safe: it may be called from
185 * different threads with the same nl_sock, but each caller must provide
186 * an independent receive buffer.
187 *
188 * - nl_dump_next() is conditionally thread-safe: it may be called from
189 * different threads with the same nl_dump, but each caller must provide
190 * independent buffers.
2fe27d5a
BP
191 */
192
193#include <stdbool.h>
194#include <stddef.h>
195#include <stdint.h>
64c96779 196#include "openvswitch/ofpbuf.h"
0672776e 197#include "ovs-atomic.h"
0791315e 198#include "ovs-thread.h"
2fe27d5a 199
2fe27d5a
BP
200struct nl_sock;
201
202#ifndef HAVE_NETLINK
e214f751 203#ifndef _WIN32
2fe27d5a
BP
204#error "netlink-socket.h is only for hosts that support Netlink sockets"
205#endif
e214f751 206#endif
2fe27d5a
BP
207
/* Netlink sockets.
 *
 * 'protocol' is one of the statically numbered AF_NETLINK protocols, e.g.
 * NETLINK_ROUTE or NETLINK_GENERIC.
 *
 * NOTE(review): nl_sock_create() and nl_sock_clone() presumably store the
 * new socket through their 'struct nl_sock **' argument and return 0 on
 * success or a positive errno value on failure; confirm against
 * netlink-socket.c. */
int nl_sock_create(int protocol, struct nl_sock **);
int nl_sock_clone(const struct nl_sock *, struct nl_sock **);
void nl_sock_destroy(struct nl_sock *);
212
cceb11f5
BP
/* Multicast subscriptions.
 *
 * nl_sock_join_mcgroup() subscribes a socket to 'multicast_group';
 * notifications are then read with nl_sock_recv().
 *
 * NOTE(review): nl_sock_leave_mcgroup() presumably cancels such a
 * subscription, and both presumably return 0 or a positive errno value;
 * confirm against netlink-socket.c. */
int nl_sock_join_mcgroup(struct nl_sock *, unsigned int multicast_group);
int nl_sock_leave_mcgroup(struct nl_sock *, unsigned int multicast_group);

#ifdef _WIN32
/* Windows only.  The datapath allocates a queue for packets only once
 * userspace subscribes to packets on the socket, and userspace must
 * unsubscribe before closing the socket. */
int nl_sock_subscribe_packets(struct nl_sock *sock);
int nl_sock_unsubscribe_packets(struct nl_sock *sock);
#endif
220
/* Sending and receiving messages.
 *
 * nl_sock_recv() reads multicast and unicast (upcall) notifications.  If the
 * socket's receive buffer filled up and notifications were lost, it returns
 * ENOBUFS.  It may be called from different threads with the same nl_sock as
 * long as each caller provides an independent receive buffer.
 *
 * NOTE(review): 'wait' presumably selects blocking behavior, 'nlmsg_seq' is
 * presumably the sequence number placed in the sent message, and '*nsid'
 * presumably receives the network namespace id of the received message;
 * confirm against netlink-socket.c. */
int nl_sock_send(struct nl_sock *, const struct ofpbuf *, bool wait);
int nl_sock_send_seq(struct nl_sock *, const struct ofpbuf *,
                     uint32_t nlmsg_seq, bool wait);
int nl_sock_recv(struct nl_sock *, struct ofpbuf *, int *nsid, bool wait);

/* NOTE(review): presumably discards any messages pending on the socket;
 * confirm against netlink-socket.c. */
int nl_sock_drain(struct nl_sock *);
227
/* Socket introspection.
 *
 * nl_sock_fd() is not declared on Windows.
 *
 * NOTE(review): 'events' is presumably a poll()-style event mask, and
 * nl_sock_pid() presumably returns the 32-bit Netlink PID that distinguishes
 * the socket; confirm against netlink-socket.c. */
void nl_sock_wait(const struct nl_sock *, short int events);
#ifndef _WIN32
int nl_sock_fd(const struct nl_sock *);
#endif

uint32_t nl_sock_pid(const struct nl_sock *);
234
cc75061a
BP
/* Batching transactions.
 *
 * Describes one request/reply pair; nl_transact_multiple() takes an array of
 * pointers to these. */
struct nl_transaction {
    /* Filled in by client. */
    struct ofpbuf *request;     /* Request to send. */

    /* The client must initialize 'reply' to one of:
     *
     *   - NULL, if it does not care to examine the reply.
     *
     *   - Otherwise, to an ofpbuf with a memory allocation of at least
     *     NLMSG_HDRLEN bytes.
     */
    struct ofpbuf *reply;       /* Reply (empty if reply was an error code). */

    /* NOTE(review): presumably an output written by nl_transact_multiple()
     * rather than by the client; confirm against netlink-socket.c. */
    int error;                  /* Positive errno value, 0 if no error. */
};
250
a88b4e04
BP
/* Transactions without an allocated socket.
 *
 * nl_transact() is the primary interface for transactions: it takes a
 * 'protocol' rather than a socket because sockets carry no state related to
 * transactions.  When a reply is lost it automatically executes the
 * transaction again, so requests should be idempotent or the caller must
 * tolerate a request executing more than once.
 *
 * nl_transact_multiple() executes 'n' transactions with the same semantics
 * as executing each one individually with nl_transact().
 *
 * NOTE(review): nl_transact() presumably returns 0 or a positive errno value
 * and stores the reply in '*replyp'; confirm against netlink-socket.c. */
int nl_transact(int protocol, const struct ofpbuf *request,
                struct ofpbuf **replyp);
void nl_transact_multiple(int protocol, struct nl_transaction **, size_t n);
255
/* Table dumping. */

/* NOTE(review): presumably the size, in bytes, of the per-caller buffer
 * passed as 'buf' to nl_dump_next(); confirm against its callers. */
#define NL_DUMP_BUFSIZE         4096
258
2fe27d5a 259struct nl_dump {
93295354 260 /* These members are immutable during the lifetime of the nl_dump. */
2fe27d5a 261 struct nl_sock *sock; /* Socket being dumped. */
9c8ad495 262 uint32_t nl_seq; /* Expected nlmsg_seq for replies. */
93295354
BP
263 int status OVS_GUARDED; /* 0: dump in progress,
264 * positive errno: dump completed with error,
265 * EOF: dump completed successfully. */
0a0b5a72
BB
266
267 /* 'mutex' protects 'status' and serializes access to 'sock'. */
268 struct ovs_mutex mutex; /* Protects 'status', synchronizes recv(). */
2fe27d5a
BP
269};
270
/* nl_dump_start() begins a dump based on the caller-provided 'request' and
 * initializes the nl_dump that identifies it.  Each subsequent call to
 * nl_dump_next() obtains one message of the reply; it may be called from
 * different threads with the same nl_dump as long as each caller provides
 * independent buffers.  nl_dump_done() ends the dump.
 *
 * NOTE(review): nl_dump_next() presumably returns false once no more
 * messages are available, and nl_dump_done() presumably returns 0 or a
 * positive errno value; confirm against netlink-socket.c. */
void nl_dump_start(struct nl_dump *, int protocol,
                   const struct ofpbuf *request);
bool nl_dump_next(struct nl_dump *, struct ofpbuf *reply, struct ofpbuf *buf);
int nl_dump_done(struct nl_dump *);
275
/* Miscellaneous.
 *
 * nl_lookup_genl_family() translates the Generic Netlink family 'name' to
 * its dynamically assigned number.
 *
 * NOTE(review): the result is presumably stored in '*number', and
 * nl_lookup_genl_mcgroup() presumably does the same for a multicast group
 * within a family, storing it in '*multicast_group'; both presumably return
 * 0 or a positive errno value.  Confirm against netlink-socket.c. */
int nl_lookup_genl_family(const char *name, int *number);
int nl_lookup_genl_mcgroup(const char *family_name, const char *group_name,
                           unsigned int *multicast_group);
2fe27d5a
BP
280
281#endif /* netlink-socket.h */