]> git.proxmox.com Git - mirror_ovs.git/blame - lib/netlink-socket.c
NEWS: Mention RSTP.
[mirror_ovs.git] / lib / netlink-socket.c
CommitLineData
2fe27d5a 1/*
db1fc210 2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
2fe27d5a
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
18#include "netlink-socket.h"
2fe27d5a
BP
19#include <errno.h>
20#include <inttypes.h>
21#include <stdlib.h>
22#include <sys/types.h>
cc75061a 23#include <sys/uio.h>
2fe27d5a
BP
24#include <unistd.h>
25#include "coverage.h"
26#include "dynamic-string.h"
2ad204c8
BP
27#include "hash.h"
28#include "hmap.h"
2fe27d5a
BP
29#include "netlink.h"
30#include "netlink-protocol.h"
886dd35a 31#include "odp-netlink.h"
2fe27d5a 32#include "ofpbuf.h"
0bd01224 33#include "ovs-thread.h"
2fe27d5a 34#include "poll-loop.h"
0672776e 35#include "seq.h"
6b7c12fd 36#include "socket-util.h"
cc75061a 37#include "util.h"
2fe27d5a
BP
38#include "vlog.h"
39
40VLOG_DEFINE_THIS_MODULE(netlink_socket);
41
42COVERAGE_DEFINE(netlink_overflow);
43COVERAGE_DEFINE(netlink_received);
fc999dda 44COVERAGE_DEFINE(netlink_recv_jumbo);
2fe27d5a
BP
45COVERAGE_DEFINE(netlink_sent);
46
47/* Linux header file confusion causes this to be undefined. */
48#ifndef SOL_NETLINK
49#define SOL_NETLINK 270
50#endif
51
22326ba6
AS
52#ifdef _WIN32
53static struct ovs_mutex portid_mutex = OVS_MUTEX_INITIALIZER;
54static uint32_t g_last_portid = 0;
55
56/* Port IDs must be unique! */
57static uint32_t
58portid_next(void)
59 OVS_GUARDED_BY(portid_mutex)
60{
61 g_last_portid++;
62 return g_last_portid;
63}
886dd35a 64#endif /* _WIN32 */
22326ba6 65
2fe27d5a
BP
66/* A single (bad) Netlink message can in theory dump out many, many log
67 * messages, so the burst size is set quite high here to avoid missing useful
68 * information. Also, at high logging levels we log *all* Netlink messages. */
69static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 600);
70
7d7447df 71static uint32_t nl_sock_allocate_seq(struct nl_sock *, unsigned int n);
2fe27d5a 72static void log_nlmsg(const char *function, int error,
7041c3a9 73 const void *message, size_t size, int protocol);
886dd35a 74#ifdef _WIN32
ebac7fb7 75static int get_sock_pid_from_kernel(struct nl_sock *sock);
886dd35a 76#endif
2fe27d5a
BP
77\f
78/* Netlink sockets. */
79
0d121c73 80struct nl_sock {
22326ba6
AS
81#ifdef _WIN32
82 HANDLE handle;
83#else
2fe27d5a 84 int fd;
22326ba6 85#endif
7d7447df 86 uint32_t next_seq;
2fe27d5a 87 uint32_t pid;
7041c3a9 88 int protocol;
cc75061a 89 unsigned int rcvbuf; /* Receive buffer size (SO_RCVBUF). */
2fe27d5a
BP
90};
91
cc75061a
BP
92/* Compile-time limit on iovecs, so that we can allocate a maximum-size array
93 * of iovecs on the stack. */
94#define MAX_IOVS 128
95
96/* Maximum number of iovecs that may be passed to sendmsg, capped at a
97 * minimum of _XOPEN_IOV_MAX (16) and a maximum of MAX_IOVS.
98 *
99 * Initialized by nl_sock_create(). */
100static int max_iovs;
101
a88b4e04
BP
102static int nl_pool_alloc(int protocol, struct nl_sock **sockp);
103static void nl_pool_release(struct nl_sock *);
2fe27d5a
BP
104
105/* Creates a new netlink socket for the given netlink 'protocol'
106 * (NETLINK_ROUTE, NETLINK_GENERIC, ...). Returns 0 and sets '*sockp' to the
a88b4e04 107 * new socket if successful, otherwise returns a positive errno value. */
2fe27d5a 108int
cceb11f5 109nl_sock_create(int protocol, struct nl_sock **sockp)
2fe27d5a 110{
0bd01224 111 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
2fe27d5a 112 struct nl_sock *sock;
22326ba6 113#ifndef _WIN32
2fe27d5a 114 struct sockaddr_nl local, remote;
22326ba6 115#endif
2c5a6834 116 socklen_t local_size;
d2b9f5b0 117 int rcvbuf;
2fe27d5a
BP
118 int retval = 0;
119
0bd01224 120 if (ovsthread_once_start(&once)) {
cc75061a
BP
121 int save_errno = errno;
122 errno = 0;
123
124 max_iovs = sysconf(_SC_UIO_MAXIOV);
125 if (max_iovs < _XOPEN_IOV_MAX) {
126 if (max_iovs == -1 && errno) {
10a89ef0 127 VLOG_WARN("sysconf(_SC_UIO_MAXIOV): %s", ovs_strerror(errno));
cc75061a
BP
128 }
129 max_iovs = _XOPEN_IOV_MAX;
130 } else if (max_iovs > MAX_IOVS) {
131 max_iovs = MAX_IOVS;
132 }
133
134 errno = save_errno;
0bd01224 135 ovsthread_once_done(&once);
cc75061a
BP
136 }
137
2fe27d5a 138 *sockp = NULL;
488232b7 139 sock = xmalloc(sizeof *sock);
2fe27d5a 140
22326ba6
AS
141#ifdef _WIN32
142 sock->handle = CreateFileA("\\\\.\\OpenVSwitchDevice",
143 GENERIC_READ | GENERIC_WRITE,
144 FILE_SHARE_READ | FILE_SHARE_WRITE,
145 NULL, OPEN_EXISTING,
146 FILE_ATTRIBUTE_NORMAL, NULL);
147
148 int last_error = GetLastError();
149
150 if (sock->handle == INVALID_HANDLE_VALUE) {
151 VLOG_ERR("fcntl: %s", ovs_strerror(last_error));
152 goto error;
153 }
154#else
2fe27d5a
BP
155 sock->fd = socket(AF_NETLINK, SOCK_RAW, protocol);
156 if (sock->fd < 0) {
10a89ef0 157 VLOG_ERR("fcntl: %s", ovs_strerror(errno));
2fe27d5a
BP
158 goto error;
159 }
22326ba6
AS
160#endif
161
7041c3a9 162 sock->protocol = protocol;
7d7447df 163 sock->next_seq = 1;
2fe27d5a 164
d2b9f5b0 165 rcvbuf = 1024 * 1024;
22326ba6
AS
166#ifdef _WIN32
167 sock->rcvbuf = rcvbuf;
ebac7fb7 168 retval = get_sock_pid_from_kernel(sock);
886dd35a
NR
169 if (retval != 0) {
170 goto error;
171 }
22326ba6 172#else
d2b9f5b0
BP
173 if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUFFORCE,
174 &rcvbuf, sizeof rcvbuf)) {
80af5ee5
BP
175 /* Only root can use SO_RCVBUFFORCE. Everyone else gets EPERM.
176 * Warn only if the failure is therefore unexpected. */
f28b6dd3 177 if (errno != EPERM) {
80af5ee5 178 VLOG_WARN_RL(&rl, "setting %d-byte socket receive buffer failed "
10a89ef0 179 "(%s)", rcvbuf, ovs_strerror(errno));
80af5ee5 180 }
d2b9f5b0
BP
181 }
182
cc75061a
BP
183 retval = get_socket_rcvbuf(sock->fd);
184 if (retval < 0) {
185 retval = -retval;
186 goto error;
187 }
188 sock->rcvbuf = retval;
189
2c5a6834 190 /* Connect to kernel (pid 0) as remote address. */
2fe27d5a
BP
191 memset(&remote, 0, sizeof remote);
192 remote.nl_family = AF_NETLINK;
193 remote.nl_pid = 0;
194 if (connect(sock->fd, (struct sockaddr *) &remote, sizeof remote) < 0) {
10a89ef0 195 VLOG_ERR("connect(0): %s", ovs_strerror(errno));
2c5a6834
BP
196 goto error;
197 }
198
199 /* Obtain pid assigned by kernel. */
200 local_size = sizeof local;
201 if (getsockname(sock->fd, (struct sockaddr *) &local, &local_size) < 0) {
10a89ef0 202 VLOG_ERR("getsockname: %s", ovs_strerror(errno));
2c5a6834
BP
203 goto error;
204 }
205 if (local_size < sizeof local || local.nl_family != AF_NETLINK) {
206 VLOG_ERR("getsockname returned bad Netlink name");
207 retval = EINVAL;
208 goto error;
2fe27d5a 209 }
2c5a6834 210 sock->pid = local.nl_pid;
22326ba6 211#endif
2fe27d5a 212
2fe27d5a
BP
213 *sockp = sock;
214 return 0;
215
2fe27d5a
BP
216error:
217 if (retval == 0) {
218 retval = errno;
219 if (retval == 0) {
220 retval = EINVAL;
221 }
222 }
22326ba6
AS
223#ifdef _WIN32
224 if (sock->handle != INVALID_HANDLE_VALUE) {
225 CloseHandle(sock->handle);
226 }
227#else
2fe27d5a
BP
228 if (sock->fd >= 0) {
229 close(sock->fd);
230 }
22326ba6 231#endif
2fe27d5a
BP
232 free(sock);
233 return retval;
234}
235
c6eab56d
BP
236/* Creates a new netlink socket for the same protocol as 'src'. Returns 0 and
237 * sets '*sockp' to the new socket if successful, otherwise returns a positive
238 * errno value. */
239int
240nl_sock_clone(const struct nl_sock *src, struct nl_sock **sockp)
241{
242 return nl_sock_create(src->protocol, sockp);
243}
244
2fe27d5a
BP
245/* Destroys netlink socket 'sock'. */
246void
247nl_sock_destroy(struct nl_sock *sock)
248{
249 if (sock) {
22326ba6
AS
250#ifdef _WIN32
251 CloseHandle(sock->handle);
252#else
a88b4e04 253 close(sock->fd);
22326ba6 254#endif
a88b4e04 255 free(sock);
2fe27d5a
BP
256 }
257}
258
886dd35a
NR
259#ifdef _WIN32
/* Reads the pid for 'sock' generated in the kernel datapath. The function
 * follows a transaction semantic. Eventually this function should call into
 * nl_transact. */
static int
get_sock_pid_from_kernel(struct nl_sock *sock)
{
    struct nl_transaction txn;
    struct ofpbuf request;
    uint64_t request_stub[128];
    struct ofpbuf reply;
    uint64_t reply_stub[128];
    struct ovs_header *ovs_header;
    struct nlmsghdr *nlmsg;
    uint32_t seq;
    int retval;
    DWORD bytes;
    /* Minimum size of a valid reply: netlink + genetlink + OVS headers. */
    int ovs_msg_size = sizeof (struct nlmsghdr) + sizeof (struct genlmsghdr) +
                               sizeof (struct ovs_header);

    /* Stack-backed buffers; ofpbuf_uninit() below is still required in case
     * either buffer grew onto the heap. */
    ofpbuf_use_stub(&request, request_stub, sizeof request_stub);
    txn.request = &request;
    ofpbuf_use_stub(&reply, reply_stub, sizeof reply_stub);
    txn.reply = &reply;

    /* Build an OVS_CTRL_CMD_WIN_GET_PID request and stamp it with a fresh
     * sequence number so the reply can be matched back to this call. */
    seq = nl_sock_allocate_seq(sock, 1);
    nl_msg_put_genlmsghdr(&request, 0, OVS_WIN_NL_CTRL_FAMILY_ID, 0,
                          OVS_CTRL_CMD_WIN_GET_PID, OVS_WIN_CONTROL_VERSION);
    nlmsg = nl_msg_nlmsghdr(txn.request);
    nlmsg->nlmsg_seq = seq;

    ovs_header = ofpbuf_put_uninit(&request, sizeof *ovs_header);
    ovs_header->dp_ifindex = 0;
    /* Reserve space in 'reply' for the kernel to fill in; the returned
     * pointer is intentionally unused. */
    ovs_header = ofpbuf_put_uninit(&reply, ovs_msg_size);

    if (!DeviceIoControl(sock->handle, OVS_IOCTL_TRANSACT,
                         ofpbuf_data(txn.request), ofpbuf_size(txn.request),
                         ofpbuf_data(txn.reply), ofpbuf_size(txn.reply),
                         &bytes, NULL)) {
        /* NOTE(review): collapses every ioctl failure to EINVAL; mapping
         * GetLastError() to a closer errno would aid diagnosis. */
        retval = EINVAL;
        goto done;
    } else {
        /* Reject short replies before touching any header fields. */
        if (bytes < ovs_msg_size) {
            retval = EINVAL;
            goto done;
        }

        nlmsg = nl_msg_nlmsghdr(txn.reply);
        if (nlmsg->nlmsg_seq != seq) {
            retval = EINVAL;
            goto done;
        }
        /* The kernel-assigned port id arrives in nlmsg_pid. */
        sock->pid = nlmsg->nlmsg_pid;
    }
    retval = 0;

done:
    ofpbuf_uninit(&request);
    ofpbuf_uninit(&reply);
    return retval;
}
320#endif /* _WIN32 */
321
cceb11f5
BP
/* Tries to add 'sock' as a listener for 'multicast_group'. Returns 0 if
 * successful, otherwise a positive errno value.
 *
 * A socket that is subscribed to a multicast group that receives asynchronous
 * notifications must not be used for Netlink transactions or dumps, because
 * transactions and dumps can cause notifications to be lost.
 *
 * Multicast group numbers are always positive.
 *
 * It is not an error to attempt to join a multicast group to which a socket
 * already belongs. */
int
nl_sock_join_mcgroup(struct nl_sock *sock, unsigned int multicast_group)
{
#ifdef _WIN32
#define OVS_VPORT_MCGROUP_FALLBACK_ID 33
    struct ofpbuf msg_buf;
    /* Ad-hoc wire format understood by the Windows kernel extension: a
     * netlink header (anonymous member, MS extension) followed by a
     * join/leave flag and the group id. */
    struct message_multicast
    {
        struct nlmsghdr;
        /* if true, join; if else, leave */
        unsigned char join;
        unsigned int groupId;
    };

    struct message_multicast msg = { 0 };

    msg.nlmsg_len = sizeof(struct message_multicast);
    msg.nlmsg_type = OVS_VPORT_MCGROUP_FALLBACK_ID;
    msg.nlmsg_flags = 0;
    msg.nlmsg_seq = 0;
    msg.nlmsg_pid = sock->pid;

    msg.join = 1;
    msg.groupId = multicast_group;
    /* Wrap the stack struct in a borrowed ofpbuf by poking its fields
     * directly, so nl_sock_send__() can transmit it without a copy. */
    msg_buf.base_ = &msg;
    msg_buf.data_ = &msg;
    msg_buf.size_ = msg.nlmsg_len;

    /* NOTE(review): the return value of nl_sock_send__() is ignored here,
     * so a failed join is silently reported as success — confirm intended. */
    nl_sock_send__(sock, &msg_buf, msg.nlmsg_seq, 0);
#else
    /* On Linux, group membership is a plain socket option. */
    if (setsockopt(sock->fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
                   &multicast_group, sizeof multicast_group) < 0) {
        VLOG_WARN("could not join multicast group %u (%s)",
                  multicast_group, ovs_strerror(errno));
        return errno;
    }
#endif
    return 0;
}
372
/* Tries to make 'sock' stop listening to 'multicast_group'. Returns 0 if
 * successful, otherwise a positive errno value.
 *
 * Multicast group numbers are always positive.
 *
 * It is not an error to attempt to leave a multicast group to which a socket
 * does not belong.
 *
 * On success, reading from 'sock' will still return any messages that were
 * received on 'multicast_group' before the group was left. */
int
nl_sock_leave_mcgroup(struct nl_sock *sock, unsigned int multicast_group)
{
#ifdef _WIN32
    struct ofpbuf msg_buf;
    /* Same ad-hoc wire format as nl_sock_join_mcgroup(), minus the group
     * id field (anonymous nlmsghdr is an MS extension). */
    struct message_multicast
    {
        struct nlmsghdr;
        /* if true, join; if else, leave*/
        unsigned char join;
    };

    struct message_multicast msg = { 0 };
    /* NOTE(review): nl_msg_put_nlmsghdr() normally takes an ofpbuf, but is
     * handed the bare struct here; this looks suspect — confirm it matches
     * the intended Windows helper signature. */
    nl_msg_put_nlmsghdr(&msg, sizeof(struct message_multicast),
                        multicast_group, 0);
    msg.join = 0;

    /* Borrowed ofpbuf around the stack struct, as in the join path. */
    msg_buf.base_ = &msg;
    msg_buf.data_ = &msg;
    msg_buf.size_ = msg.nlmsg_len;

    /* NOTE(review): send result ignored; failures are not reported. */
    nl_sock_send__(sock, &msg_buf, msg.nlmsg_seq, 0);
#else
    if (setsockopt(sock->fd, SOL_NETLINK, NETLINK_DROP_MEMBERSHIP,
                   &multicast_group, sizeof multicast_group) < 0) {
        VLOG_WARN("could not leave multicast group %u (%s)",
                  multicast_group, ovs_strerror(errno));
        return errno;
    }
#endif
    return 0;
}
415
c6eab56d 416static int
ff459dd6
BP
417nl_sock_send__(struct nl_sock *sock, const struct ofpbuf *msg,
418 uint32_t nlmsg_seq, bool wait)
2fe27d5a
BP
419{
420 struct nlmsghdr *nlmsg = nl_msg_nlmsghdr(msg);
421 int error;
422
1f317cb5 423 nlmsg->nlmsg_len = ofpbuf_size(msg);
ff459dd6 424 nlmsg->nlmsg_seq = nlmsg_seq;
2fe27d5a
BP
425 nlmsg->nlmsg_pid = sock->pid;
426 do {
427 int retval;
22326ba6 428#ifdef _WIN32
fd972eb8
NR
429 DWORD bytes;
430
431 if (!DeviceIoControl(sock->handle, OVS_IOCTL_WRITE,
432 ofpbuf_data(msg), ofpbuf_size(msg), NULL, 0,
433 &bytes, NULL)) {
22326ba6 434 retval = -1;
fd972eb8
NR
435 /* XXX: Map to a more appropriate error based on GetLastError(). */
436 errno = EINVAL;
437 } else {
438 retval = ofpbuf_size(msg);
22326ba6
AS
439 }
440#else
fd972eb8
NR
441 retval = send(sock->fd, ofpbuf_data(msg), ofpbuf_size(msg),
442 wait ? 0 : MSG_DONTWAIT);
22326ba6 443#endif
2fe27d5a
BP
444 error = retval < 0 ? errno : 0;
445 } while (error == EINTR);
1f317cb5 446 log_nlmsg(__func__, error, ofpbuf_data(msg), ofpbuf_size(msg), sock->protocol);
2fe27d5a
BP
447 if (!error) {
448 COVERAGE_INC(netlink_sent);
449 }
450 return error;
451}
452
c6eab56d 453/* Tries to send 'msg', which must contain a Netlink message, to the kernel on
1f317cb5 454 * 'sock'. nlmsg_len in 'msg' will be finalized to match ofpbuf_size(msg), nlmsg_pid
ff459dd6
BP
455 * will be set to 'sock''s pid, and nlmsg_seq will be initialized to a fresh
456 * sequence number, before the message is sent.
c6eab56d
BP
457 *
458 * Returns 0 if successful, otherwise a positive errno value. If
459 * 'wait' is true, then the send will wait until buffer space is ready;
460 * otherwise, returns EAGAIN if the 'sock' send buffer is full. */
461int
462nl_sock_send(struct nl_sock *sock, const struct ofpbuf *msg, bool wait)
ff459dd6
BP
463{
464 return nl_sock_send_seq(sock, msg, nl_sock_allocate_seq(sock, 1), wait);
465}
466
/* Tries to send 'msg', which must contain a Netlink message, to the kernel on
 * 'sock', exactly like nl_sock_send() except that the caller supplies the
 * sequence number 'nlmsg_seq' instead of having one allocated.
 *
 * Returns 0 if successful, otherwise a positive errno value.  When 'wait' is
 * true the send blocks until buffer space is available; otherwise EAGAIN is
 * returned if 'sock''s send buffer is full.
 *
 * Use this variant to reply to a request received with sequence number
 * 'nlmsg_seq'; for ordinary requests, prefer nl_sock_send(). */
int
nl_sock_send_seq(struct nl_sock *sock, const struct ofpbuf *msg,
                 uint32_t nlmsg_seq, bool wait)
{
    return nl_sock_send__(sock, msg, nlmsg_seq, wait);
}
484
c6eab56d 485static int
72d32ac0 486nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
2fe27d5a 487{
72d32ac0
BP
488 /* We can't accurately predict the size of the data to be received. The
489 * caller is supposed to have allocated enough space in 'buf' to handle the
490 * "typical" case. To handle exceptions, we make available enough space in
491 * 'tail' to allow Netlink messages to be up to 64 kB long (a reasonable
492 * figure since that's the maximum length of a Netlink attribute). */
2fe27d5a 493 struct nlmsghdr *nlmsghdr;
72d32ac0 494 uint8_t tail[65536];
fc999dda 495 struct iovec iov[2];
fc999dda
BP
496 struct msghdr msg;
497 ssize_t retval;
8f20fd98 498 int error;
fc999dda 499
cb22974d 500 ovs_assert(buf->allocated >= sizeof *nlmsghdr);
72d32ac0 501 ofpbuf_clear(buf);
2fe27d5a 502
1f317cb5 503 iov[0].iov_base = ofpbuf_base(buf);
72d32ac0 504 iov[0].iov_len = buf->allocated;
fc999dda 505 iov[1].iov_base = tail;
72d32ac0 506 iov[1].iov_len = sizeof tail;
fc999dda
BP
507
508 memset(&msg, 0, sizeof msg);
509 msg.msg_iov = iov;
510 msg.msg_iovlen = 2;
511
8f20fd98
BP
512 /* Receive a Netlink message from the kernel.
513 *
514 * This works around a kernel bug in which the kernel returns an error code
515 * as if it were the number of bytes read. It doesn't actually modify
516 * anything in the receive buffer in that case, so we can initialize the
517 * Netlink header with an impossible message length and then, upon success,
518 * check whether it changed. */
519 nlmsghdr = ofpbuf_base(buf);
2fe27d5a 520 do {
8f20fd98 521 nlmsghdr->nlmsg_len = UINT32_MAX;
22326ba6 522#ifdef _WIN32
fd972eb8
NR
523 DWORD bytes;
524 if (!DeviceIoControl(sock->handle, OVS_IOCTL_READ,
525 NULL, 0, tail, sizeof tail, &bytes, NULL)) {
22326ba6 526 retval = -1;
fd972eb8 527 errno = EINVAL;
22326ba6 528 } else {
fd972eb8
NR
529 retval = bytes;
530 if (retval == 0) {
531 retval = -1;
532 errno = EAGAIN;
533 } else {
534 if (retval >= buf->allocated) {
535 ofpbuf_reinit(buf, retval);
536 }
537 memcpy(ofpbuf_data(buf), tail, retval);
538 ofpbuf_set_size(buf, retval);
539 }
22326ba6
AS
540 }
541#else
fc999dda 542 retval = recvmsg(sock->fd, &msg, wait ? 0 : MSG_DONTWAIT);
22326ba6 543#endif
8f20fd98
BP
544 error = (retval < 0 ? errno
545 : retval == 0 ? ECONNRESET /* not possible? */
546 : nlmsghdr->nlmsg_len != UINT32_MAX ? 0
7f8e2646 547 : retval);
8f20fd98
BP
548 } while (error == EINTR);
549 if (error) {
fc999dda
BP
550 if (error == ENOBUFS) {
551 /* Socket receive buffer overflow dropped one or more messages that
552 * the kernel tried to send to us. */
553 COVERAGE_INC(netlink_overflow);
554 }
fc999dda 555 return error;
2fe27d5a 556 }
fc999dda 557
2fe27d5a 558 if (msg.msg_flags & MSG_TRUNC) {
34582733 559 VLOG_ERR_RL(&rl, "truncated message (longer than %"PRIuSIZE" bytes)",
72d32ac0 560 sizeof tail);
fc999dda 561 return E2BIG;
2fe27d5a 562 }
2fe27d5a 563
fc999dda 564 if (retval < sizeof *nlmsghdr
2fe27d5a 565 || nlmsghdr->nlmsg_len < sizeof *nlmsghdr
fc999dda 566 || nlmsghdr->nlmsg_len > retval) {
e5e4b47c 567 VLOG_ERR_RL(&rl, "received invalid nlmsg (%"PRIuSIZE" bytes < %"PRIuSIZE")",
72d32ac0 568 retval, sizeof *nlmsghdr);
2fe27d5a
BP
569 return EPROTO;
570 }
22326ba6 571#ifndef _WIN32
1f317cb5 572 ofpbuf_set_size(buf, MIN(retval, buf->allocated));
72d32ac0
BP
573 if (retval > buf->allocated) {
574 COVERAGE_INC(netlink_recv_jumbo);
575 ofpbuf_put(buf, tail, retval - buf->allocated);
576 }
22326ba6 577#endif
72d32ac0 578
1f317cb5 579 log_nlmsg(__func__, 0, ofpbuf_data(buf), ofpbuf_size(buf), sock->protocol);
2fe27d5a
BP
580 COVERAGE_INC(netlink_received);
581
582 return 0;
583}
584
72d32ac0
BP
/* Tries to receive a Netlink message from the kernel on 'sock' into 'buf'.
 * When 'wait' is true, blocks until a message is ready; otherwise fails with
 * EAGAIN if 'sock''s receive queue is empty.
 *
 * 'buf' must have been initialized with at least NLMSG_HDRLEN bytes of
 * allocation; sizing it for a "typical" message gives the best performance.
 *
 * On success, returns 0 with 'buf' holding the received message (the buffer
 * is grown as needed to fit it).  On failure, returns a positive errno value
 * and leaves 'buf' empty, though its allocation is retained.  In every case
 * 'buf''s headroom is reset to 0. */
int
nl_sock_recv(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
{
    return nl_sock_recv__(sock, buf, wait);
}
607
608static void
609nl_sock_record_errors__(struct nl_transaction **transactions, size_t n,
610 int error)
611{
612 size_t i;
613
614 for (i = 0; i < n; i++) {
72d32ac0
BP
615 struct nl_transaction *txn = transactions[i];
616
617 txn->error = error;
618 if (txn->reply) {
619 ofpbuf_clear(txn->reply);
620 }
cc75061a
BP
621 }
622}
623
624static int
625nl_sock_transact_multiple__(struct nl_sock *sock,
626 struct nl_transaction **transactions, size_t n,
627 size_t *done)
628{
72d32ac0
BP
629 uint64_t tmp_reply_stub[1024 / 8];
630 struct nl_transaction tmp_txn;
631 struct ofpbuf tmp_reply;
632
633 uint32_t base_seq;
cc75061a
BP
634 struct iovec iovs[MAX_IOVS];
635 struct msghdr msg;
636 int error;
637 int i;
638
72d32ac0 639 base_seq = nl_sock_allocate_seq(sock, n);
cc75061a
BP
640 *done = 0;
641 for (i = 0; i < n; i++) {
72d32ac0
BP
642 struct nl_transaction *txn = transactions[i];
643 struct nlmsghdr *nlmsg = nl_msg_nlmsghdr(txn->request);
cc75061a 644
1f317cb5 645 nlmsg->nlmsg_len = ofpbuf_size(txn->request);
72d32ac0 646 nlmsg->nlmsg_seq = base_seq + i;
cc75061a 647 nlmsg->nlmsg_pid = sock->pid;
cc75061a 648
1f317cb5
PS
649 iovs[i].iov_base = ofpbuf_data(txn->request);
650 iovs[i].iov_len = ofpbuf_size(txn->request);
cc75061a
BP
651 }
652
653 memset(&msg, 0, sizeof msg);
654 msg.msg_iov = iovs;
655 msg.msg_iovlen = n;
656 do {
22326ba6
AS
657#ifdef _WIN32
658 DWORD last_error = 0;
659 bool result = FALSE;
660 for (i = 0; i < n; i++) {
661 result = WriteFile((HANDLE)sock->handle, iovs[i].iov_base, iovs[i].iov_len,
662 &error, NULL);
663 last_error = GetLastError();
664 if (last_error != ERROR_SUCCESS && !result) {
665 error = EAGAIN;
666 errno = EAGAIN;
667 } else {
668 error = 0;
669 }
670 }
671#else
cc75061a 672 error = sendmsg(sock->fd, &msg, 0) < 0 ? errno : 0;
22326ba6 673#endif
cc75061a
BP
674 } while (error == EINTR);
675
676 for (i = 0; i < n; i++) {
72d32ac0 677 struct nl_transaction *txn = transactions[i];
cc75061a 678
1f317cb5 679 log_nlmsg(__func__, error, ofpbuf_data(txn->request), ofpbuf_size(txn->request),
cc75061a
BP
680 sock->protocol);
681 }
682 if (!error) {
683 COVERAGE_ADD(netlink_sent, n);
684 }
685
686 if (error) {
687 return error;
688 }
689
72d32ac0
BP
690 ofpbuf_use_stub(&tmp_reply, tmp_reply_stub, sizeof tmp_reply_stub);
691 tmp_txn.request = NULL;
692 tmp_txn.reply = &tmp_reply;
693 tmp_txn.error = 0;
cc75061a 694 while (n > 0) {
72d32ac0
BP
695 struct nl_transaction *buf_txn, *txn;
696 uint32_t seq;
697
698 /* Find a transaction whose buffer we can use for receiving a reply.
699 * If no such transaction is left, use tmp_txn. */
700 buf_txn = &tmp_txn;
701 for (i = 0; i < n; i++) {
702 if (transactions[i]->reply) {
703 buf_txn = transactions[i];
704 break;
705 }
706 }
cc75061a 707
72d32ac0
BP
708 /* Receive a reply. */
709 error = nl_sock_recv__(sock, buf_txn->reply, false);
710 if (error) {
711 if (error == EAGAIN) {
712 nl_sock_record_errors__(transactions, n, 0);
713 *done += n;
714 error = 0;
715 }
716 break;
cc75061a
BP
717 }
718
72d32ac0
BP
719 /* Match the reply up with a transaction. */
720 seq = nl_msg_nlmsghdr(buf_txn->reply)->nlmsg_seq;
721 if (seq < base_seq || seq >= base_seq + n) {
722 VLOG_DBG_RL(&rl, "ignoring unexpected seq %#"PRIx32, seq);
cc75061a
BP
723 continue;
724 }
72d32ac0
BP
725 i = seq - base_seq;
726 txn = transactions[i];
cc75061a 727
72d32ac0
BP
728 /* Fill in the results for 'txn'. */
729 if (nl_msg_nlmsgerr(buf_txn->reply, &txn->error)) {
730 if (txn->reply) {
731 ofpbuf_clear(txn->reply);
732 }
733 if (txn->error) {
cc75061a 734 VLOG_DBG_RL(&rl, "received NAK error=%d (%s)",
10a89ef0 735 error, ovs_strerror(txn->error));
cc75061a 736 }
cc75061a 737 } else {
72d32ac0
BP
738 txn->error = 0;
739 if (txn->reply && txn != buf_txn) {
740 /* Swap buffers. */
741 struct ofpbuf *reply = buf_txn->reply;
742 buf_txn->reply = txn->reply;
743 txn->reply = reply;
744 }
cc75061a
BP
745 }
746
72d32ac0
BP
747 /* Fill in the results for transactions before 'txn'. (We have to do
748 * this after the results for 'txn' itself because of the buffer swap
749 * above.) */
750 nl_sock_record_errors__(transactions, i, 0);
751
752 /* Advance. */
cc75061a
BP
753 *done += i + 1;
754 transactions += i + 1;
755 n -= i + 1;
72d32ac0 756 base_seq += i + 1;
cc75061a 757 }
72d32ac0 758 ofpbuf_uninit(&tmp_reply);
cc75061a 759
72d32ac0 760 return error;
cc75061a
BP
761}
762
022ad2b9 763static void
cc75061a
BP
764nl_sock_transact_multiple(struct nl_sock *sock,
765 struct nl_transaction **transactions, size_t n)
766{
767 int max_batch_count;
768 int error;
769
770 if (!n) {
771 return;
772 }
773
cc75061a
BP
774 /* In theory, every request could have a 64 kB reply. But the default and
775 * maximum socket rcvbuf size with typical Dom0 memory sizes both tend to
776 * be a bit below 128 kB, so that would only allow a single message in a
777 * "batch". So we assume that replies average (at most) 4 kB, which allows
778 * a good deal of batching.
779 *
780 * In practice, most of the requests that we batch either have no reply at
781 * all or a brief reply. */
782 max_batch_count = MAX(sock->rcvbuf / 4096, 1);
783 max_batch_count = MIN(max_batch_count, max_iovs);
784
785 while (n > 0) {
786 size_t count, bytes;
787 size_t done;
788
789 /* Batch up to 'max_batch_count' transactions. But cap it at about a
790 * page of requests total because big skbuffs are expensive to
791 * allocate in the kernel. */
792#if defined(PAGESIZE)
793 enum { MAX_BATCH_BYTES = MAX(1, PAGESIZE - 512) };
794#else
795 enum { MAX_BATCH_BYTES = 4096 - 512 };
796#endif
1f317cb5 797 bytes = ofpbuf_size(transactions[0]->request);
cc75061a 798 for (count = 1; count < n && count < max_batch_count; count++) {
1f317cb5 799 if (bytes + ofpbuf_size(transactions[count]->request) > MAX_BATCH_BYTES) {
cc75061a
BP
800 break;
801 }
1f317cb5 802 bytes += ofpbuf_size(transactions[count]->request);
cc75061a
BP
803 }
804
805 error = nl_sock_transact_multiple__(sock, transactions, count, &done);
806 transactions += done;
807 n -= done;
808
809 if (error == ENOBUFS) {
810 VLOG_DBG_RL(&rl, "receive buffer overflow, resending request");
811 } else if (error) {
10a89ef0 812 VLOG_ERR_RL(&rl, "transaction error (%s)", ovs_strerror(error));
cc75061a
BP
813 nl_sock_record_errors__(transactions, n, error);
814 }
815 }
816}
817
022ad2b9 818static int
cc75061a
BP
819nl_sock_transact(struct nl_sock *sock, const struct ofpbuf *request,
820 struct ofpbuf **replyp)
2fe27d5a 821{
cc75061a
BP
822 struct nl_transaction *transactionp;
823 struct nl_transaction transaction;
2fe27d5a 824
ebc56baa 825 transaction.request = CONST_CAST(struct ofpbuf *, request);
72d32ac0 826 transaction.reply = replyp ? ofpbuf_new(1024) : NULL;
cc75061a 827 transactionp = &transaction;
72d32ac0 828
cc75061a 829 nl_sock_transact_multiple(sock, &transactionp, 1);
72d32ac0 830
2fe27d5a 831 if (replyp) {
72d32ac0
BP
832 if (transaction.error) {
833 ofpbuf_delete(transaction.reply);
834 *replyp = NULL;
835 } else {
836 *replyp = transaction.reply;
837 }
2fe27d5a 838 }
72d32ac0 839
cc75061a 840 return transaction.error;
2fe27d5a
BP
841}
842
6b7c12fd
BP
843/* Drain all the messages currently in 'sock''s receive queue. */
844int
845nl_sock_drain(struct nl_sock *sock)
846{
22326ba6
AS
847#ifdef _WIN32
848 return 0;
849#else
6b7c12fd 850 return drain_rcvbuf(sock->fd);
22326ba6 851#endif
6b7c12fd
BP
852}
853
a88b4e04
BP
/* Starts a Netlink "dump" operation, by sending 'request' to the kernel on a
 * Netlink socket created with the given 'protocol', and initializes 'dump' to
 * reflect the state of the operation.
 *
 * 'request' must contain a Netlink message. Before sending the message,
 * nlmsg_len will be finalized to match request->size, and nlmsg_pid will be
 * set to the Netlink socket's pid. NLM_F_DUMP and NLM_F_ACK will be set in
 * nlmsg_flags.
 *
 * The design of this Netlink socket library ensures that the dump is reliable.
 *
 * This function provides no status indication. nl_dump_done() provides an
 * error status for the entire dump operation.
 *
 * The caller must eventually destroy 'request'.
 */
void
nl_dump_start(struct nl_dump *dump, int protocol, const struct ofpbuf *request)
{
    nl_msg_nlmsghdr(request)->nlmsg_flags |= NLM_F_DUMP | NLM_F_ACK;

    /* The mutex serializes readers of this dump; hold it while setting up
     * the socket and recording initial status. */
    ovs_mutex_init(&dump->mutex);
    ovs_mutex_lock(&dump->mutex);
    /* Borrow a pooled socket for 'protocol'; any allocation or send failure
     * is stored in 'status' for nl_dump_done() to report. */
    dump->status = nl_pool_alloc(protocol, &dump->sock);
    if (!dump->status) {
        dump->status = nl_sock_send__(dump->sock, request,
                                      nl_sock_allocate_seq(dump->sock, 1),
                                      true);
    }
    /* Remember the request's sequence number so replies can be filtered;
     * nl_sock_send__() stamped it into the header just above. */
    dump->nl_seq = nl_msg_nlmsghdr(request)->nlmsg_seq;
    ovs_mutex_unlock(&dump->mutex);
}
886
/* Refills 'buffer' with the next batch of dump replies from 'dump''s socket,
 * skipping messages whose sequence number does not match this dump.  Returns
 * 0 on success, EOF when the dump is exhausted, or a positive errno value
 * (including any error code carried in an NLMSG_ERROR reply). */
static int
nl_dump_refill(struct nl_dump *dump, struct ofpbuf *buffer)
    OVS_REQUIRES(dump->mutex)
{
    struct nlmsghdr *nlmsghdr;
    int error;

    /* Loop until 'buffer' holds at least one message for this dump. */
    while (!ofpbuf_size(buffer)) {
        error = nl_sock_recv__(dump->sock, buffer, false);
        if (error) {
            /* The kernel never blocks providing the results of a dump, so
             * error == EAGAIN means that we've read the whole thing, and
             * therefore transform it into EOF. (The kernel always provides
             * NLMSG_DONE as a sentinel. Some other thread must have received
             * that already but not yet signaled it in 'status'.)
             *
             * Any other error is just an error. */
            return error == EAGAIN ? EOF : error;
        }

        nlmsghdr = nl_msg_nlmsghdr(buffer);
        if (dump->nl_seq != nlmsghdr->nlmsg_seq) {
            /* Stale reply from an earlier use of this pooled socket; drop it
             * and keep reading. */
            VLOG_DBG_RL(&rl, "ignoring seq %#"PRIx32" != expected %#"PRIx32,
                        nlmsghdr->nlmsg_seq, dump->nl_seq);
            ofpbuf_clear(buffer);
        }
    }

    /* A NLMSG_ERROR reply with a nonzero code means the dump request itself
     * was rejected; surface that code to the caller. */
    if (nl_msg_nlmsgerr(buffer, &error) && error) {
        VLOG_INFO_RL(&rl, "netlink dump request error (%s)",
                     ovs_strerror(error));
        ofpbuf_clear(buffer);
        return error;
    }

    return 0;
}
924
925static int
926nl_dump_next__(struct ofpbuf *reply, struct ofpbuf *buffer)
927{
928 struct nlmsghdr *nlmsghdr = nl_msg_next(buffer, reply);
929 if (!nlmsghdr) {
930 VLOG_WARN_RL(&rl, "netlink dump contains message fragment");
931 return EPROTO;
932 } else if (nlmsghdr->nlmsg_type == NLMSG_DONE) {
933 return EOF;
934 } else {
935 return 0;
936 }
2fe27d5a
BP
937}
938
/* Attempts to retrieve another reply from 'dump' into 'buffer'. 'dump' must
 * have been initialized with nl_dump_start(), and 'buffer' must have been
 * initialized. 'buffer' should be at least NL_DUMP_BUFSIZE bytes long.
 *
 * If successful, returns true and points 'reply->data' and
 * 'ofpbuf_size(reply)' to the message that was retrieved. The caller must not
 * modify 'reply' (because it points within 'buffer', which will be used by
 * future calls to this function).
 *
 * On failure, returns false and sets 'reply->data' to NULL and
 * 'ofpbuf_size(reply)' to 0.  Failure might indicate an actual error or merely
 * the end of replies.  An error status for the entire dump operation is
 * provided when it is completed by calling nl_dump_done().
 *
 * Multiple threads may call this function, passing the same nl_dump, however
 * each must provide independent buffers. This function may cache multiple
 * replies in the buffer, and these will be processed before more replies are
 * fetched. When this function returns false, other threads may continue to
 * process replies in their buffers, but they will not fetch more replies.
 */
bool
nl_dump_next(struct nl_dump *dump, struct ofpbuf *reply, struct ofpbuf *buffer)
{
    int retval = 0;

    /* If the buffer is empty, refill it.
     *
     * If the buffer is not empty, we don't check the dump's status.
     * Otherwise, we could end up skipping some of the dump results if thread A
     * hits EOF while thread B is in the midst of processing a batch. */
    if (!ofpbuf_size(buffer)) {
        ovs_mutex_lock(&dump->mutex);
        if (!dump->status) {
            /* Take the mutex here to avoid an in-kernel race.  If two threads
             * try to read from a Netlink dump socket at once, then the socket
             * error can be set to EINVAL, which will be encountered on the
             * next recv on that socket, which could be anywhere due to the way
             * that we pool Netlink sockets.  Serializing the recv calls avoids
             * the issue. */
            dump->status = nl_dump_refill(dump, buffer);
        }
        retval = dump->status;
        ovs_mutex_unlock(&dump->mutex);
    }

    /* Fetch the next message from the buffer. */
    if (!retval) {
        retval = nl_dump_next__(reply, buffer);
        if (retval) {
            /* Record 'retval' as the dump status, but don't overwrite an error
             * with EOF.  (Errors are positive; EOF is negative in the usual
             * <stdio.h> convention, and 0 means "still in progress".) */
            ovs_mutex_lock(&dump->mutex);
            if (dump->status <= 0) {
                dump->status = retval;
            }
            ovs_mutex_unlock(&dump->mutex);
        }
    }

    /* On any failure, clear 'reply' so the caller cannot mistake stale data
     * for a retrieved message. */
    if (retval) {
        ofpbuf_set_data(reply, NULL);
        ofpbuf_set_size(reply, 0);
    }
    return !retval;
}
1004
/* Completes Netlink dump operation 'dump', which must have been initialized
 * with nl_dump_start().  Returns 0 if the dump operation was error-free,
 * otherwise a positive errno value describing the problem. */
int
nl_dump_done(struct nl_dump *dump)
{
    int status;

    ovs_mutex_lock(&dump->mutex);
    status = dump->status;
    ovs_mutex_unlock(&dump->mutex);

    /* Drain any remaining messages that the client didn't read.  Otherwise the
     * kernel will continue to queue them up and waste buffer space.
     *
     * XXX We could just destroy and discard the socket in this case. */
    if (!status) {
        uint64_t tmp_reply_stub[NL_DUMP_BUFSIZE / 8];
        struct ofpbuf reply, buf;

        ofpbuf_use_stub(&buf, tmp_reply_stub, sizeof tmp_reply_stub);
        while (nl_dump_next(dump, &reply, &buf)) {
            /* Nothing to do. */
        }
        ofpbuf_uninit(&buf);

        /* Draining must have ended the dump with EOF or an error, so the
         * status cannot still be 0 here. */
        ovs_mutex_lock(&dump->mutex);
        status = dump->status;
        ovs_mutex_unlock(&dump->mutex);
        ovs_assert(status);
    }

    /* Return the pooled socket and tear down the dump's mutex. */
    nl_pool_release(dump->sock);
    ovs_mutex_destroy(&dump->mutex);

    /* EOF is how a successful dump terminates; report it as success. */
    return status == EOF ? 0 : status;
}
1042
/* Causes poll_block() to wake up when any of the specified 'events' (which is
 * a OR'd combination of POLLIN, POLLOUT, etc.) occur on 'sock'. */
void
nl_sock_wait(const struct nl_sock *sock, short int events)
{
#ifdef _WIN32
    /* On Windows the nl_sock carries a 'handle' rather than a POSIX fd. */
    poll_fd_wait(sock->handle, events);
#else
    poll_fd_wait(sock->fd, events);
#endif
}
50802adb 1054
/* Returns the underlying fd for 'sock', for use in "poll()"-like operations
 * that can't use nl_sock_wait().
 *
 * It's a little tricky to use the returned fd correctly, because nl_sock does
 * "copy on write" to allow a single nl_sock to be used for notifications,
 * transactions, and dumps.  If 'sock' is used only for notifications and
 * transactions (and never for dump) then the usage is safe. */
int
nl_sock_fd(const struct nl_sock *sock)
{
#ifdef _WIN32
    /* On Windows the nl_sock carries a 'handle' rather than a POSIX fd. */
    return sock->handle;
#else
    return sock->fd;
#endif
}
1071
/* Returns the PID (Netlink port ID) associated with this socket. */
uint32_t
nl_sock_pid(const struct nl_sock *sock)
{
    return sock->pid;
}
2fe27d5a
BP
1078\f
1079/* Miscellaneous. */
1080
2ad204c8
BP
/* A cached mapping from a Generic Netlink family's numeric ID to its name,
 * used to print family names in log messages. */
struct genl_family {
    struct hmap_node hmap_node;  /* In 'genl_families', hashed on 'id'. */
    uint16_t id;                 /* Numeric family ID. */
    char *name;                  /* Malloc'd family name. */
};

/* All known Generic Netlink families, keyed by hash_int(id, 0). */
static struct hmap genl_families = HMAP_INITIALIZER(&genl_families);
1088
2fe27d5a
BP
/* Policy for parsing a CTRL_CMD_NEWFAMILY reply from the Generic Netlink
 * controller: the family ID is required, the multicast group list is not. */
static const struct nl_policy family_policy[CTRL_ATTR_MAX + 1] = {
    [CTRL_ATTR_FAMILY_ID] = {.type = NL_A_U16},
    [CTRL_ATTR_MCAST_GROUPS] = {.type = NL_A_NESTED, .optional = true},
};
1093
2ad204c8
BP
1094static struct genl_family *
1095find_genl_family_by_id(uint16_t id)
1096{
1097 struct genl_family *family;
1098
1099 HMAP_FOR_EACH_IN_BUCKET (family, hmap_node, hash_int(id, 0),
1100 &genl_families) {
1101 if (family->id == id) {
1102 return family;
1103 }
1104 }
1105 return NULL;
1106}
1107
1108static void
1109define_genl_family(uint16_t id, const char *name)
1110{
1111 struct genl_family *family = find_genl_family_by_id(id);
1112
1113 if (family) {
1114 if (!strcmp(family->name, name)) {
1115 return;
1116 }
1117 free(family->name);
1118 } else {
1119 family = xmalloc(sizeof *family);
1120 family->id = id;
1121 hmap_insert(&genl_families, &family->hmap_node, hash_int(id, 0));
1122 }
1123 family->name = xstrdup(name);
1124}
1125
1126static const char *
1127genl_family_to_name(uint16_t id)
1128{
1129 if (id == GENL_ID_CTRL) {
1130 return "control";
1131 } else {
1132 struct genl_family *family = find_genl_family_by_id(id);
1133 return family ? family->name : "unknown";
1134 }
1135}
1136
b3fca8a8 1137#ifndef _WIN32
e408762f 1138static int
2a477244
BP
1139do_lookup_genl_family(const char *name, struct nlattr **attrs,
1140 struct ofpbuf **replyp)
2fe27d5a
BP
1141{
1142 struct nl_sock *sock;
1143 struct ofpbuf request, *reply;
2a477244 1144 int error;
2fe27d5a 1145
2a477244
BP
1146 *replyp = NULL;
1147 error = nl_sock_create(NETLINK_GENERIC, &sock);
1148 if (error) {
1149 return error;
2fe27d5a
BP
1150 }
1151
1152 ofpbuf_init(&request, 0);
1153 nl_msg_put_genlmsghdr(&request, 0, GENL_ID_CTRL, NLM_F_REQUEST,
1154 CTRL_CMD_GETFAMILY, 1);
1155 nl_msg_put_string(&request, CTRL_ATTR_FAMILY_NAME, name);
2a477244 1156 error = nl_sock_transact(sock, &request, &reply);
2fe27d5a 1157 ofpbuf_uninit(&request);
2a477244 1158 if (error) {
2fe27d5a 1159 nl_sock_destroy(sock);
2a477244 1160 return error;
2fe27d5a
BP
1161 }
1162
1163 if (!nl_policy_parse(reply, NLMSG_HDRLEN + GENL_HDRLEN,
2a477244
BP
1164 family_policy, attrs, ARRAY_SIZE(family_policy))
1165 || nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]) == 0) {
2fe27d5a
BP
1166 nl_sock_destroy(sock);
1167 ofpbuf_delete(reply);
2a477244 1168 return EPROTO;
2fe27d5a
BP
1169 }
1170
2fe27d5a 1171 nl_sock_destroy(sock);
2a477244
BP
1172 *replyp = reply;
1173 return 0;
2fe27d5a 1174}
b3fca8a8
NR
1175#else
1176static int
1177do_lookup_genl_family(const char *name, struct nlattr **attrs,
1178 struct ofpbuf **replyp)
1179{
1180 struct nl_sock *sock;
4c484aca 1181 struct nlmsghdr *nlmsg;
b3fca8a8
NR
1182 struct ofpbuf *reply;
1183 int error;
1184 uint16_t family_id;
1185 const char *family_name;
1186 uint32_t family_version;
1187 uint32_t family_attrmax;
4c484aca
NR
1188 uint32_t mcgrp_id = OVS_WIN_NL_INVALID_MCGRP_ID;
1189 const char *mcgrp_name = NULL;
b3fca8a8
NR
1190
1191 *replyp = NULL;
1192 reply = ofpbuf_new(1024);
1193
4c484aca 1194 /* CTRL_ATTR_MCAST_GROUPS is supported only for VPORT family. */
b3fca8a8
NR
1195 if (!strcmp(name, OVS_WIN_CONTROL_FAMILY)) {
1196 family_id = OVS_WIN_NL_CTRL_FAMILY_ID;
1197 family_name = OVS_WIN_CONTROL_FAMILY;
1198 family_version = OVS_WIN_CONTROL_VERSION;
1199 family_attrmax = OVS_WIN_CONTROL_ATTR_MAX;
1200 } else if (!strcmp(name, OVS_DATAPATH_FAMILY)) {
1201 family_id = OVS_WIN_NL_DATAPATH_FAMILY_ID;
1202 family_name = OVS_DATAPATH_FAMILY;
1203 family_version = OVS_DATAPATH_VERSION;
1204 family_attrmax = OVS_DP_ATTR_MAX;
1205 } else if (!strcmp(name, OVS_PACKET_FAMILY)) {
1206 family_id = OVS_WIN_NL_PACKET_FAMILY_ID;
1207 family_name = OVS_PACKET_FAMILY;
1208 family_version = OVS_PACKET_VERSION;
1209 family_attrmax = OVS_PACKET_ATTR_MAX;
1210 } else if (!strcmp(name, OVS_VPORT_FAMILY)) {
1211 family_id = OVS_WIN_NL_VPORT_FAMILY_ID;
1212 family_name = OVS_VPORT_FAMILY;
1213 family_version = OVS_VPORT_VERSION;
1214 family_attrmax = OVS_VPORT_ATTR_MAX;
4c484aca
NR
1215 mcgrp_id = OVS_WIN_NL_VPORT_MCGRP_ID;
1216 mcgrp_name = OVS_VPORT_MCGROUP;
b3fca8a8
NR
1217 } else if (!strcmp(name, OVS_FLOW_FAMILY)) {
1218 family_id = OVS_WIN_NL_FLOW_FAMILY_ID;
1219 family_name = OVS_FLOW_FAMILY;
1220 family_version = OVS_FLOW_VERSION;
1221 family_attrmax = OVS_FLOW_ATTR_MAX;
1222 } else {
1223 ofpbuf_delete(reply);
1224 return EINVAL;
1225 }
1226
1227 nl_msg_put_genlmsghdr(reply, 0, GENL_ID_CTRL, 0,
1228 CTRL_CMD_NEWFAMILY, family_version);
1229 /* CTRL_ATTR_HDRSIZE and CTRL_ATTR_OPS are not populated, but the
1230 * callers do not seem to need them. */
1231 nl_msg_put_u16(reply, CTRL_ATTR_FAMILY_ID, family_id);
1232 nl_msg_put_string(reply, CTRL_ATTR_FAMILY_NAME, family_name);
1233 nl_msg_put_u32(reply, CTRL_ATTR_VERSION, family_version);
1234 nl_msg_put_u32(reply, CTRL_ATTR_MAXATTR, family_attrmax);
1235
4c484aca
NR
1236 if (mcgrp_id != OVS_WIN_NL_INVALID_MCGRP_ID) {
1237 size_t mcgrp_ofs1 = nl_msg_start_nested(reply, CTRL_ATTR_MCAST_GROUPS);
1238 size_t mcgrp_ofs2= nl_msg_start_nested(reply,
1239 OVS_WIN_NL_VPORT_MCGRP_ID - OVS_WIN_NL_MCGRP_START_ID);
1240 nl_msg_put_u32(reply, CTRL_ATTR_MCAST_GRP_ID, mcgrp_id);
1241 ovs_assert(mcgrp_name != NULL);
1242 nl_msg_put_string(reply, CTRL_ATTR_MCAST_GRP_NAME, mcgrp_name);
1243 nl_msg_end_nested(reply, mcgrp_ofs2);
1244 nl_msg_end_nested(reply, mcgrp_ofs1);
1245 }
1246
1247 /* Set the total length of the netlink message. */
1248 nlmsg = nl_msg_nlmsghdr(reply);
1249 nlmsg->nlmsg_len = ofpbuf_size(reply);
1250
b3fca8a8
NR
1251 if (!nl_policy_parse(reply, NLMSG_HDRLEN + GENL_HDRLEN,
1252 family_policy, attrs, ARRAY_SIZE(family_policy))
1253 || nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]) == 0) {
1254 nl_sock_destroy(sock);
1255 ofpbuf_delete(reply);
1256 return EPROTO;
1257 }
1258
1259 *replyp = reply;
1260 return 0;
1261}
1262#endif
2fe27d5a 1263
/* Finds the multicast group called 'group_name' in genl family 'family_name'.
 * When successful, writes its result to 'multicast_group' and returns 0.
 * Otherwise, clears 'multicast_group' and returns a positive error code.
 */
int
nl_lookup_genl_mcgroup(const char *family_name, const char *group_name,
                       unsigned int *multicast_group)
{
    struct nlattr *family_attrs[ARRAY_SIZE(family_policy)];
    const struct nlattr *mc;
    struct ofpbuf *reply;
    unsigned int left;
    int error;

    *multicast_group = 0;
    error = do_lookup_genl_family(family_name, family_attrs, &reply);
    if (error) {
        return error;
    }

    /* A family without a multicast group list cannot contain 'group_name'. */
    if (!family_attrs[CTRL_ATTR_MCAST_GROUPS]) {
        error = EPROTO;
        goto exit;
    }

    /* Walk the nested multicast-group attributes looking for a name match. */
    NL_NESTED_FOR_EACH (mc, left, family_attrs[CTRL_ATTR_MCAST_GROUPS]) {
        static const struct nl_policy mc_policy[] = {
            [CTRL_ATTR_MCAST_GRP_ID] = {.type = NL_A_U32},
            [CTRL_ATTR_MCAST_GRP_NAME] = {.type = NL_A_STRING},
        };

        struct nlattr *mc_attrs[ARRAY_SIZE(mc_policy)];
        const char *mc_name;

        if (!nl_parse_nested(mc, mc_policy, mc_attrs, ARRAY_SIZE(mc_policy))) {
            error = EPROTO;
            goto exit;
        }

        mc_name = nl_attr_get_string(mc_attrs[CTRL_ATTR_MCAST_GRP_NAME]);
        if (!strcmp(group_name, mc_name)) {
            *multicast_group =
                nl_attr_get_u32(mc_attrs[CTRL_ATTR_MCAST_GRP_ID]);
            error = 0;
            goto exit;
        }
    }
    /* No group with the requested name was found. */
    error = EPROTO;

exit:
    ofpbuf_delete(reply);
    return error;
}
1317
2fe27d5a
BP
1318/* If '*number' is 0, translates the given Generic Netlink family 'name' to a
1319 * number and stores it in '*number'. If successful, returns 0 and the caller
1320 * may use '*number' as the family number. On failure, returns a positive
1321 * errno value and '*number' caches the errno value. */
1322int
1323nl_lookup_genl_family(const char *name, int *number)
1324{
1325 if (*number == 0) {
2a477244
BP
1326 struct nlattr *attrs[ARRAY_SIZE(family_policy)];
1327 struct ofpbuf *reply;
1328 int error;
1329
1330 error = do_lookup_genl_family(name, attrs, &reply);
1331 if (!error) {
1332 *number = nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]);
1333 define_genl_family(*number, name);
1334 } else {
1335 *number = -error;
1336 }
1337 ofpbuf_delete(reply);
1338
cb22974d 1339 ovs_assert(*number != 0);
2fe27d5a
BP
1340 }
1341 return *number > 0 ? 0 : -*number;
1342}
a88b4e04
BP
1343\f
/* A small per-protocol cache of Netlink sockets, so that transactions and
 * dumps can reuse sockets instead of creating and destroying one each time. */
struct nl_pool {
    struct nl_sock *socks[16];  /* Cached sockets available for reuse. */
    int n;                      /* Number of valid entries in 'socks'. */
};

/* Guards 'pools'; one pool per Netlink protocol number. */
static struct ovs_mutex pool_mutex = OVS_MUTEX_INITIALIZER;
static struct nl_pool pools[MAX_LINKS] OVS_GUARDED_BY(pool_mutex);
a88b4e04
BP
1351
1352static int
1353nl_pool_alloc(int protocol, struct nl_sock **sockp)
1354{
0bd01224 1355 struct nl_sock *sock = NULL;
a88b4e04
BP
1356 struct nl_pool *pool;
1357
1358 ovs_assert(protocol >= 0 && protocol < ARRAY_SIZE(pools));
1359
97be1538 1360 ovs_mutex_lock(&pool_mutex);
a88b4e04
BP
1361 pool = &pools[protocol];
1362 if (pool->n > 0) {
0bd01224
BP
1363 sock = pool->socks[--pool->n];
1364 }
97be1538 1365 ovs_mutex_unlock(&pool_mutex);
0bd01224
BP
1366
1367 if (sock) {
1368 *sockp = sock;
a88b4e04
BP
1369 return 0;
1370 } else {
1371 return nl_sock_create(protocol, sockp);
1372 }
1373}
1374
1375static void
1376nl_pool_release(struct nl_sock *sock)
1377{
1378 if (sock) {
1379 struct nl_pool *pool = &pools[sock->protocol];
1380
97be1538 1381 ovs_mutex_lock(&pool_mutex);
a88b4e04
BP
1382 if (pool->n < ARRAY_SIZE(pool->socks)) {
1383 pool->socks[pool->n++] = sock;
0bd01224 1384 sock = NULL;
a88b4e04 1385 }
97be1538 1386 ovs_mutex_unlock(&pool_mutex);
0bd01224
BP
1387
1388 nl_sock_destroy(sock);
a88b4e04
BP
1389 }
1390}
1391
/* Sends 'request' to the kernel on a Netlink socket for the given 'protocol'
 * (e.g. NETLINK_ROUTE or NETLINK_GENERIC) and waits for a response.  If
 * successful, returns 0.  On failure, returns a positive errno value.
 *
 * If 'replyp' is nonnull, then on success '*replyp' is set to the kernel's
 * reply, which the caller is responsible for freeing with ofpbuf_delete(), and
 * on failure '*replyp' is set to NULL.  If 'replyp' is null, then the kernel's
 * reply, if any, is discarded.
 *
 * Before the message is sent, nlmsg_len in 'request' will be finalized to
 * match ofpbuf_size(msg), nlmsg_pid will be set to the pid of the socket used
 * for sending the request, and nlmsg_seq will be initialized.
 *
 * The caller is responsible for destroying 'request'.
 *
 * Bare Netlink is an unreliable transport protocol.  This function layers
 * reliable delivery and reply semantics on top of bare Netlink.
 *
 * In Netlink, sending a request to the kernel is reliable enough, because the
 * kernel will tell us if the message cannot be queued (and we will in that
 * case put it on the transmit queue and wait until it can be delivered).
 *
 * Receiving the reply is the real problem: if the socket buffer is full when
 * the kernel tries to send the reply, the reply will be dropped.  However, the
 * kernel sets a flag that a reply has been dropped.  The next call to recv
 * then returns ENOBUFS.  We can then re-send the request.
 *
 * Caveats:
 *
 *      1. Netlink depends on sequence numbers to match up requests and
 *         replies.  The sender of a request supplies a sequence number, and
 *         the reply echos back that sequence number.
 *
 *         This is fine, but (1) some kernel netlink implementations are
 *         broken, in that they fail to echo sequence numbers and (2) this
 *         function will drop packets with non-matching sequence numbers, so
 *         that only a single request can be usefully transacted at a time.
 *
 *      2. Resending the request causes it to be re-executed, so the request
 *         needs to be idempotent.
 */
int
nl_transact(int protocol, const struct ofpbuf *request,
            struct ofpbuf **replyp)
{
    struct nl_sock *sock;
    int error;

    /* Borrow a pooled socket for the duration of the transaction. */
    error = nl_pool_alloc(protocol, &sock);
    if (error) {
        *replyp = NULL;
        return error;
    }

    error = nl_sock_transact(sock, request, replyp);

    nl_pool_release(sock);
    return error;
}
1451
/* Sends the 'request' member of the 'n' transactions in 'transactions' on a
 * Netlink socket for the given 'protocol' (e.g. NETLINK_ROUTE or
 * NETLINK_GENERIC), in order, and receives responses to all of them.  Fills in
 * the 'error' member of each transaction with 0 if it was successful,
 * otherwise with a positive errno value.  If 'reply' is nonnull, then it will
 * be filled with the reply if the message receives a detailed reply.  In other
 * cases, i.e. where the request failed or had no reply beyond an indication of
 * success, 'reply' will be cleared if it is nonnull.
 *
 * The caller is responsible for destroying each request and reply, and the
 * transactions array itself.
 *
 * Before sending each message, this function will finalize nlmsg_len in each
 * 'request' to match the ofpbuf's size, set nlmsg_pid to the pid of the socket
 * used for the transaction, and initialize nlmsg_seq.
 *
 * Bare Netlink is an unreliable transport protocol.  This function layers
 * reliable delivery and reply semantics on top of bare Netlink.  See
 * nl_transact() for some caveats.
 */
void
nl_transact_multiple(int protocol,
                     struct nl_transaction **transactions, size_t n)
{
    struct nl_sock *sock;
    int error;

    error = nl_pool_alloc(protocol, &sock);
    if (!error) {
        nl_sock_transact_multiple(sock, transactions, n);
        nl_pool_release(sock);
    } else {
        /* Could not get a socket at all: mark every transaction failed. */
        nl_sock_record_errors__(transactions, n, error);
    }
}
1487
2fe27d5a 1488\f
7d7447df
BP
1489static uint32_t
1490nl_sock_allocate_seq(struct nl_sock *sock, unsigned int n)
1491{
1492 uint32_t seq = sock->next_seq;
1493
1494 sock->next_seq += n;
1495
1496 /* Make it impossible for the next request for sequence numbers to wrap
1497 * around to 0. Start over with 1 to avoid ever using a sequence number of
1498 * 0, because the kernel uses sequence number 0 for notifications. */
1499 if (sock->next_seq >= UINT32_MAX / 2) {
1500 sock->next_seq = 1;
1501 }
1502
1503 return seq;
1504}
1505
2fe27d5a 1506static void
2ad204c8 1507nlmsghdr_to_string(const struct nlmsghdr *h, int protocol, struct ds *ds)
2fe27d5a
BP
1508{
1509 struct nlmsg_flag {
1510 unsigned int bits;
1511 const char *name;
1512 };
1513 static const struct nlmsg_flag flags[] = {
1514 { NLM_F_REQUEST, "REQUEST" },
1515 { NLM_F_MULTI, "MULTI" },
1516 { NLM_F_ACK, "ACK" },
1517 { NLM_F_ECHO, "ECHO" },
1518 { NLM_F_DUMP, "DUMP" },
1519 { NLM_F_ROOT, "ROOT" },
1520 { NLM_F_MATCH, "MATCH" },
1521 { NLM_F_ATOMIC, "ATOMIC" },
1522 };
1523 const struct nlmsg_flag *flag;
1524 uint16_t flags_left;
1525
1526 ds_put_format(ds, "nl(len:%"PRIu32", type=%"PRIu16,
1527 h->nlmsg_len, h->nlmsg_type);
1528 if (h->nlmsg_type == NLMSG_NOOP) {
1529 ds_put_cstr(ds, "(no-op)");
1530 } else if (h->nlmsg_type == NLMSG_ERROR) {
1531 ds_put_cstr(ds, "(error)");
1532 } else if (h->nlmsg_type == NLMSG_DONE) {
1533 ds_put_cstr(ds, "(done)");
1534 } else if (h->nlmsg_type == NLMSG_OVERRUN) {
1535 ds_put_cstr(ds, "(overrun)");
1536 } else if (h->nlmsg_type < NLMSG_MIN_TYPE) {
1537 ds_put_cstr(ds, "(reserved)");
2ad204c8
BP
1538 } else if (protocol == NETLINK_GENERIC) {
1539 ds_put_format(ds, "(%s)", genl_family_to_name(h->nlmsg_type));
2fe27d5a
BP
1540 } else {
1541 ds_put_cstr(ds, "(family-defined)");
1542 }
1543 ds_put_format(ds, ", flags=%"PRIx16, h->nlmsg_flags);
1544 flags_left = h->nlmsg_flags;
1545 for (flag = flags; flag < &flags[ARRAY_SIZE(flags)]; flag++) {
1546 if ((flags_left & flag->bits) == flag->bits) {
1547 ds_put_format(ds, "[%s]", flag->name);
1548 flags_left &= ~flag->bits;
1549 }
1550 }
1551 if (flags_left) {
1552 ds_put_format(ds, "[OTHER:%"PRIx16"]", flags_left);
1553 }
2c5a6834
BP
1554 ds_put_format(ds, ", seq=%"PRIx32", pid=%"PRIu32,
1555 h->nlmsg_seq, h->nlmsg_pid);
2fe27d5a
BP
1556}
1557
/* Returns a malloc'd, human-readable string describing the Netlink message at
 * the start of 'buffer', interpreted according to 'protocol'.  The caller
 * must free the returned string. */
static char *
nlmsg_to_string(const struct ofpbuf *buffer, int protocol)
{
    struct ds ds = DS_EMPTY_INITIALIZER;
    const struct nlmsghdr *h = ofpbuf_at(buffer, 0, NLMSG_HDRLEN);
    if (h) {
        nlmsghdr_to_string(h, protocol, &ds);
        if (h->nlmsg_type == NLMSG_ERROR) {
            /* Decode the embedded nlmsgerr, including the header of the
             * request it replies to, if the payload is not truncated. */
            const struct nlmsgerr *e;
            e = ofpbuf_at(buffer, NLMSG_HDRLEN,
                          NLMSG_ALIGN(sizeof(struct nlmsgerr)));
            if (e) {
                ds_put_format(&ds, " error(%d", e->error);
                if (e->error < 0) {
                    ds_put_format(&ds, "(%s)", ovs_strerror(-e->error));
                }
                ds_put_cstr(&ds, ", in-reply-to(");
                nlmsghdr_to_string(&e->msg, protocol, &ds);
                ds_put_cstr(&ds, "))");
            } else {
                ds_put_cstr(&ds, " error(truncated)");
            }
        } else if (h->nlmsg_type == NLMSG_DONE) {
            /* NLMSG_DONE carries an int status as its payload. */
            int *error = ofpbuf_at(buffer, NLMSG_HDRLEN, sizeof *error);
            if (error) {
                ds_put_format(&ds, " done(%d", *error);
                if (*error < 0) {
                    ds_put_format(&ds, "(%s)", ovs_strerror(-*error));
                }
                ds_put_cstr(&ds, ")");
            } else {
                ds_put_cstr(&ds, " done(truncated)");
            }
        } else if (protocol == NETLINK_GENERIC) {
            /* For Generic Netlink, also show the command and version from
             * the genl header, when present. */
            struct genlmsghdr *genl = nl_msg_genlmsghdr(buffer);
            if (genl) {
                ds_put_format(&ds, ",genl(cmd=%"PRIu8",version=%"PRIu8")",
                              genl->cmd, genl->version);
            }
        }
    } else {
        ds_put_cstr(&ds, "nl(truncated)");
    }
    return ds.string;
}
1603
1604static void
1605log_nlmsg(const char *function, int error,
7041c3a9 1606 const void *message, size_t size, int protocol)
2fe27d5a
BP
1607{
1608 struct ofpbuf buffer;
1609 char *nlmsg;
1610
1611 if (!VLOG_IS_DBG_ENABLED()) {
1612 return;
1613 }
1614
1615 ofpbuf_use_const(&buffer, message, size);
7041c3a9 1616 nlmsg = nlmsg_to_string(&buffer, protocol);
10a89ef0 1617 VLOG_DBG_RL(&rl, "%s (%s): %s", function, ovs_strerror(error), nlmsg);
2fe27d5a
BP
1618 free(nlmsg);
1619}