/*
 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2016 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
18 #include "netlink-socket.h"
22 #include <sys/types.h>
26 #include "openvswitch/dynamic-string.h"
28 #include "openvswitch/hmap.h"
30 #include "netlink-protocol.h"
31 #include "odp-netlink.h"
32 #include "openvswitch/ofpbuf.h"
33 #include "ovs-thread.h"
34 #include "openvswitch/poll-loop.h"
36 #include "socket-util.h"
38 #include "openvswitch/vlog.h"
/* Logging module and coverage counters for this translation unit. */
VLOG_DEFINE_THIS_MODULE(netlink_socket);

COVERAGE_DEFINE(netlink_overflow);
COVERAGE_DEFINE(netlink_received);
COVERAGE_DEFINE(netlink_recv_jumbo);
COVERAGE_DEFINE(netlink_sent);
/* Linux header file confusion causes this to be undefined. */
#ifndef SOL_NETLINK
#define SOL_NETLINK 270
#endif
52 /* A single (bad) Netlink message can in theory dump out many, many log
53 * messages, so the burst size is set quite high here to avoid missing useful
54 * information. Also, at high logging levels we log *all* Netlink messages. */
55 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(60, 600);
57 static uint32_t nl_sock_allocate_seq(struct nl_sock
*, unsigned int n
);
58 static void log_nlmsg(const char *function
, int error
,
59 const void *message
, size_t size
, int protocol
);
61 static int get_sock_pid_from_kernel(struct nl_sock
*sock
);
62 static int set_sock_property(struct nl_sock
*sock
);
63 static int nl_sock_transact(struct nl_sock
*sock
, const struct ofpbuf
*request
,
64 struct ofpbuf
**replyp
);
66 /* In the case DeviceIoControl failed and GetLastError returns with
67 * ERROR_NOT_FOUND means we lost communication with the kernel device.
68 * CloseHandle will fail because the handle in 'theory' does not exist.
69 * The only remaining option is to crash and allow the service to be restarted
70 * via service manager. This is the only way to close the handle from both
71 * userspace and kernel. */
73 lost_communication(DWORD last_err
)
75 if (last_err
== ERROR_NOT_FOUND
) {
76 ovs_abort(0, "lost communication with the kernel device");
81 /* Netlink sockets. */
86 OVERLAPPED overlapped
;
94 unsigned int rcvbuf
; /* Receive buffer size (SO_RCVBUF). */
97 /* Compile-time limit on iovecs, so that we can allocate a maximum-size array
98 * of iovecs on the stack. */
101 /* Maximum number of iovecs that may be passed to sendmsg, capped at a
102 * minimum of _XOPEN_IOV_MAX (16) and a maximum of MAX_IOVS.
104 * Initialized by nl_sock_create(). */
/* Socket-pool helpers, defined later in this file. */
static int nl_pool_alloc(int protocol, struct nl_sock **sockp);
static void nl_pool_release(struct nl_sock *);
110 /* Creates a new netlink socket for the given netlink 'protocol'
111 * (NETLINK_ROUTE, NETLINK_GENERIC, ...). Returns 0 and sets '*sockp' to the
112 * new socket if successful, otherwise returns a positive errno value. */
114 nl_sock_create(int protocol
, struct nl_sock
**sockp
)
116 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
117 struct nl_sock
*sock
;
119 struct sockaddr_nl local
, remote
;
121 socklen_t local_size
;
125 if (ovsthread_once_start(&once
)) {
126 int save_errno
= errno
;
129 max_iovs
= sysconf(_SC_UIO_MAXIOV
);
130 if (max_iovs
< _XOPEN_IOV_MAX
) {
131 if (max_iovs
== -1 && errno
) {
132 VLOG_WARN("sysconf(_SC_UIO_MAXIOV): %s", ovs_strerror(errno
));
134 max_iovs
= _XOPEN_IOV_MAX
;
135 } else if (max_iovs
> MAX_IOVS
) {
140 ovsthread_once_done(&once
);
144 sock
= xmalloc(sizeof *sock
);
147 sock
->overlapped
.hEvent
= NULL
;
148 sock
->handle
= CreateFile(OVS_DEVICE_NAME_USER
,
149 GENERIC_READ
| GENERIC_WRITE
,
150 FILE_SHARE_READ
| FILE_SHARE_WRITE
,
152 FILE_FLAG_OVERLAPPED
, NULL
);
154 if (sock
->handle
== INVALID_HANDLE_VALUE
) {
155 VLOG_ERR("fcntl: %s", ovs_lasterror_to_string());
159 memset(&sock
->overlapped
, 0, sizeof sock
->overlapped
);
160 sock
->overlapped
.hEvent
= CreateEvent(NULL
, FALSE
, FALSE
, NULL
);
161 if (sock
->overlapped
.hEvent
== NULL
) {
162 VLOG_ERR("fcntl: %s", ovs_lasterror_to_string());
165 /* Initialize the type/ioctl to Generic */
166 sock
->read_ioctl
= OVS_IOCTL_READ
;
168 sock
->fd
= socket(AF_NETLINK
, SOCK_RAW
, protocol
);
170 VLOG_ERR("fcntl: %s", ovs_strerror(errno
));
175 sock
->protocol
= protocol
;
178 rcvbuf
= 1024 * 1024;
180 sock
->rcvbuf
= rcvbuf
;
181 retval
= get_sock_pid_from_kernel(sock
);
185 retval
= set_sock_property(sock
);
190 if (setsockopt(sock
->fd
, SOL_SOCKET
, SO_RCVBUFFORCE
,
191 &rcvbuf
, sizeof rcvbuf
)) {
192 /* Only root can use SO_RCVBUFFORCE. Everyone else gets EPERM.
193 * Warn only if the failure is therefore unexpected. */
194 if (errno
!= EPERM
) {
195 VLOG_WARN_RL(&rl
, "setting %d-byte socket receive buffer failed "
196 "(%s)", rcvbuf
, ovs_strerror(errno
));
200 retval
= get_socket_rcvbuf(sock
->fd
);
205 sock
->rcvbuf
= retval
;
208 /* Connect to kernel (pid 0) as remote address. */
209 memset(&remote
, 0, sizeof remote
);
210 remote
.nl_family
= AF_NETLINK
;
212 if (connect(sock
->fd
, (struct sockaddr
*) &remote
, sizeof remote
) < 0) {
213 VLOG_ERR("connect(0): %s", ovs_strerror(errno
));
217 /* Obtain pid assigned by kernel. */
218 local_size
= sizeof local
;
219 if (getsockname(sock
->fd
, (struct sockaddr
*) &local
, &local_size
) < 0) {
220 VLOG_ERR("getsockname: %s", ovs_strerror(errno
));
223 if (local_size
< sizeof local
|| local
.nl_family
!= AF_NETLINK
) {
224 VLOG_ERR("getsockname returned bad Netlink name");
228 sock
->pid
= local
.nl_pid
;
242 if (sock
->overlapped
.hEvent
) {
243 CloseHandle(sock
->overlapped
.hEvent
);
245 if (sock
->handle
!= INVALID_HANDLE_VALUE
) {
246 CloseHandle(sock
->handle
);
257 /* Creates a new netlink socket for the same protocol as 'src'. Returns 0 and
258 * sets '*sockp' to the new socket if successful, otherwise returns a positive
261 nl_sock_clone(const struct nl_sock
*src
, struct nl_sock
**sockp
)
263 return nl_sock_create(src
->protocol
, sockp
);
266 /* Destroys netlink socket 'sock'. */
268 nl_sock_destroy(struct nl_sock
*sock
)
272 if (sock
->overlapped
.hEvent
) {
273 CloseHandle(sock
->overlapped
.hEvent
);
275 CloseHandle(sock
->handle
);
284 /* Reads the pid for 'sock' generated in the kernel datapath. The function
285 * uses a separate IOCTL instead of a transaction semantic to avoid unnecessary
286 * message overhead. */
288 get_sock_pid_from_kernel(struct nl_sock
*sock
)
294 if (!DeviceIoControl(sock
->handle
, OVS_IOCTL_GET_PID
,
295 NULL
, 0, &pid
, sizeof(pid
),
297 lost_communication(GetLastError());
300 if (bytes
< sizeof(pid
)) {
310 /* Used for setting and managing socket properties in userspace and kernel.
311 * Currently two attributes are tracked - pid and protocol
312 * protocol - supplied by userspace based on the netlink family. Windows uses
313 * this property to set the value in kernel datapath.
314 * eg: (NETLINK_GENERIC/ NETLINK_NETFILTER)
315 * pid - generated by windows kernel and set in userspace. The property
317 * Also verify if Protocol and PID in Kernel reflects the values in userspace
320 set_sock_property(struct nl_sock
*sock
)
322 static const struct nl_policy ovs_socket_policy
[] = {
323 [OVS_NL_ATTR_SOCK_PROTO
] = { .type
= NL_A_BE32
, .optional
= true },
324 [OVS_NL_ATTR_SOCK_PID
] = { .type
= NL_A_BE32
, .optional
= true }
327 struct ofpbuf request
, *reply
;
328 struct ovs_header
*ovs_header
;
329 struct nlattr
*attrs
[ARRAY_SIZE(ovs_socket_policy
)];
333 ofpbuf_init(&request
, 0);
334 nl_msg_put_genlmsghdr(&request
, 0, OVS_WIN_NL_CTRL_FAMILY_ID
, 0,
335 OVS_CTRL_CMD_SOCK_PROP
, OVS_WIN_CONTROL_VERSION
);
336 ovs_header
= ofpbuf_put_uninit(&request
, sizeof *ovs_header
);
337 ovs_header
->dp_ifindex
= 0;
339 nl_msg_put_be32(&request
, OVS_NL_ATTR_SOCK_PROTO
, sock
->protocol
);
340 /* pid is already set as part of get_sock_pid_from_kernel()
341 * This is added to maintain consistency
343 nl_msg_put_be32(&request
, OVS_NL_ATTR_SOCK_PID
, sock
->pid
);
345 error
= nl_sock_transact(sock
, &request
, &reply
);
346 ofpbuf_uninit(&request
);
351 if (!nl_policy_parse(reply
,
352 NLMSG_HDRLEN
+ GENL_HDRLEN
+ sizeof *ovs_header
,
353 ovs_socket_policy
, attrs
,
354 ARRAY_SIZE(ovs_socket_policy
))) {
355 ofpbuf_delete(reply
);
358 /* Verify if the properties are setup properly */
359 if (attrs
[OVS_NL_ATTR_SOCK_PROTO
]) {
360 int protocol
= nl_attr_get_be32(attrs
[OVS_NL_ATTR_SOCK_PROTO
]);
361 if (protocol
!= sock
->protocol
) {
362 VLOG_ERR("Invalid protocol returned:%d expected:%d",
363 protocol
, sock
->protocol
);
368 if (attrs
[OVS_NL_ATTR_SOCK_PID
]) {
369 int pid
= nl_attr_get_be32(attrs
[OVS_NL_ATTR_SOCK_PID
]);
370 if (pid
!= sock
->pid
) {
371 VLOG_ERR("Invalid pid returned:%d expected:%d",
383 nl_sock_mcgroup(struct nl_sock
*sock
, unsigned int multicast_group
, bool join
)
385 struct ofpbuf request
;
386 uint64_t request_stub
[128];
387 struct ovs_header
*ovs_header
;
388 struct nlmsghdr
*nlmsg
;
391 ofpbuf_use_stub(&request
, request_stub
, sizeof request_stub
);
393 nl_msg_put_genlmsghdr(&request
, 0, OVS_WIN_NL_CTRL_FAMILY_ID
, 0,
394 OVS_CTRL_CMD_MC_SUBSCRIBE_REQ
,
395 OVS_WIN_CONTROL_VERSION
);
397 ovs_header
= ofpbuf_put_uninit(&request
, sizeof *ovs_header
);
398 ovs_header
->dp_ifindex
= 0;
400 nl_msg_put_u32(&request
, OVS_NL_ATTR_MCAST_GRP
, multicast_group
);
401 nl_msg_put_u8(&request
, OVS_NL_ATTR_MCAST_JOIN
, join
? 1 : 0);
403 error
= nl_sock_send(sock
, &request
, true);
404 ofpbuf_uninit(&request
);
408 /* Tries to add 'sock' as a listener for 'multicast_group'. Returns 0 if
409 * successful, otherwise a positive errno value.
411 * A socket that is subscribed to a multicast group that receives asynchronous
412 * notifications must not be used for Netlink transactions or dumps, because
413 * transactions and dumps can cause notifications to be lost.
415 * Multicast group numbers are always positive.
417 * It is not an error to attempt to join a multicast group to which a socket
418 * already belongs. */
420 nl_sock_join_mcgroup(struct nl_sock
*sock
, unsigned int multicast_group
)
423 /* Set the socket type as a "multicast" socket */
424 sock
->read_ioctl
= OVS_IOCTL_READ_EVENT
;
425 int error
= nl_sock_mcgroup(sock
, multicast_group
, true);
427 sock
->read_ioctl
= OVS_IOCTL_READ
;
428 VLOG_WARN("could not join multicast group %u (%s)",
429 multicast_group
, ovs_strerror(error
));
433 if (setsockopt(sock
->fd
, SOL_NETLINK
, NETLINK_ADD_MEMBERSHIP
,
434 &multicast_group
, sizeof multicast_group
) < 0) {
435 VLOG_WARN("could not join multicast group %u (%s)",
436 multicast_group
, ovs_strerror(errno
));
445 nl_sock_subscribe_packet__(struct nl_sock
*sock
, bool subscribe
)
447 struct ofpbuf request
;
448 uint64_t request_stub
[128];
449 struct ovs_header
*ovs_header
;
450 struct nlmsghdr
*nlmsg
;
453 ofpbuf_use_stub(&request
, request_stub
, sizeof request_stub
);
454 nl_msg_put_genlmsghdr(&request
, 0, OVS_WIN_NL_CTRL_FAMILY_ID
, 0,
455 OVS_CTRL_CMD_PACKET_SUBSCRIBE_REQ
,
456 OVS_WIN_CONTROL_VERSION
);
458 ovs_header
= ofpbuf_put_uninit(&request
, sizeof *ovs_header
);
459 ovs_header
->dp_ifindex
= 0;
460 nl_msg_put_u8(&request
, OVS_NL_ATTR_PACKET_SUBSCRIBE
, subscribe
? 1 : 0);
461 nl_msg_put_u32(&request
, OVS_NL_ATTR_PACKET_PID
, sock
->pid
);
463 error
= nl_sock_send(sock
, &request
, true);
464 ofpbuf_uninit(&request
);
469 nl_sock_subscribe_packets(struct nl_sock
*sock
)
473 if (sock
->read_ioctl
!= OVS_IOCTL_READ
) {
477 error
= nl_sock_subscribe_packet__(sock
, true);
479 VLOG_WARN("could not subscribe packets (%s)",
480 ovs_strerror(error
));
483 sock
->read_ioctl
= OVS_IOCTL_READ_PACKET
;
489 nl_sock_unsubscribe_packets(struct nl_sock
*sock
)
491 ovs_assert(sock
->read_ioctl
== OVS_IOCTL_READ_PACKET
);
493 int error
= nl_sock_subscribe_packet__(sock
, false);
495 VLOG_WARN("could not unsubscribe to packets (%s)",
496 ovs_strerror(error
));
500 sock
->read_ioctl
= OVS_IOCTL_READ
;
505 /* Tries to make 'sock' stop listening to 'multicast_group'. Returns 0 if
506 * successful, otherwise a positive errno value.
508 * Multicast group numbers are always positive.
510 * It is not an error to attempt to leave a multicast group to which a socket
513 * On success, reading from 'sock' will still return any messages that were
514 * received on 'multicast_group' before the group was left. */
516 nl_sock_leave_mcgroup(struct nl_sock
*sock
, unsigned int multicast_group
)
519 int error
= nl_sock_mcgroup(sock
, multicast_group
, false);
521 VLOG_WARN("could not leave multicast group %u (%s)",
522 multicast_group
, ovs_strerror(error
));
525 sock
->read_ioctl
= OVS_IOCTL_READ
;
527 if (setsockopt(sock
->fd
, SOL_NETLINK
, NETLINK_DROP_MEMBERSHIP
,
528 &multicast_group
, sizeof multicast_group
) < 0) {
529 VLOG_WARN("could not leave multicast group %u (%s)",
530 multicast_group
, ovs_strerror(errno
));
538 nl_sock_send__(struct nl_sock
*sock
, const struct ofpbuf
*msg
,
539 uint32_t nlmsg_seq
, bool wait
)
541 struct nlmsghdr
*nlmsg
= nl_msg_nlmsghdr(msg
);
544 nlmsg
->nlmsg_len
= msg
->size
;
545 nlmsg
->nlmsg_seq
= nlmsg_seq
;
546 nlmsg
->nlmsg_pid
= sock
->pid
;
552 if (!DeviceIoControl(sock
->handle
, OVS_IOCTL_WRITE
,
553 msg
->data
, msg
->size
, NULL
, 0,
555 lost_communication(GetLastError());
557 /* XXX: Map to a more appropriate error based on GetLastError(). */
559 VLOG_DBG_RL(&rl
, "fatal driver failure in write: %s",
560 ovs_lasterror_to_string());
565 retval
= send(sock
->fd
, msg
->data
, msg
->size
,
566 wait
? 0 : MSG_DONTWAIT
);
568 error
= retval
< 0 ? errno
: 0;
569 } while (error
== EINTR
);
570 log_nlmsg(__func__
, error
, msg
->data
, msg
->size
, sock
->protocol
);
572 COVERAGE_INC(netlink_sent
);
/* Tries to send 'msg', which must contain a Netlink message, to the kernel on
 * 'sock'.  nlmsg_len in 'msg' will be finalized to match msg->size, nlmsg_pid
 * will be set to 'sock''s pid, and nlmsg_seq will be initialized to a fresh
 * sequence number, before the message is sent.
 *
 * Returns 0 if successful, otherwise a positive errno value.  If
 * 'wait' is true, then the send will wait until buffer space is ready;
 * otherwise, returns EAGAIN if the 'sock' send buffer is full. */
int
nl_sock_send(struct nl_sock *sock, const struct ofpbuf *msg, bool wait)
{
    return nl_sock_send_seq(sock, msg, nl_sock_allocate_seq(sock, 1), wait);
}
/* Tries to send 'msg', which must contain a Netlink message, to the kernel on
 * 'sock'.  nlmsg_len in 'msg' will be finalized to match msg->size, nlmsg_pid
 * will be set to 'sock''s pid, and nlmsg_seq will be initialized to
 * 'nlmsg_seq', before the message is sent.
 *
 * Returns 0 if successful, otherwise a positive errno value.  If
 * 'wait' is true, then the send will wait until buffer space is ready;
 * otherwise, returns EAGAIN if the 'sock' send buffer is full.
 *
 * This function is suitable for sending a reply to a request that was received
 * with sequence number 'nlmsg_seq'.  Otherwise, use nl_sock_send() instead. */
int
nl_sock_send_seq(struct nl_sock *sock, const struct ofpbuf *msg,
                 uint32_t nlmsg_seq, bool wait)
{
    return nl_sock_send__(sock, msg, nlmsg_seq, wait);
}
610 nl_sock_recv__(struct nl_sock
*sock
, struct ofpbuf
*buf
, bool wait
)
612 /* We can't accurately predict the size of the data to be received. The
613 * caller is supposed to have allocated enough space in 'buf' to handle the
614 * "typical" case. To handle exceptions, we make available enough space in
615 * 'tail' to allow Netlink messages to be up to 64 kB long (a reasonable
616 * figure since that's the maximum length of a Netlink attribute). */
617 struct nlmsghdr
*nlmsghdr
;
624 ovs_assert(buf
->allocated
>= sizeof *nlmsghdr
);
627 iov
[0].iov_base
= buf
->base
;
628 iov
[0].iov_len
= buf
->allocated
;
629 iov
[1].iov_base
= tail
;
630 iov
[1].iov_len
= sizeof tail
;
632 memset(&msg
, 0, sizeof msg
);
636 /* Receive a Netlink message from the kernel.
638 * This works around a kernel bug in which the kernel returns an error code
639 * as if it were the number of bytes read. It doesn't actually modify
640 * anything in the receive buffer in that case, so we can initialize the
641 * Netlink header with an impossible message length and then, upon success,
642 * check whether it changed. */
643 nlmsghdr
= buf
->base
;
645 nlmsghdr
->nlmsg_len
= UINT32_MAX
;
648 if (!DeviceIoControl(sock
->handle
, sock
->read_ioctl
,
649 NULL
, 0, tail
, sizeof tail
, &bytes
, NULL
)) {
650 lost_communication(GetLastError());
651 VLOG_DBG_RL(&rl
, "fatal driver failure in transact: %s",
652 ovs_lasterror_to_string());
654 /* XXX: Map to a more appropriate error. */
662 if (retval
>= buf
->allocated
) {
663 ofpbuf_reinit(buf
, retval
);
664 nlmsghdr
= buf
->base
;
665 nlmsghdr
->nlmsg_len
= UINT32_MAX
;
667 memcpy(buf
->data
, tail
, retval
);
672 retval
= recvmsg(sock
->fd
, &msg
, wait
? 0 : MSG_DONTWAIT
);
674 error
= (retval
< 0 ? errno
675 : retval
== 0 ? ECONNRESET
/* not possible? */
676 : nlmsghdr
->nlmsg_len
!= UINT32_MAX
? 0
678 } while (error
== EINTR
);
680 if (error
== ENOBUFS
) {
681 /* Socket receive buffer overflow dropped one or more messages that
682 * the kernel tried to send to us. */
683 COVERAGE_INC(netlink_overflow
);
688 if (msg
.msg_flags
& MSG_TRUNC
) {
689 VLOG_ERR_RL(&rl
, "truncated message (longer than %"PRIuSIZE
" bytes)",
694 if (retval
< sizeof *nlmsghdr
695 || nlmsghdr
->nlmsg_len
< sizeof *nlmsghdr
696 || nlmsghdr
->nlmsg_len
> retval
) {
697 VLOG_ERR_RL(&rl
, "received invalid nlmsg (%"PRIuSIZE
" bytes < %"PRIuSIZE
")",
698 retval
, sizeof *nlmsghdr
);
702 buf
->size
= MIN(retval
, buf
->allocated
);
703 if (retval
> buf
->allocated
) {
704 COVERAGE_INC(netlink_recv_jumbo
);
705 ofpbuf_put(buf
, tail
, retval
- buf
->allocated
);
709 log_nlmsg(__func__
, 0, buf
->data
, buf
->size
, sock
->protocol
);
710 COVERAGE_INC(netlink_received
);
/* Tries to receive a Netlink message from the kernel on 'sock' into 'buf'.
 * If 'wait' is true, waits for a message to be ready.  Otherwise, fails with
 * EAGAIN if the 'sock' receive buffer is empty.
 *
 * The caller must have initialized 'buf' with an allocation of at least
 * NLMSG_HDRLEN bytes.  For best performance, the caller should allocate
 * enough space for a "typical" message.
 *
 * On success, returns 0 and replaces 'buf''s previous content by the received
 * message.  This function expands 'buf''s allocated memory, as necessary, to
 * hold the actual size of the received message.
 *
 * On failure, returns a positive errno value and clears 'buf' to zero length.
 * 'buf' retains its previous memory allocation.
 *
 * Regardless of success or failure, this function resets 'buf''s headroom to
 * 0. */
int
nl_sock_recv(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
{
    return nl_sock_recv__(sock, buf, wait);
}
739 nl_sock_record_errors__(struct nl_transaction
**transactions
, size_t n
,
744 for (i
= 0; i
< n
; i
++) {
745 struct nl_transaction
*txn
= transactions
[i
];
749 ofpbuf_clear(txn
->reply
);
755 nl_sock_transact_multiple__(struct nl_sock
*sock
,
756 struct nl_transaction
**transactions
, size_t n
,
759 uint64_t tmp_reply_stub
[1024 / 8];
760 struct nl_transaction tmp_txn
;
761 struct ofpbuf tmp_reply
;
764 struct iovec iovs
[MAX_IOVS
];
769 base_seq
= nl_sock_allocate_seq(sock
, n
);
771 for (i
= 0; i
< n
; i
++) {
772 struct nl_transaction
*txn
= transactions
[i
];
773 struct nlmsghdr
*nlmsg
= nl_msg_nlmsghdr(txn
->request
);
775 nlmsg
->nlmsg_len
= txn
->request
->size
;
776 nlmsg
->nlmsg_seq
= base_seq
+ i
;
777 nlmsg
->nlmsg_pid
= sock
->pid
;
779 iovs
[i
].iov_base
= txn
->request
->data
;
780 iovs
[i
].iov_len
= txn
->request
->size
;
784 memset(&msg
, 0, sizeof msg
);
788 error
= sendmsg(sock
->fd
, &msg
, 0) < 0 ? errno
: 0;
789 } while (error
== EINTR
);
791 for (i
= 0; i
< n
; i
++) {
792 struct nl_transaction
*txn
= transactions
[i
];
794 log_nlmsg(__func__
, error
, txn
->request
->data
,
795 txn
->request
->size
, sock
->protocol
);
798 COVERAGE_ADD(netlink_sent
, n
);
805 ofpbuf_use_stub(&tmp_reply
, tmp_reply_stub
, sizeof tmp_reply_stub
);
806 tmp_txn
.request
= NULL
;
807 tmp_txn
.reply
= &tmp_reply
;
810 struct nl_transaction
*buf_txn
, *txn
;
813 /* Find a transaction whose buffer we can use for receiving a reply.
814 * If no such transaction is left, use tmp_txn. */
816 for (i
= 0; i
< n
; i
++) {
817 if (transactions
[i
]->reply
) {
818 buf_txn
= transactions
[i
];
823 /* Receive a reply. */
824 error
= nl_sock_recv__(sock
, buf_txn
->reply
, false);
826 if (error
== EAGAIN
) {
827 nl_sock_record_errors__(transactions
, n
, 0);
834 /* Match the reply up with a transaction. */
835 seq
= nl_msg_nlmsghdr(buf_txn
->reply
)->nlmsg_seq
;
836 if (seq
< base_seq
|| seq
>= base_seq
+ n
) {
837 VLOG_DBG_RL(&rl
, "ignoring unexpected seq %#"PRIx32
, seq
);
841 txn
= transactions
[i
];
843 /* Fill in the results for 'txn'. */
844 if (nl_msg_nlmsgerr(buf_txn
->reply
, &txn
->error
)) {
846 ofpbuf_clear(txn
->reply
);
849 VLOG_DBG_RL(&rl
, "received NAK error=%d (%s)",
850 error
, ovs_strerror(txn
->error
));
854 if (txn
->reply
&& txn
!= buf_txn
) {
856 struct ofpbuf
*reply
= buf_txn
->reply
;
857 buf_txn
->reply
= txn
->reply
;
862 /* Fill in the results for transactions before 'txn'. (We have to do
863 * this after the results for 'txn' itself because of the buffer swap
865 nl_sock_record_errors__(transactions
, i
, 0);
869 transactions
+= i
+ 1;
873 ofpbuf_uninit(&tmp_reply
);
876 uint8_t reply_buf
[65536];
877 for (i
= 0; i
< n
; i
++) {
880 struct nl_transaction
*txn
= transactions
[i
];
881 struct nlmsghdr
*request_nlmsg
, *reply_nlmsg
;
883 ret
= DeviceIoControl(sock
->handle
, OVS_IOCTL_TRANSACT
,
886 reply_buf
, sizeof reply_buf
,
889 if (ret
&& reply_len
== 0) {
891 * The current transaction did not produce any data to read and that
892 * is not an error as such. Continue with the remainder of the
897 ofpbuf_clear(txn
->reply
);
900 /* XXX: Map to a more appropriate error. */
901 lost_communication(GetLastError());
903 VLOG_DBG_RL(&rl
, "fatal driver failure: %s",
904 ovs_lasterror_to_string());
908 if (reply_len
!= 0) {
909 request_nlmsg
= nl_msg_nlmsghdr(txn
->request
);
911 if (reply_len
< sizeof *reply_nlmsg
) {
912 nl_sock_record_errors__(transactions
, n
, 0);
913 VLOG_DBG_RL(&rl
, "insufficient length of reply %#"PRIu32
914 " for seq: %#"PRIx32
, reply_len
, request_nlmsg
->nlmsg_seq
);
918 /* Validate the sequence number in the reply. */
919 reply_nlmsg
= (struct nlmsghdr
*)reply_buf
;
921 if (request_nlmsg
->nlmsg_seq
!= reply_nlmsg
->nlmsg_seq
) {
922 ovs_assert(request_nlmsg
->nlmsg_seq
== reply_nlmsg
->nlmsg_seq
);
923 VLOG_DBG_RL(&rl
, "mismatched seq request %#"PRIx32
924 ", reply %#"PRIx32
, request_nlmsg
->nlmsg_seq
,
925 reply_nlmsg
->nlmsg_seq
);
929 /* Handle errors embedded within the netlink message. */
930 ofpbuf_use_stub(&tmp_reply
, reply_buf
, sizeof reply_buf
);
931 tmp_reply
.size
= sizeof reply_buf
;
932 if (nl_msg_nlmsgerr(&tmp_reply
, &txn
->error
)) {
934 ofpbuf_clear(txn
->reply
);
937 VLOG_DBG_RL(&rl
, "received NAK error=%d (%s)",
938 error
, ovs_strerror(txn
->error
));
943 /* Copy the reply to the buffer specified by the caller. */
944 if (reply_len
> txn
->reply
->allocated
) {
945 ofpbuf_reinit(txn
->reply
, reply_len
);
947 memcpy(txn
->reply
->data
, reply_buf
, reply_len
);
948 txn
->reply
->size
= reply_len
;
951 ofpbuf_uninit(&tmp_reply
);
954 /* Count the number of successful transactions. */
960 COVERAGE_ADD(netlink_sent
, n
);
968 nl_sock_transact_multiple(struct nl_sock
*sock
,
969 struct nl_transaction
**transactions
, size_t n
)
978 /* In theory, every request could have a 64 kB reply. But the default and
979 * maximum socket rcvbuf size with typical Dom0 memory sizes both tend to
980 * be a bit below 128 kB, so that would only allow a single message in a
981 * "batch". So we assume that replies average (at most) 4 kB, which allows
982 * a good deal of batching.
984 * In practice, most of the requests that we batch either have no reply at
985 * all or a brief reply. */
986 max_batch_count
= MAX(sock
->rcvbuf
/ 4096, 1);
987 max_batch_count
= MIN(max_batch_count
, max_iovs
);
993 /* Batch up to 'max_batch_count' transactions. But cap it at about a
994 * page of requests total because big skbuffs are expensive to
995 * allocate in the kernel. */
996 #if defined(PAGESIZE)
997 enum { MAX_BATCH_BYTES
= MAX(1, PAGESIZE
- 512) };
999 enum { MAX_BATCH_BYTES
= 4096 - 512 };
1001 bytes
= transactions
[0]->request
->size
;
1002 for (count
= 1; count
< n
&& count
< max_batch_count
; count
++) {
1003 if (bytes
+ transactions
[count
]->request
->size
> MAX_BATCH_BYTES
) {
1006 bytes
+= transactions
[count
]->request
->size
;
1009 error
= nl_sock_transact_multiple__(sock
, transactions
, count
, &done
);
1010 transactions
+= done
;
1013 if (error
== ENOBUFS
) {
1014 VLOG_DBG_RL(&rl
, "receive buffer overflow, resending request");
1016 VLOG_ERR_RL(&rl
, "transaction error (%s)", ovs_strerror(error
));
1017 nl_sock_record_errors__(transactions
, n
, error
);
1018 if (error
!= EAGAIN
) {
1019 /* A fatal error has occurred. Abort the rest of
1028 nl_sock_transact(struct nl_sock
*sock
, const struct ofpbuf
*request
,
1029 struct ofpbuf
**replyp
)
1031 struct nl_transaction
*transactionp
;
1032 struct nl_transaction transaction
;
1034 transaction
.request
= CONST_CAST(struct ofpbuf
*, request
);
1035 transaction
.reply
= replyp
? ofpbuf_new(1024) : NULL
;
1036 transactionp
= &transaction
;
1038 nl_sock_transact_multiple(sock
, &transactionp
, 1);
1041 if (transaction
.error
) {
1042 ofpbuf_delete(transaction
.reply
);
1045 *replyp
= transaction
.reply
;
1049 return transaction
.error
;
1052 /* Drain all the messages currently in 'sock''s receive queue. */
1054 nl_sock_drain(struct nl_sock
*sock
)
1059 return drain_rcvbuf(sock
->fd
);
1063 /* Starts a Netlink "dump" operation, by sending 'request' to the kernel on a
1064 * Netlink socket created with the given 'protocol', and initializes 'dump' to
1065 * reflect the state of the operation.
1067 * 'request' must contain a Netlink message. Before sending the message,
1068 * nlmsg_len will be finalized to match request->size, and nlmsg_pid will be
1069 * set to the Netlink socket's pid. NLM_F_DUMP and NLM_F_ACK will be set in
1072 * The design of this Netlink socket library ensures that the dump is reliable.
1074 * This function provides no status indication. nl_dump_done() provides an
1075 * error status for the entire dump operation.
1077 * The caller must eventually destroy 'request'.
1080 nl_dump_start(struct nl_dump
*dump
, int protocol
, const struct ofpbuf
*request
)
1082 nl_msg_nlmsghdr(request
)->nlmsg_flags
|= NLM_F_DUMP
| NLM_F_ACK
;
1084 ovs_mutex_init(&dump
->mutex
);
1085 ovs_mutex_lock(&dump
->mutex
);
1086 dump
->status
= nl_pool_alloc(protocol
, &dump
->sock
);
1087 if (!dump
->status
) {
1088 dump
->status
= nl_sock_send__(dump
->sock
, request
,
1089 nl_sock_allocate_seq(dump
->sock
, 1),
1092 dump
->nl_seq
= nl_msg_nlmsghdr(request
)->nlmsg_seq
;
1093 ovs_mutex_unlock(&dump
->mutex
);
1097 nl_dump_refill(struct nl_dump
*dump
, struct ofpbuf
*buffer
)
1098 OVS_REQUIRES(dump
->mutex
)
1100 struct nlmsghdr
*nlmsghdr
;
1103 while (!buffer
->size
) {
1104 error
= nl_sock_recv__(dump
->sock
, buffer
, false);
1106 /* The kernel never blocks providing the results of a dump, so
1107 * error == EAGAIN means that we've read the whole thing, and
1108 * therefore transform it into EOF. (The kernel always provides
1109 * NLMSG_DONE as a sentinel. Some other thread must have received
1110 * that already but not yet signaled it in 'status'.)
1112 * Any other error is just an error. */
1113 return error
== EAGAIN
? EOF
: error
;
1116 nlmsghdr
= nl_msg_nlmsghdr(buffer
);
1117 if (dump
->nl_seq
!= nlmsghdr
->nlmsg_seq
) {
1118 VLOG_DBG_RL(&rl
, "ignoring seq %#"PRIx32
" != expected %#"PRIx32
,
1119 nlmsghdr
->nlmsg_seq
, dump
->nl_seq
);
1120 ofpbuf_clear(buffer
);
1124 if (nl_msg_nlmsgerr(buffer
, &error
) && error
) {
1125 VLOG_INFO_RL(&rl
, "netlink dump request error (%s)",
1126 ovs_strerror(error
));
1127 ofpbuf_clear(buffer
);
1135 nl_dump_next__(struct ofpbuf
*reply
, struct ofpbuf
*buffer
)
1137 struct nlmsghdr
*nlmsghdr
= nl_msg_next(buffer
, reply
);
1139 VLOG_WARN_RL(&rl
, "netlink dump contains message fragment");
1141 } else if (nlmsghdr
->nlmsg_type
== NLMSG_DONE
) {
1148 /* Attempts to retrieve another reply from 'dump' into 'buffer'. 'dump' must
1149 * have been initialized with nl_dump_start(), and 'buffer' must have been
1150 * initialized. 'buffer' should be at least NL_DUMP_BUFSIZE bytes long.
1152 * If successful, returns true and points 'reply->data' and
1153 * 'reply->size' to the message that was retrieved. The caller must not
1154 * modify 'reply' (because it points within 'buffer', which will be used by
1155 * future calls to this function).
1157 * On failure, returns false and sets 'reply->data' to NULL and
1158 * 'reply->size' to 0. Failure might indicate an actual error or merely
1159 * the end of replies. An error status for the entire dump operation is
1160 * provided when it is completed by calling nl_dump_done().
1162 * Multiple threads may call this function, passing the same nl_dump, however
1163 * each must provide independent buffers. This function may cache multiple
1164 * replies in the buffer, and these will be processed before more replies are
1165 * fetched. When this function returns false, other threads may continue to
1166 * process replies in their buffers, but they will not fetch more replies.
1169 nl_dump_next(struct nl_dump
*dump
, struct ofpbuf
*reply
, struct ofpbuf
*buffer
)
1173 /* If the buffer is empty, refill it.
1175 * If the buffer is not empty, we don't check the dump's status.
1176 * Otherwise, we could end up skipping some of the dump results if thread A
1177 * hits EOF while thread B is in the midst of processing a batch. */
1178 if (!buffer
->size
) {
1179 ovs_mutex_lock(&dump
->mutex
);
1180 if (!dump
->status
) {
1181 /* Take the mutex here to avoid an in-kernel race. If two threads
1182 * try to read from a Netlink dump socket at once, then the socket
1183 * error can be set to EINVAL, which will be encountered on the
1184 * next recv on that socket, which could be anywhere due to the way
1185 * that we pool Netlink sockets. Serializing the recv calls avoids
1187 dump
->status
= nl_dump_refill(dump
, buffer
);
1189 retval
= dump
->status
;
1190 ovs_mutex_unlock(&dump
->mutex
);
1193 /* Fetch the next message from the buffer. */
1195 retval
= nl_dump_next__(reply
, buffer
);
1197 /* Record 'retval' as the dump status, but don't overwrite an error
1199 ovs_mutex_lock(&dump
->mutex
);
1200 if (dump
->status
<= 0) {
1201 dump
->status
= retval
;
1203 ovs_mutex_unlock(&dump
->mutex
);
1214 /* Completes Netlink dump operation 'dump', which must have been initialized
1215 * with nl_dump_start(). Returns 0 if the dump operation was error-free,
1216 * otherwise a positive errno value describing the problem. */
1218 nl_dump_done(struct nl_dump
*dump
)
1222 ovs_mutex_lock(&dump
->mutex
);
1223 status
= dump
->status
;
1224 ovs_mutex_unlock(&dump
->mutex
);
1226 /* Drain any remaining messages that the client didn't read. Otherwise the
1227 * kernel will continue to queue them up and waste buffer space.
1229 * XXX We could just destroy and discard the socket in this case. */
1231 uint64_t tmp_reply_stub
[NL_DUMP_BUFSIZE
/ 8];
1232 struct ofpbuf reply
, buf
;
1234 ofpbuf_use_stub(&buf
, tmp_reply_stub
, sizeof tmp_reply_stub
);
1235 while (nl_dump_next(dump
, &reply
, &buf
)) {
1236 /* Nothing to do. */
1238 ofpbuf_uninit(&buf
);
1240 ovs_mutex_lock(&dump
->mutex
);
1241 status
= dump
->status
;
1242 ovs_mutex_unlock(&dump
->mutex
);
1246 nl_pool_release(dump
->sock
);
1247 ovs_mutex_destroy(&dump
->mutex
);
1249 return status
== EOF
? 0 : status
;
#ifdef _WIN32
/* Pend an I/O request in the driver.  The driver completes the I/O whenever
 * an event or a packet is ready to be read.  Once the I/O is completed
 * the overlapped structure event associated with the pending I/O will be set.
 *
 * Returns 0 on success (request pended or completed synchronously), or
 * EINVAL if the ioctl failed outright. */
static int
pend_io_request(struct nl_sock *sock)
{
    struct ofpbuf request;
    uint64_t request_stub[128];
    struct ovs_header *ovs_header;
    struct nlmsghdr *nlmsg;
    uint32_t seq;
    int retval = 0;
    int error;
    DWORD bytes;
    OVERLAPPED *overlapped = CONST_CAST(OVERLAPPED *, &sock->overlapped);
    uint16_t cmd = OVS_CTRL_CMD_WIN_PEND_PACKET_REQ;

    /* Only packet and event read sockets may pend a request. */
    ovs_assert(sock->read_ioctl == OVS_IOCTL_READ_PACKET ||
               sock->read_ioctl == OVS_IOCTL_READ_EVENT);
    if (sock->read_ioctl == OVS_IOCTL_READ_EVENT) {
        cmd = OVS_CTRL_CMD_WIN_PEND_REQ;
    }

    /* NOTE(review): the original computed an 'ovs_msg_size' local
     * (sizeof nlmsghdr + genlmsghdr + ovs_header) that was never used;
     * it has been removed. */

    ofpbuf_use_stub(&request, request_stub, sizeof request_stub);

    /* Build a control-family request addressed from this socket's pid. */
    seq = nl_sock_allocate_seq(sock, 1);
    nl_msg_put_genlmsghdr(&request, 0, OVS_WIN_NL_CTRL_FAMILY_ID, 0,
                          cmd, OVS_WIN_CONTROL_VERSION);
    nlmsg = nl_msg_nlmsghdr(&request);
    nlmsg->nlmsg_seq = seq;
    nlmsg->nlmsg_pid = sock->pid;

    ovs_header = ofpbuf_put_uninit(&request, sizeof *ovs_header);
    ovs_header->dp_ifindex = 0;
    nlmsg->nlmsg_len = request.size;

    if (!DeviceIoControl(sock->handle, OVS_IOCTL_WRITE,
                         request.data, request.size,
                         NULL, 0, &bytes, overlapped)) {
        error = GetLastError();
        /* Check if the I/O got pended */
        if (error != ERROR_IO_INCOMPLETE && error != ERROR_IO_PENDING) {
            lost_communication(error);
            VLOG_ERR("nl_sock_wait failed - %s\n", ovs_format_message(error));
            retval = EINVAL;
        }
    } else {
        /* The I/O was completed synchronously. */
        poll_immediate_wake();
    }

    ofpbuf_uninit(&request);
    return retval;
}
#endif  /* _WIN32 */
1313 /* Causes poll_block() to wake up when any of the specified 'events' (which is
1314 * a OR'd combination of POLLIN, POLLOUT, etc.) occur on 'sock'.
1315 * On Windows, 'sock' is not treated as const, and may be modified. */
1317 nl_sock_wait(const struct nl_sock
*sock
, short int events
)
1320 if (sock
->overlapped
.Internal
!= STATUS_PENDING
) {
1321 int ret
= pend_io_request(CONST_CAST(struct nl_sock
*, sock
));
1323 poll_wevent_wait(sock
->overlapped
.hEvent
);
1325 poll_immediate_wake();
1328 poll_wevent_wait(sock
->overlapped
.hEvent
);
1331 poll_fd_wait(sock
->fd
, events
);
1336 /* Returns the underlying fd for 'sock', for use in "poll()"-like operations
1337 * that can't use nl_sock_wait().
1339 * It's a little tricky to use the returned fd correctly, because nl_sock does
1340 * "copy on write" to allow a single nl_sock to be used for notifications,
1341 * transactions, and dumps. If 'sock' is used only for notifications and
1342 * transactions (and never for dump) then the usage is safe. */
1344 nl_sock_fd(const struct nl_sock
*sock
)
1350 /* Returns the PID associated with this socket. */
1352 nl_sock_pid(const struct nl_sock
*sock
)
1357 /* Miscellaneous. */
1359 struct genl_family
{
1360 struct hmap_node hmap_node
;
1365 static struct hmap genl_families
= HMAP_INITIALIZER(&genl_families
);
1367 static const struct nl_policy family_policy
[CTRL_ATTR_MAX
+ 1] = {
1368 [CTRL_ATTR_FAMILY_ID
] = {.type
= NL_A_U16
},
1369 [CTRL_ATTR_MCAST_GROUPS
] = {.type
= NL_A_NESTED
, .optional
= true},
1372 static struct genl_family
*
1373 find_genl_family_by_id(uint16_t id
)
1375 struct genl_family
*family
;
1377 HMAP_FOR_EACH_IN_BUCKET (family
, hmap_node
, hash_int(id
, 0),
1379 if (family
->id
== id
) {
1387 define_genl_family(uint16_t id
, const char *name
)
1389 struct genl_family
*family
= find_genl_family_by_id(id
);
1392 if (!strcmp(family
->name
, name
)) {
1397 family
= xmalloc(sizeof *family
);
1399 hmap_insert(&genl_families
, &family
->hmap_node
, hash_int(id
, 0));
1401 family
->name
= xstrdup(name
);
1405 genl_family_to_name(uint16_t id
)
1407 if (id
== GENL_ID_CTRL
) {
1410 struct genl_family
*family
= find_genl_family_by_id(id
);
1411 return family
? family
->name
: "unknown";
1417 do_lookup_genl_family(const char *name
, struct nlattr
**attrs
,
1418 struct ofpbuf
**replyp
)
1420 struct nl_sock
*sock
;
1421 struct ofpbuf request
, *reply
;
1425 error
= nl_sock_create(NETLINK_GENERIC
, &sock
);
1430 ofpbuf_init(&request
, 0);
1431 nl_msg_put_genlmsghdr(&request
, 0, GENL_ID_CTRL
, NLM_F_REQUEST
,
1432 CTRL_CMD_GETFAMILY
, 1);
1433 nl_msg_put_string(&request
, CTRL_ATTR_FAMILY_NAME
, name
);
1434 error
= nl_sock_transact(sock
, &request
, &reply
);
1435 ofpbuf_uninit(&request
);
1437 nl_sock_destroy(sock
);
1441 if (!nl_policy_parse(reply
, NLMSG_HDRLEN
+ GENL_HDRLEN
,
1442 family_policy
, attrs
, ARRAY_SIZE(family_policy
))
1443 || nl_attr_get_u16(attrs
[CTRL_ATTR_FAMILY_ID
]) == 0) {
1444 nl_sock_destroy(sock
);
1445 ofpbuf_delete(reply
);
1449 nl_sock_destroy(sock
);
#ifdef _WIN32
/* Windows variant: the OVS kernel extension has no Generic Netlink
 * controller, so synthesize a CTRL_CMD_NEWFAMILY reply locally from the
 * table of well-known OVS family names and ids, then parse it exactly as
 * the POSIX path would parse a kernel reply. */
static int
do_lookup_genl_family(const char *name, struct nlattr **attrs,
                      struct ofpbuf **replyp)
{
    struct nlmsghdr *nlmsg;
    struct ofpbuf *reply;
    uint16_t family_id;
    const char *family_name;
    uint32_t family_version;
    uint32_t family_attrmax;
    uint32_t mcgrp_id = OVS_WIN_NL_INVALID_MCGRP_ID;
    const char *mcgrp_name = NULL;

    *replyp = NULL;
    reply = ofpbuf_new(1024);

    /* CTRL_ATTR_MCAST_GROUPS is supported only for VPORT family. */
    if (!strcmp(name, OVS_WIN_CONTROL_FAMILY)) {
        family_id = OVS_WIN_NL_CTRL_FAMILY_ID;
        family_name = OVS_WIN_CONTROL_FAMILY;
        family_version = OVS_WIN_CONTROL_VERSION;
        family_attrmax = OVS_WIN_CONTROL_ATTR_MAX;
    } else if (!strcmp(name, OVS_DATAPATH_FAMILY)) {
        family_id = OVS_WIN_NL_DATAPATH_FAMILY_ID;
        family_name = OVS_DATAPATH_FAMILY;
        family_version = OVS_DATAPATH_VERSION;
        family_attrmax = OVS_DP_ATTR_MAX;
    } else if (!strcmp(name, OVS_PACKET_FAMILY)) {
        family_id = OVS_WIN_NL_PACKET_FAMILY_ID;
        family_name = OVS_PACKET_FAMILY;
        family_version = OVS_PACKET_VERSION;
        family_attrmax = OVS_PACKET_ATTR_MAX;
    } else if (!strcmp(name, OVS_VPORT_FAMILY)) {
        family_id = OVS_WIN_NL_VPORT_FAMILY_ID;
        family_name = OVS_VPORT_FAMILY;
        family_version = OVS_VPORT_VERSION;
        family_attrmax = OVS_VPORT_ATTR_MAX;
        mcgrp_id = OVS_WIN_NL_VPORT_MCGRP_ID;
        mcgrp_name = OVS_VPORT_MCGROUP;
    } else if (!strcmp(name, OVS_FLOW_FAMILY)) {
        family_id = OVS_WIN_NL_FLOW_FAMILY_ID;
        family_name = OVS_FLOW_FAMILY;
        family_version = OVS_FLOW_VERSION;
        family_attrmax = OVS_FLOW_ATTR_MAX;
    } else if (!strcmp(name, OVS_WIN_NETDEV_FAMILY)) {
        family_id = OVS_WIN_NL_NETDEV_FAMILY_ID;
        family_name = OVS_WIN_NETDEV_FAMILY;
        family_version = OVS_WIN_NETDEV_VERSION;
        family_attrmax = OVS_WIN_NETDEV_ATTR_MAX;
    } else {
        /* Unknown family name. */
        ofpbuf_delete(reply);
        return EINVAL;
    }

    nl_msg_put_genlmsghdr(reply, 0, GENL_ID_CTRL, 0,
                          CTRL_CMD_NEWFAMILY, family_version);
    /* CTRL_ATTR_HDRSIZE and CTRL_ATTR_OPS are not populated, but the
     * callers do not seem to need them. */
    nl_msg_put_u16(reply, CTRL_ATTR_FAMILY_ID, family_id);
    nl_msg_put_string(reply, CTRL_ATTR_FAMILY_NAME, family_name);
    nl_msg_put_u32(reply, CTRL_ATTR_VERSION, family_version);
    nl_msg_put_u32(reply, CTRL_ATTR_MAXATTR, family_attrmax);

    if (mcgrp_id != OVS_WIN_NL_INVALID_MCGRP_ID) {
        /* Emit the single multicast group as a doubly nested attribute. */
        size_t mcgrp_ofs1 = nl_msg_start_nested(reply,
                                                CTRL_ATTR_MCAST_GROUPS);
        size_t mcgrp_ofs2 = nl_msg_start_nested(reply,
            OVS_WIN_NL_VPORT_MCGRP_ID - OVS_WIN_NL_MCGRP_START_ID);
        nl_msg_put_u32(reply, CTRL_ATTR_MCAST_GRP_ID, mcgrp_id);
        ovs_assert(mcgrp_name != NULL);
        nl_msg_put_string(reply, CTRL_ATTR_MCAST_GRP_NAME, mcgrp_name);
        nl_msg_end_nested(reply, mcgrp_ofs2);
        nl_msg_end_nested(reply, mcgrp_ofs1);
    }

    /* Set the total length of the netlink message. */
    nlmsg = nl_msg_nlmsghdr(reply);
    nlmsg->nlmsg_len = reply->size;

    /* A family id of 0 is invalid, so treat it like a parse failure. */
    if (!nl_policy_parse(reply, NLMSG_HDRLEN + GENL_HDRLEN,
                         family_policy, attrs, ARRAY_SIZE(family_policy))
        || nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]) == 0) {
        ofpbuf_delete(reply);
        return EPROTO;
    }

    *replyp = reply;
    return 0;
}
#endif  /* _WIN32 */
1545 /* Finds the multicast group called 'group_name' in genl family 'family_name'.
1546 * When successful, writes its result to 'multicast_group' and returns 0.
1547 * Otherwise, clears 'multicast_group' and returns a positive error code.
1550 nl_lookup_genl_mcgroup(const char *family_name
, const char *group_name
,
1551 unsigned int *multicast_group
)
1553 struct nlattr
*family_attrs
[ARRAY_SIZE(family_policy
)];
1554 const struct nlattr
*mc
;
1555 struct ofpbuf
*reply
;
1559 *multicast_group
= 0;
1560 error
= do_lookup_genl_family(family_name
, family_attrs
, &reply
);
1565 if (!family_attrs
[CTRL_ATTR_MCAST_GROUPS
]) {
1570 NL_NESTED_FOR_EACH (mc
, left
, family_attrs
[CTRL_ATTR_MCAST_GROUPS
]) {
1571 static const struct nl_policy mc_policy
[] = {
1572 [CTRL_ATTR_MCAST_GRP_ID
] = {.type
= NL_A_U32
},
1573 [CTRL_ATTR_MCAST_GRP_NAME
] = {.type
= NL_A_STRING
},
1576 struct nlattr
*mc_attrs
[ARRAY_SIZE(mc_policy
)];
1577 const char *mc_name
;
1579 if (!nl_parse_nested(mc
, mc_policy
, mc_attrs
, ARRAY_SIZE(mc_policy
))) {
1584 mc_name
= nl_attr_get_string(mc_attrs
[CTRL_ATTR_MCAST_GRP_NAME
]);
1585 if (!strcmp(group_name
, mc_name
)) {
1587 nl_attr_get_u32(mc_attrs
[CTRL_ATTR_MCAST_GRP_ID
]);
1595 ofpbuf_delete(reply
);
1599 /* If '*number' is 0, translates the given Generic Netlink family 'name' to a
1600 * number and stores it in '*number'. If successful, returns 0 and the caller
1601 * may use '*number' as the family number. On failure, returns a positive
1602 * errno value and '*number' caches the errno value. */
1604 nl_lookup_genl_family(const char *name
, int *number
)
1607 struct nlattr
*attrs
[ARRAY_SIZE(family_policy
)];
1608 struct ofpbuf
*reply
;
1611 error
= do_lookup_genl_family(name
, attrs
, &reply
);
1613 *number
= nl_attr_get_u16(attrs
[CTRL_ATTR_FAMILY_ID
]);
1614 define_genl_family(*number
, name
);
1618 ofpbuf_delete(reply
);
1620 ovs_assert(*number
!= 0);
1622 return *number
> 0 ? 0 : -*number
;
1626 struct nl_sock
*socks
[16];
1630 static struct ovs_mutex pool_mutex
= OVS_MUTEX_INITIALIZER
;
1631 static struct nl_pool pools
[MAX_LINKS
] OVS_GUARDED_BY(pool_mutex
);
1634 nl_pool_alloc(int protocol
, struct nl_sock
**sockp
)
1636 struct nl_sock
*sock
= NULL
;
1637 struct nl_pool
*pool
;
1639 ovs_assert(protocol
>= 0 && protocol
< ARRAY_SIZE(pools
));
1641 ovs_mutex_lock(&pool_mutex
);
1642 pool
= &pools
[protocol
];
1644 sock
= pool
->socks
[--pool
->n
];
1646 ovs_mutex_unlock(&pool_mutex
);
1652 return nl_sock_create(protocol
, sockp
);
1657 nl_pool_release(struct nl_sock
*sock
)
1660 struct nl_pool
*pool
= &pools
[sock
->protocol
];
1662 ovs_mutex_lock(&pool_mutex
);
1663 if (pool
->n
< ARRAY_SIZE(pool
->socks
)) {
1664 pool
->socks
[pool
->n
++] = sock
;
1667 ovs_mutex_unlock(&pool_mutex
);
1669 nl_sock_destroy(sock
);
/* Sends 'request' to the kernel on a Netlink socket for the given 'protocol'
 * (e.g. NETLINK_ROUTE or NETLINK_GENERIC) and waits for a response.  If
 * successful, returns 0.  On failure, returns a positive errno value.
 *
 * If 'replyp' is nonnull, then on success '*replyp' is set to the kernel's
 * reply, which the caller is responsible for freeing with ofpbuf_delete(),
 * and on failure '*replyp' is set to NULL.  If 'replyp' is null, then the
 * kernel's reply, if any, is discarded.
 *
 * Before the message is sent, nlmsg_len in 'request' will be finalized to
 * match msg->size, nlmsg_pid will be set to the pid of the socket used
 * for sending the request, and nlmsg_seq will be initialized.
 *
 * The caller is responsible for destroying 'request'.
 *
 * Bare Netlink is an unreliable transport protocol.  This function layers
 * reliable delivery and reply semantics on top of bare Netlink: sending is
 * reliable enough, and if the socket buffer overflows while receiving the
 * reply the kernel flags it (the next recv returns ENOBUFS) and the request
 * is re-sent.  Caveats:
 *
 *   1. Netlink matches requests to replies by sequence number.  Some kernel
 *      netlink implementations fail to echo sequence numbers, and replies
 *      with non-matching sequence numbers are dropped, so only a single
 *      request can be usefully transacted at a time.
 *
 *   2. Resending the request causes it to be re-executed, so the request
 *      needs to be idempotent. */
int
nl_transact(int protocol, const struct ofpbuf *request,
            struct ofpbuf **replyp)
{
    struct nl_sock *sock;
    int error;

    error = nl_pool_alloc(protocol, &sock);
    if (error) {
        if (replyp) {
            *replyp = NULL;
        }
        return error;
    }

    error = nl_sock_transact(sock, request, replyp);

    nl_pool_release(sock);
    return error;
}
/* Sends the 'request' member of the 'n' transactions in 'transactions' on a
 * Netlink socket for the given 'protocol' (e.g. NETLINK_ROUTE or
 * NETLINK_GENERIC), in order, and receives responses to all of them.  Fills
 * in the 'error' member of each transaction with 0 if it was successful,
 * otherwise with a positive errno value.  If 'reply' is nonnull, then it will
 * be filled with the reply if the message receives a detailed reply.  In
 * other cases, i.e. where the request failed or had no reply beyond an
 * indication of success, 'reply' will be cleared if it is nonnull.
 *
 * The caller is responsible for destroying each request and reply, and the
 * transactions array itself.
 *
 * Before sending each message, this function will finalize nlmsg_len in each
 * 'request' to match the ofpbuf's size, set nlmsg_pid to the pid of the
 * socket used for the transaction, and initialize nlmsg_seq.
 *
 * Bare Netlink is an unreliable transport protocol.  This function layers
 * reliable delivery and reply semantics on top of bare Netlink.  See
 * nl_transact() for some caveats. */
void
nl_transact_multiple(int protocol,
                     struct nl_transaction **transactions, size_t n)
{
    struct nl_sock *sock;
    int error;

    error = nl_pool_alloc(protocol, &sock);
    if (!error) {
        nl_sock_transact_multiple(sock, transactions, n);
        nl_pool_release(sock);
    } else {
        /* Could not get a socket at all: mark every transaction failed. */
        nl_sock_record_errors__(transactions, n, error);
    }
}
1773 nl_sock_allocate_seq(struct nl_sock
*sock
, unsigned int n
)
1775 uint32_t seq
= sock
->next_seq
;
1777 sock
->next_seq
+= n
;
1779 /* Make it impossible for the next request for sequence numbers to wrap
1780 * around to 0. Start over with 1 to avoid ever using a sequence number of
1781 * 0, because the kernel uses sequence number 0 for notifications. */
1782 if (sock
->next_seq
>= UINT32_MAX
/ 2) {
1790 nlmsghdr_to_string(const struct nlmsghdr
*h
, int protocol
, struct ds
*ds
)
1796 static const struct nlmsg_flag flags
[] = {
1797 { NLM_F_REQUEST
, "REQUEST" },
1798 { NLM_F_MULTI
, "MULTI" },
1799 { NLM_F_ACK
, "ACK" },
1800 { NLM_F_ECHO
, "ECHO" },
1801 { NLM_F_DUMP
, "DUMP" },
1802 { NLM_F_ROOT
, "ROOT" },
1803 { NLM_F_MATCH
, "MATCH" },
1804 { NLM_F_ATOMIC
, "ATOMIC" },
1806 const struct nlmsg_flag
*flag
;
1807 uint16_t flags_left
;
1809 ds_put_format(ds
, "nl(len:%"PRIu32
", type=%"PRIu16
,
1810 h
->nlmsg_len
, h
->nlmsg_type
);
1811 if (h
->nlmsg_type
== NLMSG_NOOP
) {
1812 ds_put_cstr(ds
, "(no-op)");
1813 } else if (h
->nlmsg_type
== NLMSG_ERROR
) {
1814 ds_put_cstr(ds
, "(error)");
1815 } else if (h
->nlmsg_type
== NLMSG_DONE
) {
1816 ds_put_cstr(ds
, "(done)");
1817 } else if (h
->nlmsg_type
== NLMSG_OVERRUN
) {
1818 ds_put_cstr(ds
, "(overrun)");
1819 } else if (h
->nlmsg_type
< NLMSG_MIN_TYPE
) {
1820 ds_put_cstr(ds
, "(reserved)");
1821 } else if (protocol
== NETLINK_GENERIC
) {
1822 ds_put_format(ds
, "(%s)", genl_family_to_name(h
->nlmsg_type
));
1824 ds_put_cstr(ds
, "(family-defined)");
1826 ds_put_format(ds
, ", flags=%"PRIx16
, h
->nlmsg_flags
);
1827 flags_left
= h
->nlmsg_flags
;
1828 for (flag
= flags
; flag
< &flags
[ARRAY_SIZE(flags
)]; flag
++) {
1829 if ((flags_left
& flag
->bits
) == flag
->bits
) {
1830 ds_put_format(ds
, "[%s]", flag
->name
);
1831 flags_left
&= ~flag
->bits
;
1835 ds_put_format(ds
, "[OTHER:%"PRIx16
"]", flags_left
);
1837 ds_put_format(ds
, ", seq=%"PRIx32
", pid=%"PRIu32
,
1838 h
->nlmsg_seq
, h
->nlmsg_pid
);
1842 nlmsg_to_string(const struct ofpbuf
*buffer
, int protocol
)
1844 struct ds ds
= DS_EMPTY_INITIALIZER
;
1845 const struct nlmsghdr
*h
= ofpbuf_at(buffer
, 0, NLMSG_HDRLEN
);
1847 nlmsghdr_to_string(h
, protocol
, &ds
);
1848 if (h
->nlmsg_type
== NLMSG_ERROR
) {
1849 const struct nlmsgerr
*e
;
1850 e
= ofpbuf_at(buffer
, NLMSG_HDRLEN
,
1851 NLMSG_ALIGN(sizeof(struct nlmsgerr
)));
1853 ds_put_format(&ds
, " error(%d", e
->error
);
1855 ds_put_format(&ds
, "(%s)", ovs_strerror(-e
->error
));
1857 ds_put_cstr(&ds
, ", in-reply-to(");
1858 nlmsghdr_to_string(&e
->msg
, protocol
, &ds
);
1859 ds_put_cstr(&ds
, "))");
1861 ds_put_cstr(&ds
, " error(truncated)");
1863 } else if (h
->nlmsg_type
== NLMSG_DONE
) {
1864 int *error
= ofpbuf_at(buffer
, NLMSG_HDRLEN
, sizeof *error
);
1866 ds_put_format(&ds
, " done(%d", *error
);
1868 ds_put_format(&ds
, "(%s)", ovs_strerror(-*error
));
1870 ds_put_cstr(&ds
, ")");
1872 ds_put_cstr(&ds
, " done(truncated)");
1874 } else if (protocol
== NETLINK_GENERIC
) {
1875 struct genlmsghdr
*genl
= nl_msg_genlmsghdr(buffer
);
1877 ds_put_format(&ds
, ",genl(cmd=%"PRIu8
",version=%"PRIu8
")",
1878 genl
->cmd
, genl
->version
);
1882 ds_put_cstr(&ds
, "nl(truncated)");
1888 log_nlmsg(const char *function
, int error
,
1889 const void *message
, size_t size
, int protocol
)
1891 if (!VLOG_IS_DBG_ENABLED()) {
1895 struct ofpbuf buffer
= ofpbuf_const_initializer(message
, size
);
1896 char *nlmsg
= nlmsg_to_string(&buffer
, protocol
);
1897 VLOG_DBG_RL(&rl
, "%s (%s): %s", function
, ovs_strerror(error
), nlmsg
);