]>
git.proxmox.com Git - mirror_ovs.git/blob - lib/socket-util.c
2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
18 #include "socket-util.h"
19 #include <arpa/inet.h>
29 #include <sys/resource.h>
30 #include <sys/socket.h>
35 #include "dynamic-string.h"
36 #include "fatal-signal.h"
38 #include "poll-loop.h"
41 #if AF_PACKET && LINUX_DATAPATH
42 #include <linux/if_packet.h>
45 #include "netlink-protocol.h"
46 #include "netlink-socket.h"
49 VLOG_DEFINE_THIS_MODULE(socket_util
);
51 /* #ifdefs make it a pain to maintain code: you have to try to build both ways.
52 * Thus, this file compiles all of the code regardless of the target, by
53 * writing "if (LINUX_DATAPATH)" instead of "#ifdef __linux__". */
54 #ifndef LINUX_DATAPATH
55 #define LINUX_DATAPATH 0
62 static int getsockopt_int(int fd
, int level
, int option
, const char *optname
,
65 /* Sets 'fd' to non-blocking mode. Returns 0 if successful, otherwise a
66 * positive errno value. */
68 set_nonblocking(int fd
)
70 int flags
= fcntl(fd
, F_GETFL
, 0);
72 if (fcntl(fd
, F_SETFL
, flags
| O_NONBLOCK
) != -1) {
75 VLOG_ERR("fcntl(F_SETFL) failed: %s", ovs_strerror(errno
));
79 VLOG_ERR("fcntl(F_GETFL) failed: %s", ovs_strerror(errno
));
85 xset_nonblocking(int fd
)
87 if (set_nonblocking(fd
)) {
93 set_dscp(int fd
, uint8_t dscp
)
102 if (setsockopt(fd
, IPPROTO_IP
, IP_TOS
, &val
, sizeof val
)) {
110 rlim_is_finite(rlim_t limit
)
112 if (limit
== RLIM_INFINITY
) {
116 #ifdef RLIM_SAVED_CUR /* FreeBSD 8.0 lacks RLIM_SAVED_CUR. */
117 if (limit
== RLIM_SAVED_CUR
) {
122 #ifdef RLIM_SAVED_MAX /* FreeBSD 8.0 lacks RLIM_SAVED_MAX. */
123 if (limit
== RLIM_SAVED_MAX
) {
131 /* Returns the maximum valid FD value, plus 1. */
135 static int max_fds
= -1;
138 if (!getrlimit(RLIMIT_NOFILE
, &r
) && rlim_is_finite(r
.rlim_cur
)) {
139 max_fds
= r
.rlim_cur
;
141 VLOG_WARN("failed to obtain fd limit, defaulting to 1024");
148 /* Translates 'host_name', which must be a string representation of an IP
149 * address, into a numeric IP address in '*addr'. Returns 0 if successful,
150 * otherwise a positive errno value. */
152 lookup_ip(const char *host_name
, struct in_addr
*addr
)
154 if (!inet_aton(host_name
, addr
)) {
155 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(1, 5);
156 VLOG_ERR_RL(&rl
, "\"%s\" is not a valid IP address", host_name
);
162 /* Translates 'host_name', which must be a string representation of an IPv6
163 * address, into a numeric IPv6 address in '*addr'. Returns 0 if successful,
164 * otherwise a positive errno value. */
166 lookup_ipv6(const char *host_name
, struct in6_addr
*addr
)
168 if (inet_pton(AF_INET6
, host_name
, addr
) != 1) {
169 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(1, 5);
170 VLOG_ERR_RL(&rl
, "\"%s\" is not a valid IPv6 address", host_name
);
176 /* Translates 'host_name', which must be a host name or a string representation
177 * of an IP address, into a numeric IP address in '*addr'. Returns 0 if
178 * successful, otherwise a positive errno value.
180 * Most Open vSwitch code should not use this because it causes deadlocks:
181 * getaddrinfo() sends out a DNS request but that starts a new flow for which
182 * OVS must set up a flow, but it can't because it's waiting for a DNS reply.
183 * The synchronous lookup also delays other activity. (Of course we can solve
184 * this but it doesn't seem worthwhile quite yet.) */
186 lookup_hostname(const char *host_name
, struct in_addr
*addr
)
188 struct addrinfo
*result
;
189 struct addrinfo hints
;
191 if (inet_aton(host_name
, addr
)) {
195 memset(&hints
, 0, sizeof hints
);
196 hints
.ai_family
= AF_INET
;
198 switch (getaddrinfo(host_name
, NULL
, &hints
, &result
)) {
200 *addr
= ((struct sockaddr_in
*) result
->ai_addr
)->sin_addr
;
201 freeaddrinfo(result
);
204 #ifdef EAI_ADDRFAMILY
239 check_connection_completion(int fd
)
241 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 10);
246 pfd
.events
= POLLOUT
;
248 retval
= poll(&pfd
, 1, 0);
249 } while (retval
< 0 && errno
== EINTR
);
251 if (pfd
.revents
& POLLERR
) {
252 ssize_t n
= send(fd
, "", 1, MSG_DONTWAIT
);
256 VLOG_ERR_RL(&rl
, "poll return POLLERR but send succeeded");
261 } else if (retval
< 0) {
262 VLOG_ERR_RL(&rl
, "poll: %s", ovs_strerror(errno
));
269 /* Drain all the data currently in the receive queue of a datagram socket (and
270 * possibly additional data). There is no way to know how many packets are in
271 * the receive queue, but we do know that the total number of bytes queued does
272 * not exceed the receive buffer size, so we pull packets until none are left
273 * or we've read that many bytes. */
279 rcvbuf
= get_socket_rcvbuf(fd
);
285 /* In Linux, specifying MSG_TRUNC in the flags argument causes the
286 * datagram length to be returned, even if that is longer than the
287 * buffer provided. Thus, we can use a 1-byte buffer to discard the
288 * incoming datagram and still be able to account how many bytes were
289 * removed from the receive buffer.
291 * On other Unix-like OSes, MSG_TRUNC has no effect in the flags
293 char buffer
[LINUX_DATAPATH
? 1 : 2048];
294 ssize_t n_bytes
= recv(fd
, buffer
, sizeof buffer
,
295 MSG_TRUNC
| MSG_DONTWAIT
);
296 if (n_bytes
<= 0 || n_bytes
>= rcvbuf
) {
304 /* Returns the size of socket 'sock''s receive buffer (SO_RCVBUF), or a
305 * negative errno value if an error occurs. */
307 get_socket_rcvbuf(int sock
)
312 error
= getsockopt_int(sock
, SOL_SOCKET
, SO_RCVBUF
, "SO_RCVBUF", &rcvbuf
);
313 return error
? -error
: rcvbuf
;
316 /* Reads and discards up to 'n_packets' datagrams from 'fd', stopping as soon as
317 * no more data can be immediately read. ('fd' should therefore be in
318 * non-blocking mode.) */
320 drain_fd(int fd
, size_t n_packets
)
322 for (; n_packets
> 0; n_packets
--) {
323 /* 'buffer' only needs to be 1 byte long in most circumstances. This
324 * size is defensive against the possibility that we someday want to
325 * use a Linux tap device without TUN_NO_PI, in which case a buffer
326 * smaller than sizeof(struct tun_pi) will give EINVAL on read. */
328 if (read(fd
, buffer
, sizeof buffer
) <= 0) {
334 /* Stores in '*un' a sockaddr_un that refers to file 'name'. Stores in
335 * '*un_len' the size of the sockaddr_un. */
337 make_sockaddr_un__(const char *name
, struct sockaddr_un
*un
, socklen_t
*un_len
)
339 un
->sun_family
= AF_UNIX
;
340 ovs_strzcpy(un
->sun_path
, name
, sizeof un
->sun_path
);
341 *un_len
= (offsetof(struct sockaddr_un
, sun_path
)
342 + strlen (un
->sun_path
) + 1);
345 /* Stores in '*un' a sockaddr_un that refers to file 'name'. Stores in
346 * '*un_len' the size of the sockaddr_un.
348 * Returns 0 on success, otherwise a positive errno value. On success,
349 * '*dirfdp' is either -1 or a nonnegative file descriptor that the caller
350 * should close after using '*un' to bind or connect. On failure, '*dirfdp' is
353 make_sockaddr_un(const char *name
, struct sockaddr_un
*un
, socklen_t
*un_len
,
356 enum { MAX_UN_LEN
= sizeof un
->sun_path
- 1 };
359 if (strlen(name
) > MAX_UN_LEN
) {
360 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(1, 1);
362 if (LINUX_DATAPATH
) {
363 /* 'name' is too long to fit in a sockaddr_un, but we have a
364 * workaround for that on Linux: shorten it by opening a file
365 * descriptor for the directory part of the name and indirecting
366 * through /proc/self/fd/<dirfd>/<basename>. */
371 dir
= dir_name(name
);
372 base
= base_name(name
);
374 dirfd
= open(dir
, O_DIRECTORY
| O_RDONLY
);
381 short_name
= xasprintf("/proc/self/fd/%d/%s", dirfd
, base
);
385 if (strlen(short_name
) <= MAX_UN_LEN
) {
386 make_sockaddr_un__(short_name
, un
, un_len
);
394 VLOG_WARN_RL(&rl
, "Unix socket name %s is longer than maximum "
395 "%d bytes (even shortened)", name
, MAX_UN_LEN
);
397 /* 'name' is too long and we have no workaround. */
398 VLOG_WARN_RL(&rl
, "Unix socket name %s is longer than maximum "
399 "%d bytes", name
, MAX_UN_LEN
);
404 make_sockaddr_un__(name
, un
, un_len
);
409 /* Binds Unix domain socket 'fd' to a file with permissions 0700. */
411 bind_unix_socket(int fd
, struct sockaddr
*sun
, socklen_t sun_len
)
413 /* According to _Unix Network Programming_, umask should affect bind(). */
414 mode_t old_umask
= umask(0077);
415 int error
= bind(fd
, sun
, sun_len
) ? errno
: 0;
420 /* Creates a Unix domain socket in the given 'style' (either SOCK_DGRAM or
421 * SOCK_STREAM) that is bound to '*bind_path' (if 'bind_path' is non-null) and
422 * connected to '*connect_path' (if 'connect_path' is non-null). If 'nonblock'
423 * is true, the socket is made non-blocking.
425 * Returns the socket's fd if successful, otherwise a negative errno value. */
427 make_unix_socket(int style
, bool nonblock
,
428 const char *bind_path
, const char *connect_path
)
433 fd
= socket(PF_UNIX
, style
, 0);
438 /* Set nonblocking mode right away, if we want it. This prevents blocking
439 * in connect(), if connect_path != NULL. (In turn, that's a corner case:
440 * it will only happen if style is SOCK_STREAM or SOCK_SEQPACKET, and only
441 * if a backlog of un-accepted connections has built up in the kernel.) */
443 error
= set_nonblocking(fd
);
450 struct sockaddr_un un
;
454 if (unlink(bind_path
) && errno
!= ENOENT
) {
455 VLOG_WARN("unlinking \"%s\": %s\n",
456 bind_path
, ovs_strerror(errno
));
458 fatal_signal_add_file_to_unlink(bind_path
);
460 error
= make_sockaddr_un(bind_path
, &un
, &un_len
, &dirfd
);
462 error
= bind_unix_socket(fd
, (struct sockaddr
*) &un
, un_len
);
473 struct sockaddr_un un
;
477 error
= make_sockaddr_un(connect_path
, &un
, &un_len
, &dirfd
);
479 && connect(fd
, (struct sockaddr
*) &un
, un_len
)
480 && errno
!= EINPROGRESS
) {
494 if (error
== EAGAIN
) {
498 fatal_signal_unlink_file_now(bind_path
);
505 get_unix_name_len(socklen_t sun_len
)
507 return (sun_len
>= offsetof(struct sockaddr_un
, sun_path
)
508 ? sun_len
- offsetof(struct sockaddr_un
, sun_path
)
513 guess_netmask(ovs_be32 ip_
)
515 uint32_t ip
= ntohl(ip_
);
516 return ((ip
>> 31) == 0 ? htonl(0xff000000) /* Class A */
517 : (ip
>> 30) == 2 ? htonl(0xffff0000) /* Class B */
518 : (ip
>> 29) == 6 ? htonl(0xffffff00) /* Class C */
519 : htonl(0)); /* ??? */
522 /* Parses 'target', which should be a string in the format "<host>[:<port>]".
523 * <host> is required. If 'default_port' is nonzero then <port> is optional
524 * and defaults to 'default_port'.
526 * On success, returns true and stores the parsed remote address into '*sinp'.
527 * On failure, logs an error, stores zeros into '*sinp', and returns false. */
529 inet_parse_active(const char *target_
, uint16_t default_port
,
530 struct sockaddr_in
*sinp
)
532 char *target
= xstrdup(target_
);
533 char *save_ptr
= NULL
;
534 const char *host_name
;
535 const char *port_string
;
539 sinp
->sin_family
= AF_INET
;
540 sinp
->sin_port
= htons(default_port
);
543 host_name
= strtok_r(target
, ":", &save_ptr
);
544 port_string
= strtok_r(NULL
, ":", &save_ptr
);
546 VLOG_ERR("%s: bad peer name format", target_
);
550 /* Look up IP, port. */
551 if (lookup_ip(host_name
, &sinp
->sin_addr
)) {
554 if (port_string
&& atoi(port_string
)) {
555 sinp
->sin_port
= htons(atoi(port_string
));
556 } else if (!default_port
) {
557 VLOG_ERR("%s: port number must be specified", target_
);
565 memset(sinp
, 0, sizeof *sinp
);
571 /* Opens a non-blocking IPv4 socket of the specified 'style' and connects to
572 * 'target', which should be a string in the format "<host>[:<port>]". <host>
573 * is required. If 'default_port' is nonzero then <port> is optional and
574 * defaults to 'default_port'.
576 * 'style' should be SOCK_STREAM (for TCP) or SOCK_DGRAM (for UDP).
578 * On success, returns 0 (indicating connection complete) or EAGAIN (indicating
579 * connection in progress), in which case the new file descriptor is stored
580 * into '*fdp'. On failure, returns a positive errno value other than EAGAIN
581 * and stores -1 into '*fdp'.
583 * If 'sinp' is non-null, then on success the target address is stored into
586 * 'dscp' becomes the DSCP bits in the IP headers for the new connection. It
587 * should be in the range [0, 63] and will automatically be shifted to the
588 * appropriate place in the IP TOS field. */
590 inet_open_active(int style
, const char *target
, uint16_t default_port
,
591 struct sockaddr_in
*sinp
, int *fdp
, uint8_t dscp
)
593 struct sockaddr_in sin
;
598 if (!inet_parse_active(target
, default_port
, &sin
)) {
599 error
= EAFNOSUPPORT
;
603 /* Create non-blocking socket. */
604 fd
= socket(AF_INET
, style
, 0);
606 VLOG_ERR("%s: socket: %s", target
, ovs_strerror(errno
));
610 error
= set_nonblocking(fd
);
615 /* The dscp bits must be configured before connect() to ensure that the TOS
616 * field is set during the connection establishment. If set after
617 * connect(), the handshake SYN frames will be sent with a TOS of 0. */
618 error
= set_dscp(fd
, dscp
);
620 VLOG_ERR("%s: socket: %s", target
, ovs_strerror(error
));
625 error
= connect(fd
, (struct sockaddr
*) &sin
, sizeof sin
) == 0 ? 0 : errno
;
626 if (error
== EINPROGRESS
) {
631 if (!error
|| error
== EAGAIN
) {
635 } else if (fd
>= 0) {
643 /* Parses 'target', which should be a string in the format "[<port>][:<ip>]":
645 * - If 'default_port' is -1, then <port> is required. Otherwise, if
646 * <port> is omitted, then 'default_port' is used instead.
648 * - If <port> (or 'default_port', if used) is 0, then no port is bound
649 * and the TCP/IP stack will select a port.
651 * - If <ip> is omitted then the IP address is wildcarded.
653 * If successful, stores the address into '*sinp' and returns true; otherwise
654 * zeros '*sinp' and returns false. */
656 inet_parse_passive(const char *target_
, int default_port
,
657 struct sockaddr_in
*sinp
)
659 char *target
= xstrdup(target_
);
660 char *string_ptr
= target
;
661 const char *host_name
;
662 const char *port_string
;
666 /* Address defaults. */
667 memset(sinp
, 0, sizeof *sinp
);
668 sinp
->sin_family
= AF_INET
;
669 sinp
->sin_addr
.s_addr
= htonl(INADDR_ANY
);
670 sinp
->sin_port
= htons(default_port
);
672 /* Parse optional port number. */
673 port_string
= strsep(&string_ptr
, ":");
674 if (port_string
&& str_to_int(port_string
, 10, &port
)) {
675 sinp
->sin_port
= htons(port
);
676 } else if (default_port
< 0) {
677 VLOG_ERR("%s: port number must be specified", target_
);
681 /* Parse optional bind IP. */
682 host_name
= strsep(&string_ptr
, ":");
683 if (host_name
&& host_name
[0] && lookup_ip(host_name
, &sinp
->sin_addr
)) {
691 memset(sinp
, 0, sizeof *sinp
);
698 /* Opens a non-blocking IPv4 socket of the specified 'style', binds to
699 * 'target', and listens for incoming connections. Parses 'target' in the same
700 * way as inet_parse_passive().
702 * 'style' should be SOCK_STREAM (for TCP) or SOCK_DGRAM (for UDP).
704 * For TCP, the socket will have SO_REUSEADDR turned on.
706 * On success, returns a non-negative file descriptor. On failure, returns a
707 * negative errno value.
709 * If 'sinp' is non-null, then on success the bound address is stored into
712 * 'dscp' becomes the DSCP bits in the IP headers for the new connection. It
713 * should be in the range [0, 63] and will automatically be shifted to the
714 * appropriate place in the IP TOS field. */
716 inet_open_passive(int style
, const char *target
, int default_port
,
717 struct sockaddr_in
*sinp
, uint8_t dscp
)
719 bool kernel_chooses_port
;
720 struct sockaddr_in sin
;
722 unsigned int yes
= 1;
724 if (!inet_parse_passive(target
, default_port
, &sin
)) {
725 return -EAFNOSUPPORT
;
728 /* Create non-blocking socket, set SO_REUSEADDR. */
729 fd
= socket(AF_INET
, style
, 0);
732 VLOG_ERR("%s: socket: %s", target
, ovs_strerror(error
));
735 error
= set_nonblocking(fd
);
739 if (style
== SOCK_STREAM
740 && setsockopt(fd
, SOL_SOCKET
, SO_REUSEADDR
, &yes
, sizeof yes
) < 0) {
742 VLOG_ERR("%s: setsockopt(SO_REUSEADDR): %s",
743 target
, ovs_strerror(error
));
748 if (bind(fd
, (struct sockaddr
*) &sin
, sizeof sin
) < 0) {
750 VLOG_ERR("%s: bind: %s", target
, ovs_strerror(error
));
754 /* The dscp bits must be configured before connect() to ensure that the TOS
755 * field is set during the connection establishment. If set after
756 * connect(), the handshake SYN frames will be sent with a TOS of 0. */
757 error
= set_dscp(fd
, dscp
);
759 VLOG_ERR("%s: socket: %s", target
, ovs_strerror(error
));
764 if (style
== SOCK_STREAM
&& listen(fd
, 10) < 0) {
766 VLOG_ERR("%s: listen: %s", target
, ovs_strerror(error
));
770 kernel_chooses_port
= sin
.sin_port
== htons(0);
771 if (sinp
|| kernel_chooses_port
) {
772 socklen_t sin_len
= sizeof sin
;
773 if (getsockname(fd
, (struct sockaddr
*) &sin
, &sin_len
) < 0) {
775 VLOG_ERR("%s: getsockname: %s", target
, ovs_strerror(error
));
778 if (sin
.sin_family
!= AF_INET
|| sin_len
!= sizeof sin
) {
779 error
= EAFNOSUPPORT
;
780 VLOG_ERR("%s: getsockname: invalid socket name", target
);
786 if (kernel_chooses_port
) {
787 VLOG_INFO("%s: listening on port %"PRIu16
,
788 target
, ntohs(sin
.sin_port
));
799 /* Returns a readable and writable fd for /dev/null, if successful, otherwise
800 * a negative errno value. The caller must not close the returned fd (because
801 * the same fd will be handed out to subsequent callers). */
805 static int null_fd
= -1;
807 null_fd
= open("/dev/null", O_RDWR
);
810 VLOG_ERR("could not open /dev/null: %s", ovs_strerror(error
));
818 read_fully(int fd
, void *p_
, size_t size
, size_t *bytes_read
)
824 ssize_t retval
= read(fd
, p
, size
);
826 *bytes_read
+= retval
;
829 } else if (retval
== 0) {
831 } else if (errno
!= EINTR
) {
839 write_fully(int fd
, const void *p_
, size_t size
, size_t *bytes_written
)
841 const uint8_t *p
= p_
;
845 ssize_t retval
= write(fd
, p
, size
);
847 *bytes_written
+= retval
;
850 } else if (retval
== 0) {
851 VLOG_WARN("write returned 0");
853 } else if (errno
!= EINTR
) {
860 /* Given file name 'file_name', fsyncs the directory in which it is contained.
861 * Returns 0 if successful, otherwise a positive errno value. */
863 fsync_parent_dir(const char *file_name
)
869 dir
= dir_name(file_name
);
870 fd
= open(dir
, O_RDONLY
);
873 if (errno
== EINVAL
|| errno
== EROFS
) {
874 /* This directory does not support synchronization. Not
875 * really an error. */
878 VLOG_ERR("%s: fsync failed (%s)", dir
, ovs_strerror(error
));
884 VLOG_ERR("%s: open failed (%s)", dir
, ovs_strerror(error
));
891 /* Obtains the modification time of the file named 'file_name' to the greatest
892 * supported precision. If successful, stores the mtime in '*mtime' and
893 * returns 0. On error, returns a positive errno value and stores zeros in
896 get_mtime(const char *file_name
, struct timespec
*mtime
)
900 if (!stat(file_name
, &s
)) {
901 mtime
->tv_sec
= s
.st_mtime
;
903 #if HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC
904 mtime
->tv_nsec
= s
.st_mtim
.tv_nsec
;
905 #elif HAVE_STRUCT_STAT_ST_MTIMENSEC
906 mtime
->tv_nsec
= s
.st_mtimensec
;
913 mtime
->tv_sec
= mtime
->tv_nsec
= 0;
922 VLOG_FATAL("failed to create pipe (%s)", ovs_strerror(errno
));
927 xpipe_nonblocking(int fds
[2])
930 xset_nonblocking(fds
[0]);
931 xset_nonblocking(fds
[1]);
935 xsocketpair(int domain
, int type
, int protocol
, int fds
[2])
937 if (socketpair(domain
, type
, protocol
, fds
)) {
938 VLOG_FATAL("failed to create socketpair (%s)", ovs_strerror(errno
));
943 getsockopt_int(int fd
, int level
, int option
, const char *optname
, int *valuep
)
945 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 10);
951 if (getsockopt(fd
, level
, option
, &value
, &len
)) {
953 VLOG_ERR_RL(&rl
, "getsockopt(%s): %s", optname
, ovs_strerror(error
));
954 } else if (len
!= sizeof value
) {
956 VLOG_ERR_RL(&rl
, "getsockopt(%s): value is %u bytes (expected %zu)",
957 optname
, (unsigned int) len
, sizeof value
);
962 *valuep
= error
? 0 : value
;
967 describe_sockaddr(struct ds
*string
, int fd
,
968 int (*getaddr
)(int, struct sockaddr
*, socklen_t
*))
970 struct sockaddr_storage ss
;
971 socklen_t len
= sizeof ss
;
973 if (!getaddr(fd
, (struct sockaddr
*) &ss
, &len
)) {
974 if (ss
.ss_family
== AF_INET
) {
975 struct sockaddr_in sin
;
977 memcpy(&sin
, &ss
, sizeof sin
);
978 ds_put_format(string
, IP_FMT
":%"PRIu16
,
979 IP_ARGS(sin
.sin_addr
.s_addr
), ntohs(sin
.sin_port
));
980 } else if (ss
.ss_family
== AF_UNIX
) {
981 struct sockaddr_un sun
;
985 memcpy(&sun
, &ss
, sizeof sun
);
986 maxlen
= len
- offsetof(struct sockaddr_un
, sun_path
);
987 null
= memchr(sun
.sun_path
, '\0', maxlen
);
988 ds_put_buffer(string
, sun
.sun_path
,
989 null
? null
- sun
.sun_path
: maxlen
);
992 else if (ss
.ss_family
== AF_NETLINK
) {
995 /* SO_PROTOCOL was introduced in 2.6.32. Support it regardless of the version
996 * of the Linux kernel headers in use at build time. */
998 #define SO_PROTOCOL 38
1001 if (!getsockopt_int(fd
, SOL_SOCKET
, SO_PROTOCOL
, "SO_PROTOCOL",
1005 ds_put_cstr(string
, "NETLINK_ROUTE");
1008 case NETLINK_GENERIC
:
1009 ds_put_cstr(string
, "NETLINK_GENERIC");
1013 ds_put_format(string
, "AF_NETLINK family %d", protocol
);
1017 ds_put_cstr(string
, "AF_NETLINK");
1021 #if AF_PACKET && LINUX_DATAPATH
1022 else if (ss
.ss_family
== AF_PACKET
) {
1023 struct sockaddr_ll sll
;
1025 memcpy(&sll
, &ss
, sizeof sll
);
1026 ds_put_cstr(string
, "AF_PACKET");
1027 if (sll
.sll_ifindex
) {
1028 char name
[IFNAMSIZ
];
1030 if (if_indextoname(sll
.sll_ifindex
, name
)) {
1031 ds_put_format(string
, "(%s)", name
);
1033 ds_put_format(string
, "(ifindex=%d)", sll
.sll_ifindex
);
1036 if (sll
.sll_protocol
) {
1037 ds_put_format(string
, "(protocol=0x%"PRIu16
")",
1038 ntohs(sll
.sll_protocol
));
1042 else if (ss
.ss_family
== AF_UNSPEC
) {
1043 ds_put_cstr(string
, "AF_UNSPEC");
1045 ds_put_format(string
, "AF_%d", (int) ss
.ss_family
);
1051 #ifdef LINUX_DATAPATH
1053 put_fd_filename(struct ds
*string
, int fd
)
1059 linkname
= xasprintf("/proc/self/fd/%d", fd
);
1060 n
= readlink(linkname
, buf
, sizeof buf
);
1062 ds_put_char(string
, ' ');
1063 ds_put_buffer(string
, buf
, n
);
1064 if (n
> sizeof buf
) {
1065 ds_put_cstr(string
, "...");
1072 /* Returns a malloc()'d string describing 'fd', for use in logging. */
1080 if (fstat(fd
, &s
)) {
1081 ds_put_format(&string
, "fstat failed (%s)", ovs_strerror(errno
));
1082 } else if (S_ISSOCK(s
.st_mode
)) {
1083 describe_sockaddr(&string
, fd
, getsockname
);
1084 ds_put_cstr(&string
, "<->");
1085 describe_sockaddr(&string
, fd
, getpeername
);
1087 ds_put_cstr(&string
, (isatty(fd
) ? "tty"
1088 : S_ISDIR(s
.st_mode
) ? "directory"
1089 : S_ISCHR(s
.st_mode
) ? "character device"
1090 : S_ISBLK(s
.st_mode
) ? "block device"
1091 : S_ISREG(s
.st_mode
) ? "file"
1092 : S_ISFIFO(s
.st_mode
) ? "FIFO"
1093 : S_ISLNK(s
.st_mode
) ? "symbolic link"
1095 #ifdef LINUX_DATAPATH
1096 put_fd_filename(&string
, fd
);
1099 return ds_steal_cstr(&string
);
1102 /* Returns the total of the 'iov_len' members of the 'n_iovs' in 'iovs'.
1103 * The caller must ensure that the total does not exceed SIZE_MAX. */
1105 iovec_len(const struct iovec iovs
[], size_t n_iovs
)
1110 for (i
= 0; i
< n_iovs
; i
++) {
1111 len
+= iovs
[i
].iov_len
;
1116 /* Returns true if all of the 'n_iovs' iovecs in 'iovs' have length zero. */
1118 iovec_is_empty(const struct iovec iovs
[], size_t n_iovs
)
1122 for (i
= 0; i
< n_iovs
; i
++) {
1123 if (iovs
[i
].iov_len
) {
1130 /* Sends the 'n_iovs' iovecs of data in 'iovs' and the 'n_fds' file descriptors
1131 * in 'fds' on Unix domain socket 'sock'. Returns the number of bytes
1132 * successfully sent or -1 if an error occurred. On error, sets errno
1135 send_iovec_and_fds(int sock
,
1136 const struct iovec
*iovs
, size_t n_iovs
,
1137 const int fds
[], size_t n_fds
)
1139 ovs_assert(sock
>= 0);
1143 char control
[CMSG_SPACE(SOUTIL_MAX_FDS
* sizeof *fds
)];
1147 ovs_assert(!iovec_is_empty(iovs
, n_iovs
));
1148 ovs_assert(n_fds
<= SOUTIL_MAX_FDS
);
1150 memset(&cmsg
, 0, sizeof cmsg
);
1151 cmsg
.cm
.cmsg_len
= CMSG_LEN(n_fds
* sizeof *fds
);
1152 cmsg
.cm
.cmsg_level
= SOL_SOCKET
;
1153 cmsg
.cm
.cmsg_type
= SCM_RIGHTS
;
1154 memcpy(CMSG_DATA(&cmsg
.cm
), fds
, n_fds
* sizeof *fds
);
1156 msg
.msg_name
= NULL
;
1157 msg
.msg_namelen
= 0;
1158 msg
.msg_iov
= CONST_CAST(struct iovec
*, iovs
);
1159 msg
.msg_iovlen
= n_iovs
;
1160 msg
.msg_control
= &cmsg
.cm
;
1161 msg
.msg_controllen
= CMSG_SPACE(n_fds
* sizeof *fds
);
1164 return sendmsg(sock
, &msg
, 0);
1166 return writev(sock
, iovs
, n_iovs
);
1170 /* Sends the 'n_iovs' iovecs of data in 'iovs' and the 'n_fds' file descriptors
1171 * in 'fds' on Unix domain socket 'sock'. If 'skip_bytes' is nonzero, then the
1172 * first 'skip_bytes' of data in the iovecs are not sent, and none of the file
1173 * descriptors are sent. The function continues to retry sending until an
1174 * error (other than EINTR) occurs or all the data and fds are sent.
1176 * Returns 0 if all the data and fds were successfully sent, otherwise a
1177 * positive errno value. Regardless of success, stores the number of bytes
1178 * sent (always at least 'skip_bytes') in '*bytes_sent'. (If at least one byte
1179 * is sent, then all the fds have been sent.)
1181 * 'skip_bytes' must be less than or equal to iovec_len(iovs, n_iovs). */
1183 send_iovec_and_fds_fully(int sock
,
1184 const struct iovec iovs
[], size_t n_iovs
,
1185 const int fds
[], size_t n_fds
,
1186 size_t skip_bytes
, size_t *bytes_sent
)
1189 while (n_iovs
> 0) {
1193 retval
= skip_bytes
;
1195 } else if (!*bytes_sent
) {
1196 retval
= send_iovec_and_fds(sock
, iovs
, n_iovs
, fds
, n_fds
);
1198 retval
= writev(sock
, iovs
, n_iovs
);
1202 *bytes_sent
+= retval
;
1203 while (retval
> 0) {
1204 const uint8_t *base
= iovs
->iov_base
;
1205 size_t len
= iovs
->iov_len
;
1211 error
= write_fully(sock
, base
+ retval
, len
- retval
,
1213 *bytes_sent
+= sent
;
1223 } else if (retval
== 0) {
1224 if (iovec_is_empty(iovs
, n_iovs
)) {
1227 VLOG_WARN("send returned 0");
1229 } else if (errno
!= EINTR
) {
1237 /* Sends the 'n_iovs' iovecs of data in 'iovs' and the 'n_fds' file descriptors
1238 * in 'fds' on Unix domain socket 'sock'. The function continues to retry
1239 * sending until an error (other than EAGAIN or EINTR) occurs or all the data
1240 * and fds are sent. Upon EAGAIN, the function blocks until the socket is
1241 * ready for more data.
1243 * Returns 0 if all the data and fds were successfully sent, otherwise a
1244 * positive errno value. */
1246 send_iovec_and_fds_fully_block(int sock
,
1247 const struct iovec iovs
[], size_t n_iovs
,
1248 const int fds
[], size_t n_fds
)
1255 error
= send_iovec_and_fds_fully(sock
, iovs
, n_iovs
,
1256 fds
, n_fds
, sent
, &sent
);
1257 if (error
!= EAGAIN
) {
1260 poll_fd_wait(sock
, POLLOUT
);
1265 /* Attempts to receive from Unix domain socket 'sock' up to 'size' bytes of
1266 * data into 'data' and up to SOUTIL_MAX_FDS file descriptors into 'fds'.
1268 * - Upon success, returns the number of bytes of data copied into 'data'
1269 * and stores the number of received file descriptors into '*n_fdsp'.
1271 * - On failure, returns a negative errno value and stores 0 in
1274 * - On EOF, returns 0 and stores 0 in '*n_fdsp'. */
1276 recv_data_and_fds(int sock
,
1277 void *data
, size_t size
,
1278 int fds
[SOUTIL_MAX_FDS
], size_t *n_fdsp
)
1282 char control
[CMSG_SPACE(SOUTIL_MAX_FDS
* sizeof *fds
)];
1294 iov
.iov_base
= data
;
1297 msg
.msg_name
= NULL
;
1298 msg
.msg_namelen
= 0;
1301 msg
.msg_control
= &cmsg
.cm
;
1302 msg
.msg_controllen
= sizeof cmsg
.control
;
1305 retval
= recvmsg(sock
, &msg
, 0);
1306 } while (retval
< 0 && errno
== EINTR
);
1308 return retval
< 0 ? -errno
: 0;
1311 for (p
= CMSG_FIRSTHDR(&msg
); p
; p
= CMSG_NXTHDR(&msg
, p
)) {
1312 if (p
->cmsg_level
!= SOL_SOCKET
|| p
->cmsg_type
!= SCM_RIGHTS
) {
1313 VLOG_ERR("unexpected control message %d:%d",
1314 p
->cmsg_level
, p
->cmsg_type
);
1316 } else if (*n_fdsp
) {
1317 VLOG_ERR("multiple SCM_RIGHTS received");
1320 size_t n_fds
= (p
->cmsg_len
- CMSG_LEN(0)) / sizeof *fds
;
1321 const int *fds_data
= (const int *) CMSG_DATA(p
);
1323 ovs_assert(n_fds
> 0);
1324 if (n_fds
> SOUTIL_MAX_FDS
) {
1325 VLOG_ERR("%zu fds received but only %d supported",
1326 n_fds
, SOUTIL_MAX_FDS
);
1327 for (i
= 0; i
< n_fds
; i
++) {
1334 memcpy(fds
, fds_data
, n_fds
* sizeof *fds
);
1341 for (i
= 0; i
< *n_fdsp
; i
++) {