2 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
18 #include "socket-util.h"
19 #include <arpa/inet.h>
29 #include <sys/ioctl.h>
30 #include <sys/resource.h>
31 #include <sys/socket.h>
36 #include "dynamic-string.h"
37 #include "fatal-signal.h"
38 #include "ovs-thread.h"
40 #include "poll-loop.h"
43 #if AF_PACKET && LINUX_DATAPATH
44 #include <linux/if_packet.h>
47 #include "netlink-protocol.h"
48 #include "netlink-socket.h"
51 VLOG_DEFINE_THIS_MODULE(socket_util
);
53 /* #ifdefs make it a pain to maintain code: you have to try to build both ways.
54 * Thus, this file compiles all of the code regardless of the target, by
55 * writing "if (LINUX_DATAPATH)" instead of "#ifdef __linux__". */
56 #ifndef LINUX_DATAPATH
57 #define LINUX_DATAPATH 0
64 /* Maximum length of the sun_path member in a struct sockaddr_un, excluding
65 * space for a null terminator. */
66 #define MAX_UN_LEN (sizeof(((struct sockaddr_un *) 0)->sun_path) - 1)
68 static int getsockopt_int(int fd
, int level
, int option
, const char *optname
,
71 /* Sets 'fd' to non-blocking mode. Returns 0 if successful, otherwise a
72 * positive errno value. */
74 set_nonblocking(int fd
)
77 int flags
= fcntl(fd
, F_GETFL
, 0);
79 if (fcntl(fd
, F_SETFL
, flags
| O_NONBLOCK
) != -1) {
82 VLOG_ERR("fcntl(F_SETFL) failed: %s", ovs_strerror(errno
));
86 VLOG_ERR("fcntl(F_GETFL) failed: %s", ovs_strerror(errno
));
90 unsigned long arg
= 1;
91 if (ioctlsocket(fd
, FIONBIO
, &arg
)) {
92 int error
= sock_errno();
93 VLOG_ERR("set_nonblocking failed: %s", sock_strerror(error
));
101 xset_nonblocking(int fd
)
103 if (set_nonblocking(fd
)) {
109 set_dscp(int fd
, uint8_t dscp
)
118 if (setsockopt(fd
, IPPROTO_IP
, IP_TOS
, &val
, sizeof val
)) {
125 /* Translates 'host_name', which must be a string representation of an IP
126 * address, into a numeric IP address in '*addr'. Returns 0 if successful,
127 * otherwise a positive errno value. */
129 lookup_ip(const char *host_name
, struct in_addr
*addr
)
131 if (!inet_pton(AF_INET
, host_name
, addr
)) {
132 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(1, 5);
133 VLOG_ERR_RL(&rl
, "\"%s\" is not a valid IP address", host_name
);
139 /* Translates 'host_name', which must be a string representation of an IPv6
140 * address, into a numeric IPv6 address in '*addr'. Returns 0 if successful,
141 * otherwise a positive errno value. */
143 lookup_ipv6(const char *host_name
, struct in6_addr
*addr
)
145 if (inet_pton(AF_INET6
, host_name
, addr
) != 1) {
146 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(1, 5);
147 VLOG_ERR_RL(&rl
, "\"%s\" is not a valid IPv6 address", host_name
);
153 /* Translates 'host_name', which must be a host name or a string representation
154 * of an IP address, into a numeric IP address in '*addr'. Returns 0 if
155 * successful, otherwise a positive errno value.
157 * Most Open vSwitch code should not use this because it causes deadlocks:
158 * getaddrinfo() sends out a DNS request but that starts a new flow for which
159 * OVS must set up a flow, but it can't because it's waiting for a DNS reply.
160 * The synchronous lookup also delays other activity. (Of course we can solve
161 * this but it doesn't seem worthwhile quite yet.) */
163 lookup_hostname(const char *host_name
, struct in_addr
*addr
)
165 struct addrinfo
*result
;
166 struct addrinfo hints
;
168 if (inet_pton(AF_INET
, host_name
, addr
)) {
172 memset(&hints
, 0, sizeof hints
);
173 hints
.ai_family
= AF_INET
;
175 switch (getaddrinfo(host_name
, NULL
, &hints
, &result
)) {
177 *addr
= ALIGNED_CAST(struct sockaddr_in
*,
178 result
->ai_addr
)->sin_addr
;
179 freeaddrinfo(result
);
182 #ifdef EAI_ADDRFAMILY
203 #if defined (EAI_NODATA) && EAI_NODATA != EAI_NONAME
219 check_connection_completion(int fd
)
221 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 10);
226 pfd
.events
= POLLOUT
;
230 retval
= poll(&pfd
, 1, 0);
231 } while (retval
< 0 && errno
== EINTR
);
233 retval
= WSAPoll(&pfd
, 1, 0);
236 if (pfd
.revents
& POLLERR
) {
237 ssize_t n
= send(fd
, "", 1, 0);
241 VLOG_ERR_RL(&rl
, "poll return POLLERR but send succeeded");
246 } else if (retval
< 0) {
247 VLOG_ERR_RL(&rl
, "poll: %s", sock_strerror(sock_errno()));
254 /* Drain all the data currently in the receive queue of a datagram socket (and
255 * possibly additional data). There is no way to know how many packets are in
256 * the receive queue, but we do know that the total number of bytes queued does
257 * not exceed the receive buffer size, so we pull packets until none are left
258 * or we've read that many bytes. */
264 rcvbuf
= get_socket_rcvbuf(fd
);
270 /* In Linux, specifying MSG_TRUNC in the flags argument causes the
271 * datagram length to be returned, even if that is longer than the
272 * buffer provided. Thus, we can use a 1-byte buffer to discard the
273 * incoming datagram and still be able to account how many bytes were
274 * removed from the receive buffer.
276 * On other Unix-like OSes, MSG_TRUNC has no effect in the flags
278 char buffer
[LINUX_DATAPATH
? 1 : 2048];
279 ssize_t n_bytes
= recv(fd
, buffer
, sizeof buffer
, MSG_TRUNC
);
280 if (n_bytes
<= 0 || n_bytes
>= rcvbuf
) {
288 /* Returns the size of socket 'sock''s receive buffer (SO_RCVBUF), or a
289 * negative errno value if an error occurs. */
291 get_socket_rcvbuf(int sock
)
296 error
= getsockopt_int(sock
, SOL_SOCKET
, SO_RCVBUF
, "SO_RCVBUF", &rcvbuf
);
297 return error
? -error
: rcvbuf
;
300 /* Reads and discards up to 'n_packets' datagrams from 'fd', stopping as soon
301 * as no more data can be immediately read. ('fd' should therefore be in
302 * non-blocking mode.) */
304 drain_fd(int fd
, size_t n_packets
)
306 for (; n_packets
> 0; n_packets
--) {
307 /* 'buffer' only needs to be 1 byte long in most circumstances. This
308 * size is defensive against the possibility that we someday want to
309 * use a Linux tap device without TUN_NO_PI, in which case a buffer
310 * smaller than sizeof(struct tun_pi) will give EINVAL on read. */
312 if (read(fd
, buffer
, sizeof buffer
) <= 0) {
319 /* Attempts to shorten 'name' by opening a file descriptor for the directory
320 * part of the name and indirecting through /proc/self/fd/<dirfd>/<basename>.
321 * On systems with Linux-like /proc, this works as long as <basename> isn't too
324 * On success, returns 0 and stores the short name in 'short_name' and a
325 * directory file descriptor to eventually be closed in '*dirfdp'. */
327 shorten_name_via_proc(const char *name
, char short_name
[MAX_UN_LEN
+ 1],
334 if (!LINUX_DATAPATH
) {
338 dir
= dir_name(name
);
339 dirfd
= open(dir
, O_DIRECTORY
| O_RDONLY
);
341 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(1, 1);
344 VLOG_WARN_RL(&rl
, "%s: open failed (%s)", dir
, ovs_strerror(error
));
351 base
= base_name(name
);
352 len
= snprintf(short_name
, MAX_UN_LEN
+ 1,
353 "/proc/self/fd/%d/%s", dirfd
, base
);
356 if (len
>= 0 && len
<= MAX_UN_LEN
) {
365 /* Attempts to shorten 'name' by creating a symlink for the directory part of
366 * the name and indirecting through <symlink>/<basename>. This works on
367 * systems that support symlinks, as long as <basename> isn't too long.
369 * On success, returns 0 and stores the short name in 'short_name' and the
370 * symbolic link to eventually delete in 'linkname'. */
372 shorten_name_via_symlink(const char *name
, char short_name
[MAX_UN_LEN
+ 1],
373 char linkname
[MAX_UN_LEN
+ 1])
375 char *abs
, *dir
, *base
;
380 abs
= abs_file_name(NULL
, name
);
382 base
= base_name(abs
);
385 tmpdir
= getenv("TMPDIR");
386 if (tmpdir
== NULL
) {
390 for (i
= 0; i
< 1000; i
++) {
393 len
= snprintf(linkname
, MAX_UN_LEN
+ 1,
394 "%s/ovs-un-c-%"PRIu32
, tmpdir
, random_uint32());
395 error
= (len
< 0 || len
> MAX_UN_LEN
? ENAMETOOLONG
396 : symlink(dir
, linkname
) ? errno
398 if (error
!= EEXIST
) {
406 fatal_signal_add_file_to_unlink(linkname
);
408 len
= snprintf(short_name
, MAX_UN_LEN
+ 1, "%s/%s", linkname
, base
);
409 if (len
< 0 || len
> MAX_UN_LEN
) {
410 fatal_signal_unlink_file_now(linkname
);
411 error
= ENAMETOOLONG
;
424 /* Stores in '*un' a sockaddr_un that refers to file 'name'. Stores in
425 * '*un_len' the size of the sockaddr_un.
427 * Returns 0 on success, otherwise a positive errno value.
429 * Uses '*dirfdp' and 'linkname' to store references to data when the caller no
430 * longer needs to use 'un'. On success, freeing these references with
431 * free_sockaddr_un() is mandatory to avoid a leak; on failure, freeing them is
432 * unnecessary but harmless. */
434 make_sockaddr_un(const char *name
, struct sockaddr_un
*un
, socklen_t
*un_len
,
435 int *dirfdp
, char linkname
[MAX_UN_LEN
+ 1])
437 char short_name
[MAX_UN_LEN
+ 1];
441 if (strlen(name
) > MAX_UN_LEN
) {
442 /* 'name' is too long to fit in a sockaddr_un. Try a workaround. */
443 int error
= shorten_name_via_proc(name
, short_name
, dirfdp
);
444 if (error
== ENAMETOOLONG
) {
445 error
= shorten_name_via_symlink(name
, short_name
, linkname
);
448 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(1, 1);
450 VLOG_WARN_RL(&rl
, "Unix socket name %s is longer than maximum "
451 "%"PRIuSIZE
" bytes", name
, MAX_UN_LEN
);
458 un
->sun_family
= AF_UNIX
;
459 ovs_strzcpy(un
->sun_path
, name
, sizeof un
->sun_path
);
460 *un_len
= (offsetof(struct sockaddr_un
, sun_path
)
461 + strlen (un
->sun_path
) + 1);
465 /* Clean up after make_sockaddr_un(). */
467 free_sockaddr_un(int dirfd
, const char *linkname
)
473 fatal_signal_unlink_file_now(linkname
);
477 /* Binds Unix domain socket 'fd' to a file with permissions 0700. */
479 bind_unix_socket(int fd
, struct sockaddr
*sun
, socklen_t sun_len
)
481 /* According to _Unix Network Programming_, umask should affect bind(). */
482 mode_t old_umask
= umask(0077);
483 int error
= bind(fd
, sun
, sun_len
) ? errno
: 0;
488 /* Creates a Unix domain socket in the given 'style' (either SOCK_DGRAM or
489 * SOCK_STREAM) that is bound to '*bind_path' (if 'bind_path' is non-null) and
490 * connected to '*connect_path' (if 'connect_path' is non-null). If 'nonblock'
491 * is true, the socket is made non-blocking.
493 * Returns the socket's fd if successful, otherwise a negative errno value. */
495 make_unix_socket(int style
, bool nonblock
,
496 const char *bind_path
, const char *connect_path
)
501 fd
= socket(PF_UNIX
, style
, 0);
506 /* Set nonblocking mode right away, if we want it. This prevents blocking
507 * in connect(), if connect_path != NULL. (In turn, that's a corner case:
508 * it will only happen if style is SOCK_STREAM or SOCK_SEQPACKET, and only
509 * if a backlog of un-accepted connections has built up in the kernel.) */
511 error
= set_nonblocking(fd
);
518 char linkname
[MAX_UN_LEN
+ 1];
519 struct sockaddr_un un
;
523 if (unlink(bind_path
) && errno
!= ENOENT
) {
524 VLOG_WARN("unlinking \"%s\": %s\n",
525 bind_path
, ovs_strerror(errno
));
527 fatal_signal_add_file_to_unlink(bind_path
);
529 error
= make_sockaddr_un(bind_path
, &un
, &un_len
, &dirfd
, linkname
);
531 error
= bind_unix_socket(fd
, (struct sockaddr
*) &un
, un_len
);
533 free_sockaddr_un(dirfd
, linkname
);
541 char linkname
[MAX_UN_LEN
+ 1];
542 struct sockaddr_un un
;
546 error
= make_sockaddr_un(connect_path
, &un
, &un_len
, &dirfd
, linkname
);
548 && connect(fd
, (struct sockaddr
*) &un
, un_len
)
549 && errno
!= EINPROGRESS
) {
552 free_sockaddr_un(dirfd
, linkname
);
562 if (error
== EAGAIN
) {
566 fatal_signal_unlink_file_now(bind_path
);
573 get_unix_name_len(socklen_t sun_len
)
575 return (sun_len
>= offsetof(struct sockaddr_un
, sun_path
)
576 ? sun_len
- offsetof(struct sockaddr_un
, sun_path
)
582 guess_netmask(ovs_be32 ip_
)
584 uint32_t ip
= ntohl(ip_
);
585 return ((ip
>> 31) == 0 ? htonl(0xff000000) /* Class A */
586 : (ip
>> 30) == 2 ? htonl(0xffff0000) /* Class B */
587 : (ip
>> 29) == 6 ? htonl(0xffffff00) /* Class C */
588 : htonl(0)); /* ??? */
591 /* This is like strsep() except:
593 * - The separator string is ":".
595 * - Square brackets [] quote ":" separators and are removed from the
598 parse_bracketed_token(char **pp
)
604 } else if (*p
== '\0') {
607 } else if (*p
== '[') {
609 char *end
= start
+ strcspn(start
, "]");
610 *pp
= (*end
== '\0' ? NULL
611 : end
[1] == ':' ? end
+ 2
617 char *end
= start
+ strcspn(start
, ":");
618 *pp
= *end
== '\0' ? NULL
: end
+ 1;
625 parse_sockaddr_components(struct sockaddr_storage
*ss
,
627 const char *port_s
, uint16_t default_port
,
630 struct sockaddr_in
*sin
= ALIGNED_CAST(struct sockaddr_in
*, ss
);
633 if (port_s
&& port_s
[0]) {
634 if (!str_to_int(port_s
, 10, &port
) || port
< 0 || port
> 65535) {
635 VLOG_ERR("%s: bad port number \"%s\"", s
, port_s
);
641 memset(ss
, 0, sizeof *ss
);
642 if (strchr(host_s
, ':')) {
643 struct sockaddr_in6
*sin6
644 = ALIGNED_CAST(struct sockaddr_in6
*, ss
);
646 sin6
->sin6_family
= AF_INET6
;
647 sin6
->sin6_port
= htons(port
);
648 if (!inet_pton(AF_INET6
, host_s
, sin6
->sin6_addr
.s6_addr
)) {
649 VLOG_ERR("%s: bad IPv6 address \"%s\"", s
, host_s
);
653 sin
->sin_family
= AF_INET
;
654 sin
->sin_port
= htons(port
);
655 if (!inet_pton(AF_INET
, host_s
, &sin
->sin_addr
.s_addr
)) {
656 VLOG_ERR("%s: bad IPv4 address \"%s\"", s
, host_s
);
664 memset(ss
, 0, sizeof *ss
);
668 /* Parses 'target', which should be a string in the format "<host>[:<port>]".
669 * <host>, which is required, may be an IPv4 address or an IPv6 address
670 * enclosed in square brackets. If 'default_port' is nonzero then <port> is
671 * optional and defaults to 'default_port'.
673 * On success, returns true and stores the parsed remote address into '*ss'.
674 * On failure, logs an error, stores zeros into '*ss', and returns false. */
676 inet_parse_active(const char *target_
, uint16_t default_port
,
677 struct sockaddr_storage
*ss
)
679 char *target
= xstrdup(target_
);
686 host
= parse_bracketed_token(&p
);
687 port
= parse_bracketed_token(&p
);
689 VLOG_ERR("%s: host must be specified", target_
);
691 } else if (!port
&& !default_port
) {
692 VLOG_ERR("%s: port must be specified", target_
);
695 ok
= parse_sockaddr_components(ss
, host
, port
, default_port
, target_
);
698 memset(ss
, 0, sizeof *ss
);
705 /* Opens a non-blocking IPv4 or IPv6 socket of the specified 'style' and
706 * connects to 'target', which should be a string in the format
707 * "<host>[:<port>]". <host>, which is required, may be an IPv4 address or an
708 * IPv6 address enclosed in square brackets. If 'default_port' is nonzero then
709 * <port> is optional and defaults to 'default_port'.
711 * 'style' should be SOCK_STREAM (for TCP) or SOCK_DGRAM (for UDP).
713 * On success, returns 0 (indicating connection complete) or EAGAIN (indicating
714 * connection in progress), in which case the new file descriptor is stored
715 * into '*fdp'. On failure, returns a positive errno value other than EAGAIN
716 * and stores -1 into '*fdp'.
718 * If 'ss' is non-null, then on success stores the target address into '*ss'.
720 * 'dscp' becomes the DSCP bits in the IP headers for the new connection. It
721 * should be in the range [0, 63] and will automatically be shifted to the
722 * appropriate place in the IP tos field. */
724 inet_open_active(int style
, const char *target
, uint16_t default_port
,
725 struct sockaddr_storage
*ssp
, int *fdp
, uint8_t dscp
)
727 struct sockaddr_storage ss
;
732 if (!inet_parse_active(target
, default_port
, &ss
)) {
733 error
= EAFNOSUPPORT
;
737 /* Create non-blocking socket. */
738 fd
= socket(ss
.ss_family
, style
, 0);
740 error
= sock_errno();
741 VLOG_ERR("%s: socket: %s", target
, sock_strerror(error
));
744 error
= set_nonblocking(fd
);
749 /* The dscp bits must be configured before connect() to ensure that the
750 * TOS field is set during the connection establishment. If set after
751 * connect(), the handshake SYN frames will be sent with a TOS of 0. */
752 error
= set_dscp(fd
, dscp
);
754 VLOG_ERR("%s: socket: %s", target
, sock_strerror(error
));
759 error
= connect(fd
, (struct sockaddr
*) &ss
, ss_length(&ss
)) == 0
762 if (error
== EINPROGRESS
764 || error
== WSAEALREADY
|| error
== WSAEWOULDBLOCK
771 if (error
&& error
!= EAGAIN
) {
773 memset(ssp
, 0, sizeof *ssp
);
788 /* Parses 'target', which should be a string in the format "[<port>][:<host>]":
790 * - If 'default_port' is -1, then <port> is required. Otherwise, if
791 * <port> is omitted, then 'default_port' is used instead.
793 * - If <port> (or 'default_port', if used) is 0, then no port is bound
794 * and the TCP/IP stack will select a port.
796 * - <host> is optional. If supplied, it may be an IPv4 address or an
797 * IPv6 address enclosed in square brackets. If omitted, the IP address
800 * If successful, stores the address into '*ss' and returns true; otherwise
801 * zeros '*ss' and returns false. */
803 inet_parse_passive(const char *target_
, int default_port
,
804 struct sockaddr_storage
*ss
)
806 char *target
= xstrdup(target_
);
813 port
= parse_bracketed_token(&p
);
814 host
= parse_bracketed_token(&p
);
815 if (!port
&& default_port
< 0) {
816 VLOG_ERR("%s: port must be specified", target_
);
819 ok
= parse_sockaddr_components(ss
, host
? host
: "0.0.0.0",
820 port
, default_port
, target_
);
823 memset(ss
, 0, sizeof *ss
);
830 /* Opens a non-blocking IPv4 or IPv6 socket of the specified 'style', binds to
831 * 'target', and listens for incoming connections. Parses 'target' in the same
832 * way as inet_parse_passive().
834 * 'style' should be SOCK_STREAM (for TCP) or SOCK_DGRAM (for UDP).
836 * For TCP, the socket will have SO_REUSEADDR turned on.
838 * On success, returns a non-negative file descriptor. On failure, returns a
839 * negative errno value.
841 * If 'ss' is non-null, then on success stores the bound address into '*ss'.
843 * 'dscp' becomes the DSCP bits in the IP headers for the new connection. It
844 * should be in the range [0, 63] and will automatically be shifted to the
845 * appropriate place in the IP tos field. */
847 inet_open_passive(int style
, const char *target
, int default_port
,
848 struct sockaddr_storage
*ssp
, uint8_t dscp
)
850 bool kernel_chooses_port
;
851 struct sockaddr_storage ss
;
853 unsigned int yes
= 1;
855 if (!inet_parse_passive(target
, default_port
, &ss
)) {
856 return -EAFNOSUPPORT
;
858 kernel_chooses_port
= ss_get_port(&ss
) == 0;
860 /* Create non-blocking socket, set SO_REUSEADDR. */
861 fd
= socket(ss
.ss_family
, style
, 0);
863 error
= sock_errno();
864 VLOG_ERR("%s: socket: %s", target
, sock_strerror(error
));
867 error
= set_nonblocking(fd
);
871 if (style
== SOCK_STREAM
872 && setsockopt(fd
, SOL_SOCKET
, SO_REUSEADDR
, &yes
, sizeof yes
) < 0) {
873 error
= sock_errno();
874 VLOG_ERR("%s: setsockopt(SO_REUSEADDR): %s",
875 target
, sock_strerror(error
));
880 if (bind(fd
, (struct sockaddr
*) &ss
, ss_length(&ss
)) < 0) {
881 error
= sock_errno();
882 VLOG_ERR("%s: bind: %s", target
, sock_strerror(error
));
886 /* The dscp bits must be configured before connect() to ensure that the TOS
887 * field is set during the connection establishment. If set after
888 * connect(), the handshake SYN frames will be sent with a TOS of 0. */
889 error
= set_dscp(fd
, dscp
);
891 VLOG_ERR("%s: socket: %s", target
, sock_strerror(error
));
896 if (style
== SOCK_STREAM
&& listen(fd
, 10) < 0) {
897 error
= sock_errno();
898 VLOG_ERR("%s: listen: %s", target
, sock_strerror(error
));
902 if (ssp
|| kernel_chooses_port
) {
903 socklen_t ss_len
= sizeof ss
;
904 if (getsockname(fd
, (struct sockaddr
*) &ss
, &ss_len
) < 0) {
905 error
= sock_errno();
906 VLOG_ERR("%s: getsockname: %s", target
, sock_strerror(error
));
909 if (kernel_chooses_port
) {
910 VLOG_INFO("%s: listening on port %"PRIu16
,
911 target
, ss_get_port(&ss
));
922 memset(ssp
, 0, sizeof *ssp
);
929 read_fully(int fd
, void *p_
, size_t size
, size_t *bytes_read
)
935 ssize_t retval
= read(fd
, p
, size
);
937 *bytes_read
+= retval
;
940 } else if (retval
== 0) {
942 } else if (errno
!= EINTR
) {
950 write_fully(int fd
, const void *p_
, size_t size
, size_t *bytes_written
)
952 const uint8_t *p
= p_
;
956 ssize_t retval
= write(fd
, p
, size
);
958 *bytes_written
+= retval
;
961 } else if (retval
== 0) {
962 VLOG_WARN("write returned 0");
964 } else if (errno
!= EINTR
) {
971 /* Given file name 'file_name', fsyncs the directory in which it is contained.
972 * Returns 0 if successful, otherwise a positive errno value. */
974 fsync_parent_dir(const char *file_name
)
981 dir
= dir_name(file_name
);
982 fd
= open(dir
, O_RDONLY
);
985 if (errno
== EINVAL
|| errno
== EROFS
) {
986 /* This directory does not support synchronization. Not
987 * really an error. */
990 VLOG_ERR("%s: fsync failed (%s)", dir
, ovs_strerror(error
));
996 VLOG_ERR("%s: open failed (%s)", dir
, ovs_strerror(error
));
1004 /* Obtains the modification time of the file named 'file_name' to the greatest
1005 * supported precision. If successful, stores the mtime in '*mtime' and
1006 * returns 0. On error, returns a positive errno value and stores zeros in
1009 get_mtime(const char *file_name
, struct timespec
*mtime
)
1013 if (!stat(file_name
, &s
)) {
1014 mtime
->tv_sec
= s
.st_mtime
;
1016 #if HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC
1017 mtime
->tv_nsec
= s
.st_mtim
.tv_nsec
;
1018 #elif HAVE_STRUCT_STAT_ST_MTIMENSEC
1019 mtime
->tv_nsec
= s
.st_mtimensec
;
1026 mtime
->tv_sec
= mtime
->tv_nsec
= 0;
1035 VLOG_FATAL("failed to create pipe (%s)", ovs_strerror(errno
));
1040 xpipe_nonblocking(int fds
[2])
1043 xset_nonblocking(fds
[0]);
1044 xset_nonblocking(fds
[1]);
1048 getsockopt_int(int fd
, int level
, int option
, const char *optname
, int *valuep
)
1050 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 10);
1056 if (getsockopt(fd
, level
, option
, &value
, &len
)) {
1057 error
= sock_errno();
1058 VLOG_ERR_RL(&rl
, "getsockopt(%s): %s", optname
, sock_strerror(error
));
1059 } else if (len
!= sizeof value
) {
1061 VLOG_ERR_RL(&rl
, "getsockopt(%s): value is %u bytes (expected %"PRIuSIZE
")",
1062 optname
, (unsigned int) len
, sizeof value
);
1067 *valuep
= error
? 0 : value
;
1072 describe_sockaddr(struct ds
*string
, int fd
,
1073 int (*getaddr
)(int, struct sockaddr
*, socklen_t
*))
1075 struct sockaddr_storage ss
;
1076 socklen_t len
= sizeof ss
;
1078 if (!getaddr(fd
, (struct sockaddr
*) &ss
, &len
)) {
1079 if (ss
.ss_family
== AF_INET
|| ss
.ss_family
== AF_INET6
) {
1080 char addrbuf
[SS_NTOP_BUFSIZE
];
1082 ds_put_format(string
, "%s:%"PRIu16
,
1083 ss_format_address(&ss
, addrbuf
, sizeof addrbuf
),
1086 } else if (ss
.ss_family
== AF_UNIX
) {
1087 struct sockaddr_un sun
;
1091 memcpy(&sun
, &ss
, sizeof sun
);
1092 maxlen
= len
- offsetof(struct sockaddr_un
, sun_path
);
1093 null
= memchr(sun
.sun_path
, '\0', maxlen
);
1094 ds_put_buffer(string
, sun
.sun_path
,
1095 null
? null
- sun
.sun_path
: maxlen
);
1099 else if (ss
.ss_family
== AF_NETLINK
) {
1102 /* SO_PROTOCOL was introduced in 2.6.32. Support it regardless of the version
1103 * of the Linux kernel headers in use at build time. */
1105 #define SO_PROTOCOL 38
1108 if (!getsockopt_int(fd
, SOL_SOCKET
, SO_PROTOCOL
, "SO_PROTOCOL",
1112 ds_put_cstr(string
, "NETLINK_ROUTE");
1115 case NETLINK_GENERIC
:
1116 ds_put_cstr(string
, "NETLINK_GENERIC");
1120 ds_put_format(string
, "AF_NETLINK family %d", protocol
);
1124 ds_put_cstr(string
, "AF_NETLINK");
1128 #if AF_PACKET && LINUX_DATAPATH
1129 else if (ss
.ss_family
== AF_PACKET
) {
1130 struct sockaddr_ll sll
;
1132 memcpy(&sll
, &ss
, sizeof sll
);
1133 ds_put_cstr(string
, "AF_PACKET");
1134 if (sll
.sll_ifindex
) {
1135 char name
[IFNAMSIZ
];
1137 if (if_indextoname(sll
.sll_ifindex
, name
)) {
1138 ds_put_format(string
, "(%s)", name
);
1140 ds_put_format(string
, "(ifindex=%d)", sll
.sll_ifindex
);
1143 if (sll
.sll_protocol
) {
1144 ds_put_format(string
, "(protocol=0x%"PRIu16
")",
1145 ntohs(sll
.sll_protocol
));
1149 else if (ss
.ss_family
== AF_UNSPEC
) {
1150 ds_put_cstr(string
, "AF_UNSPEC");
1152 ds_put_format(string
, "AF_%d", (int) ss
.ss_family
);
1158 #ifdef LINUX_DATAPATH
1160 put_fd_filename(struct ds
*string
, int fd
)
1166 linkname
= xasprintf("/proc/self/fd/%d", fd
);
1167 n
= readlink(linkname
, buf
, sizeof buf
);
1169 ds_put_char(string
, ' ');
1170 ds_put_buffer(string
, buf
, n
);
1171 if (n
> sizeof buf
) {
1172 ds_put_cstr(string
, "...");
1179 /* Returns a malloc()'d string describing 'fd', for use in logging. */
1188 if (fstat(fd
, &s
)) {
1189 ds_put_format(&string
, "fstat failed (%s)", ovs_strerror(errno
));
1190 } else if (S_ISSOCK(s
.st_mode
)) {
1191 describe_sockaddr(&string
, fd
, getsockname
);
1192 ds_put_cstr(&string
, "<->");
1193 describe_sockaddr(&string
, fd
, getpeername
);
1195 ds_put_cstr(&string
, (isatty(fd
) ? "tty"
1196 : S_ISDIR(s
.st_mode
) ? "directory"
1197 : S_ISCHR(s
.st_mode
) ? "character device"
1198 : S_ISBLK(s
.st_mode
) ? "block device"
1199 : S_ISREG(s
.st_mode
) ? "file"
1200 : S_ISFIFO(s
.st_mode
) ? "FIFO"
1201 : S_ISLNK(s
.st_mode
) ? "symbolic link"
1203 #ifdef LINUX_DATAPATH
1204 put_fd_filename(&string
, fd
);
1208 ds_put_format(&string
,"file descriptor");
1210 return ds_steal_cstr(&string
);
1214 /* Calls ioctl() on an AF_INET sock, passing the specified 'command' and
1215 * 'arg'. Returns 0 if successful, otherwise a positive errno value. */
1217 af_inet_ioctl(unsigned long int command
, const void *arg
)
1219 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
1222 if (ovsthread_once_start(&once
)) {
1223 sock
= socket(AF_INET
, SOCK_DGRAM
, 0);
1225 int error
= sock_errno();
1226 VLOG_ERR("failed to create inet socket: %s", sock_strerror(error
));
1229 ovsthread_once_done(&once
);
1232 return (sock
< 0 ? -sock
1233 : ioctl(sock
, command
, arg
) == -1 ? errno
1238 af_inet_ifreq_ioctl(const char *name
, struct ifreq
*ifr
, unsigned long int cmd
,
1239 const char *cmd_name
)
1243 ovs_strzcpy(ifr
->ifr_name
, name
, sizeof ifr
->ifr_name
);
1244 error
= af_inet_ioctl(cmd
, ifr
);
1246 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
1247 VLOG_DBG_RL(&rl
, "%s: ioctl(%s) failed: %s", name
, cmd_name
,
1248 ovs_strerror(error
));
1254 /* sockaddr_storage helpers. */
1256 /* Returns the IPv4 or IPv6 port in 'ss'. */
1258 ss_get_port(const struct sockaddr_storage
*ss
)
1260 if (ss
->ss_family
== AF_INET
) {
1261 const struct sockaddr_in
*sin
1262 = ALIGNED_CAST(const struct sockaddr_in
*, ss
);
1263 return ntohs(sin
->sin_port
);
1264 } else if (ss
->ss_family
== AF_INET6
) {
1265 const struct sockaddr_in6
*sin6
1266 = ALIGNED_CAST(const struct sockaddr_in6
*, ss
);
1267 return ntohs(sin6
->sin6_port
);
1273 /* Formats the IPv4 or IPv6 address in 'ss' into the 'bufsize' bytes in 'buf'.
1274 * If 'ss' is an IPv6 address, puts square brackets around the address.
1275 * 'bufsize' should be at least SS_NTOP_BUFSIZE.
1279 ss_format_address(const struct sockaddr_storage
*ss
,
1280 char *buf
, size_t bufsize
)
1282 ovs_assert(bufsize
>= SS_NTOP_BUFSIZE
);
1283 if (ss
->ss_family
== AF_INET
) {
1284 const struct sockaddr_in
*sin
1285 = ALIGNED_CAST(const struct sockaddr_in
*, ss
);
1287 snprintf(buf
, bufsize
, IP_FMT
, IP_ARGS(sin
->sin_addr
.s_addr
));
1288 } else if (ss
->ss_family
== AF_INET6
) {
1289 const struct sockaddr_in6
*sin6
1290 = ALIGNED_CAST(const struct sockaddr_in6
*, ss
);
1293 inet_ntop(AF_INET6
, sin6
->sin6_addr
.s6_addr
, buf
+ 1, bufsize
- 1);
1294 strcpy(strchr(buf
, '\0'), "]");
1303 ss_length(const struct sockaddr_storage
*ss
)
1305 switch (ss
->ss_family
) {
1307 return sizeof(struct sockaddr_in
);
1310 return sizeof(struct sockaddr_in6
);
1317 /* For Windows socket calls, 'errno' is not set. One has to call
1318 * WSAGetLastError() to get the error number and then pass it to
1319 * this function to get the correct error string.
1321 * ovs_strerror() calls strerror_r() and would not get the correct error
1322 * string for Windows sockets, but is good for POSIX. */
1324 sock_strerror(int error
)
1327 return ovs_format_message(error
);
1329 return ovs_strerror(error
);