1 /* Evaluate MSG_ZEROCOPY
3 * Send traffic between two processes over one of the supported
9 * - SOCK_DGRAM with UDP_CORK
11 * - SOCK_RAW with IP_HDRINCL
17 * Start this program on two connected hosts, one in send mode and
18 * the other with option '-r' to put it in receiver mode.
20 * If zerocopy mode ('-z') is enabled, the sender will verify that
21 * the kernel queues completions on the error queue for all zerocopy
27 #include <arpa/inet.h>
31 #include <linux/errqueue.h>
32 #include <linux/if_packet.h>
33 #include <linux/ipv6.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <net/ethernet.h>
38 #include <netinet/ip.h>
39 #include <netinet/ip6.h>
40 #include <netinet/tcp.h>
41 #include <netinet/udp.h>
49 #include <sys/ioctl.h>
50 #include <sys/socket.h>
53 #include <sys/types.h>
57 #ifndef SO_EE_ORIGIN_ZEROCOPY
58 #define SO_EE_ORIGIN_ZEROCOPY 5
62 #define SO_ZEROCOPY 60
65 #ifndef SO_EE_CODE_ZEROCOPY_COPIED
66 #define SO_EE_CODE_ZEROCOPY_COPIED 1
70 #define MSG_ZEROCOPY 0x4000000
74 static bool cfg_cork_mixed
;
75 static int cfg_cpu
= -1; /* default: pin to last cpu */
76 static int cfg_family
= PF_UNSPEC
;
77 static int cfg_ifindex
= 1;
78 static int cfg_payload_len
;
79 static int cfg_port
= 8000;
81 static int cfg_runtime_ms
= 4200;
82 static int cfg_verbose
;
83 static int cfg_waittime_ms
= 500;
84 static bool cfg_zerocopy
;
86 static socklen_t cfg_alen
;
87 static struct sockaddr_storage cfg_dst_addr
;
88 static struct sockaddr_storage cfg_src_addr
;
90 static char payload
[IP_MAXPACKET
];
91 static long packets
, bytes
, completions
, expected_completions
;
92 static int zerocopied
= -1;
93 static uint32_t next_completion
;
95 static unsigned long gettimeofday_ms(void)
99 gettimeofday(&tv
, NULL
);
100 return (tv
.tv_sec
* 1000) + (tv
.tv_usec
/ 1000);
103 static uint16_t get_ip_csum(const uint16_t *start
, int num_words
)
105 unsigned long sum
= 0;
108 for (i
= 0; i
< num_words
; i
++)
112 sum
= (sum
& 0xFFFF) + (sum
>> 16);
117 static int do_setcpu(int cpu
)
123 if (sched_setaffinity(0, sizeof(mask
), &mask
))
124 error(1, 0, "setaffinity %d", cpu
);
127 fprintf(stderr
, "cpu: %u\n", cpu
);
132 static void do_setsockopt(int fd
, int level
, int optname
, int val
)
134 if (setsockopt(fd
, level
, optname
, &val
, sizeof(val
)))
135 error(1, errno
, "setsockopt %d.%d: %d", level
, optname
, val
);
138 static int do_poll(int fd
, int events
)
147 ret
= poll(&pfd
, 1, cfg_waittime_ms
);
149 error(1, errno
, "poll");
151 return ret
&& (pfd
.revents
& events
);
154 static int do_accept(int fd
)
158 fd
= accept(fda
, NULL
, NULL
);
160 error(1, errno
, "accept");
162 error(1, errno
, "close listen sock");
167 static bool do_sendmsg(int fd
, struct msghdr
*msg
, bool do_zerocopy
)
169 int ret
, len
, i
, flags
;
172 for (i
= 0; i
< msg
->msg_iovlen
; i
++)
173 len
+= msg
->msg_iov
[i
].iov_len
;
175 flags
= MSG_DONTWAIT
;
177 flags
|= MSG_ZEROCOPY
;
179 ret
= sendmsg(fd
, msg
, flags
);
180 if (ret
== -1 && errno
== EAGAIN
)
183 error(1, errno
, "send");
184 if (cfg_verbose
&& ret
!= len
)
185 fprintf(stderr
, "send: ret=%u != %u\n", ret
, len
);
190 if (do_zerocopy
&& ret
)
191 expected_completions
++;
197 static void do_sendmsg_corked(int fd
, struct msghdr
*msg
)
199 bool do_zerocopy
= cfg_zerocopy
;
200 int i
, payload_len
, extra_len
;
202 /* split up the packet. for non-multiple, make first buffer longer */
203 payload_len
= cfg_payload_len
/ cfg_cork
;
204 extra_len
= cfg_payload_len
- (cfg_cork
* payload_len
);
206 do_setsockopt(fd
, IPPROTO_UDP
, UDP_CORK
, 1);
208 for (i
= 0; i
< cfg_cork
; i
++) {
210 /* in mixed-frags mode, alternate zerocopy and copy frags
211 * start with non-zerocopy, to ensure attach later works
214 do_zerocopy
= (i
& 1);
216 msg
->msg_iov
[0].iov_len
= payload_len
+ extra_len
;
219 do_sendmsg(fd
, msg
, do_zerocopy
);
222 do_setsockopt(fd
, IPPROTO_UDP
, UDP_CORK
, 0);
225 static int setup_iph(struct iphdr
*iph
, uint16_t payload_len
)
227 struct sockaddr_in
*daddr
= (void *) &cfg_dst_addr
;
228 struct sockaddr_in
*saddr
= (void *) &cfg_src_addr
;
230 memset(iph
, 0, sizeof(*iph
));
236 iph
->saddr
= saddr
->sin_addr
.s_addr
;
237 iph
->daddr
= daddr
->sin_addr
.s_addr
;
238 iph
->protocol
= IPPROTO_EGP
;
239 iph
->tot_len
= htons(sizeof(*iph
) + payload_len
);
240 iph
->check
= get_ip_csum((void *) iph
, iph
->ihl
<< 1);
245 static int setup_ip6h(struct ipv6hdr
*ip6h
, uint16_t payload_len
)
247 struct sockaddr_in6
*daddr
= (void *) &cfg_dst_addr
;
248 struct sockaddr_in6
*saddr
= (void *) &cfg_src_addr
;
250 memset(ip6h
, 0, sizeof(*ip6h
));
253 ip6h
->payload_len
= htons(payload_len
);
254 ip6h
->nexthdr
= IPPROTO_EGP
;
256 ip6h
->saddr
= saddr
->sin6_addr
;
257 ip6h
->daddr
= daddr
->sin6_addr
;
259 return sizeof(*ip6h
);
262 static void setup_sockaddr(int domain
, const char *str_addr
, void *sockaddr
)
264 struct sockaddr_in6
*addr6
= (void *) sockaddr
;
265 struct sockaddr_in
*addr4
= (void *) sockaddr
;
269 addr4
->sin_family
= AF_INET
;
270 addr4
->sin_port
= htons(cfg_port
);
271 if (inet_pton(AF_INET
, str_addr
, &(addr4
->sin_addr
)) != 1)
272 error(1, 0, "ipv4 parse error: %s", str_addr
);
275 addr6
->sin6_family
= AF_INET6
;
276 addr6
->sin6_port
= htons(cfg_port
);
277 if (inet_pton(AF_INET6
, str_addr
, &(addr6
->sin6_addr
)) != 1)
278 error(1, 0, "ipv6 parse error: %s", str_addr
);
281 error(1, 0, "illegal domain");
285 static int do_setup_tx(int domain
, int type
, int protocol
)
289 fd
= socket(domain
, type
, protocol
);
291 error(1, errno
, "socket t");
293 do_setsockopt(fd
, SOL_SOCKET
, SO_SNDBUF
, 1 << 21);
295 do_setsockopt(fd
, SOL_SOCKET
, SO_ZEROCOPY
, 1);
297 if (domain
!= PF_PACKET
)
298 if (connect(fd
, (void *) &cfg_dst_addr
, cfg_alen
))
299 error(1, errno
, "connect");
304 static bool do_recv_completion(int fd
)
306 struct sock_extended_err
*serr
;
307 struct msghdr msg
= {};
309 uint32_t hi
, lo
, range
;
313 msg
.msg_control
= control
;
314 msg
.msg_controllen
= sizeof(control
);
316 ret
= recvmsg(fd
, &msg
, MSG_ERRQUEUE
);
317 if (ret
== -1 && errno
== EAGAIN
)
320 error(1, errno
, "recvmsg notification");
321 if (msg
.msg_flags
& MSG_CTRUNC
)
322 error(1, errno
, "recvmsg notification: truncated");
324 cm
= CMSG_FIRSTHDR(&msg
);
326 error(1, 0, "cmsg: no cmsg");
327 if (!((cm
->cmsg_level
== SOL_IP
&& cm
->cmsg_type
== IP_RECVERR
) ||
328 (cm
->cmsg_level
== SOL_IPV6
&& cm
->cmsg_type
== IPV6_RECVERR
) ||
329 (cm
->cmsg_level
== SOL_PACKET
&& cm
->cmsg_type
== PACKET_TX_TIMESTAMP
)))
330 error(1, 0, "serr: wrong type: %d.%d",
331 cm
->cmsg_level
, cm
->cmsg_type
);
333 serr
= (void *) CMSG_DATA(cm
);
334 if (serr
->ee_origin
!= SO_EE_ORIGIN_ZEROCOPY
)
335 error(1, 0, "serr: wrong origin: %u", serr
->ee_origin
);
336 if (serr
->ee_errno
!= 0)
337 error(1, 0, "serr: wrong error code: %u", serr
->ee_errno
);
343 /* Detect notification gaps. These should not happen often, if at all.
344 * Gaps can occur due to drops, reordering and retransmissions.
346 if (lo
!= next_completion
)
347 fprintf(stderr
, "gap: %u..%u does not append to %u\n",
348 lo
, hi
, next_completion
);
349 next_completion
= hi
+ 1;
351 zerocopy
= !(serr
->ee_code
& SO_EE_CODE_ZEROCOPY_COPIED
);
352 if (zerocopied
== -1)
353 zerocopied
= zerocopy
;
354 else if (zerocopied
!= zerocopy
) {
355 fprintf(stderr
, "serr: inconsistent\n");
356 zerocopied
= zerocopy
;
359 if (cfg_verbose
>= 2)
360 fprintf(stderr
, "completed: %u (h=%u l=%u)\n",
363 completions
+= range
;
367 /* Read all outstanding messages on the errqueue */
368 static void do_recv_completions(int fd
)
370 while (do_recv_completion(fd
)) {}
373 /* Wait for all remaining completions on the errqueue */
374 static void do_recv_remaining_completions(int fd
)
376 int64_t tstop
= gettimeofday_ms() + cfg_waittime_ms
;
378 while (completions
< expected_completions
&&
379 gettimeofday_ms() < tstop
) {
380 if (do_poll(fd
, POLLERR
))
381 do_recv_completions(fd
);
384 if (completions
< expected_completions
)
385 fprintf(stderr
, "missing notifications: %lu < %lu\n",
386 completions
, expected_completions
);
389 static void do_tx(int domain
, int type
, int protocol
)
391 struct iovec iov
[3] = { {0} };
392 struct sockaddr_ll laddr
;
393 struct msghdr msg
= {0};
402 fd
= do_setup_tx(domain
, type
, protocol
);
404 if (domain
== PF_PACKET
) {
405 uint16_t proto
= cfg_family
== PF_INET
? ETH_P_IP
: ETH_P_IPV6
;
407 /* sock_raw passes ll header as data */
408 if (type
== SOCK_RAW
) {
409 memset(eth
.h_dest
, 0x06, ETH_ALEN
);
410 memset(eth
.h_source
, 0x02, ETH_ALEN
);
411 eth
.h_proto
= htons(proto
);
412 iov
[0].iov_base
= ð
;
413 iov
[0].iov_len
= sizeof(eth
);
417 /* both sock_raw and sock_dgram expect name */
418 memset(&laddr
, 0, sizeof(laddr
));
419 laddr
.sll_family
= AF_PACKET
;
420 laddr
.sll_ifindex
= cfg_ifindex
;
421 laddr
.sll_protocol
= htons(proto
);
422 laddr
.sll_halen
= ETH_ALEN
;
424 memset(laddr
.sll_addr
, 0x06, ETH_ALEN
);
426 msg
.msg_name
= &laddr
;
427 msg
.msg_namelen
= sizeof(laddr
);
430 /* packet and raw sockets with hdrincl must pass network header */
431 if (domain
== PF_PACKET
|| protocol
== IPPROTO_RAW
) {
432 if (cfg_family
== PF_INET
)
433 iov
[1].iov_len
= setup_iph(&nh
.iph
, cfg_payload_len
);
435 iov
[1].iov_len
= setup_ip6h(&nh
.ip6h
, cfg_payload_len
);
437 iov
[1].iov_base
= (void *) &nh
;
441 iov
[2].iov_base
= payload
;
442 iov
[2].iov_len
= cfg_payload_len
;
444 msg
.msg_iov
= &iov
[3 - msg
.msg_iovlen
];
446 tstop
= gettimeofday_ms() + cfg_runtime_ms
;
449 do_sendmsg_corked(fd
, &msg
);
451 do_sendmsg(fd
, &msg
, cfg_zerocopy
);
453 while (!do_poll(fd
, POLLOUT
)) {
455 do_recv_completions(fd
);
458 } while (gettimeofday_ms() < tstop
);
461 do_recv_remaining_completions(fd
);
464 error(1, errno
, "close");
466 fprintf(stderr
, "tx=%lu (%lu MB) txc=%lu zc=%c\n",
467 packets
, bytes
>> 20, completions
,
468 zerocopied
== 1 ? 'y' : 'n');
471 static int do_setup_rx(int domain
, int type
, int protocol
)
475 /* If tx over PF_PACKET, rx over PF_INET(6)/SOCK_RAW,
476 * to recv the only copy of the packet, not a clone
478 if (domain
== PF_PACKET
)
479 error(1, 0, "Use PF_INET/SOCK_RAW to read");
481 if (type
== SOCK_RAW
&& protocol
== IPPROTO_RAW
)
482 error(1, 0, "IPPROTO_RAW: not supported on Rx");
484 fd
= socket(domain
, type
, protocol
);
486 error(1, errno
, "socket r");
488 do_setsockopt(fd
, SOL_SOCKET
, SO_RCVBUF
, 1 << 21);
489 do_setsockopt(fd
, SOL_SOCKET
, SO_RCVLOWAT
, 1 << 16);
490 do_setsockopt(fd
, SOL_SOCKET
, SO_REUSEPORT
, 1);
492 if (bind(fd
, (void *) &cfg_dst_addr
, cfg_alen
))
493 error(1, errno
, "bind");
495 if (type
== SOCK_STREAM
) {
497 error(1, errno
, "listen");
504 /* Flush all outstanding bytes for the tcp receive queue */
505 static void do_flush_tcp(int fd
)
509 /* MSG_TRUNC flushes up to len bytes */
510 ret
= recv(fd
, NULL
, 1 << 21, MSG_TRUNC
| MSG_DONTWAIT
);
511 if (ret
== -1 && errno
== EAGAIN
)
514 error(1, errno
, "flush");
522 /* Flush all outstanding datagrams. Verify first few bytes of each. */
523 static void do_flush_datagram(int fd
, int type
)
528 /* MSG_TRUNC will return full datagram length */
529 ret
= recv(fd
, buf
, sizeof(buf
), MSG_DONTWAIT
| MSG_TRUNC
);
530 if (ret
== -1 && errno
== EAGAIN
)
533 /* raw ipv4 return with header, raw ipv6 without */
534 if (cfg_family
== PF_INET
&& type
== SOCK_RAW
) {
535 off
+= sizeof(struct iphdr
);
536 ret
-= sizeof(struct iphdr
);
540 error(1, errno
, "recv");
541 if (ret
!= cfg_payload_len
)
542 error(1, 0, "recv: ret=%u != %u", ret
, cfg_payload_len
);
543 if (ret
> sizeof(buf
) - off
)
544 ret
= sizeof(buf
) - off
;
545 if (memcmp(buf
+ off
, payload
, ret
))
546 error(1, 0, "recv: data mismatch");
549 bytes
+= cfg_payload_len
;
552 static void do_rx(int domain
, int type
, int protocol
)
557 fd
= do_setup_rx(domain
, type
, protocol
);
559 tstop
= gettimeofday_ms() + cfg_runtime_ms
;
561 if (type
== SOCK_STREAM
)
564 do_flush_datagram(fd
, type
);
568 } while (gettimeofday_ms() < tstop
);
571 error(1, errno
, "close");
573 fprintf(stderr
, "rx=%lu (%lu MB)\n", packets
, bytes
>> 20);
576 static void do_test(int domain
, int type
, int protocol
)
580 if (cfg_cork
&& (domain
== PF_PACKET
|| type
!= SOCK_DGRAM
))
581 error(1, 0, "can only cork udp sockets");
585 for (i
= 0; i
< IP_MAXPACKET
; i
++)
586 payload
[i
] = 'a' + (i
% 26);
589 do_rx(domain
, type
, protocol
);
591 do_tx(domain
, type
, protocol
);
594 static void usage(const char *filepath
)
596 error(1, 0, "Usage: %s [options] <test>", filepath
);
599 static void parse_opts(int argc
, char **argv
)
601 const int max_payload_len
= sizeof(payload
) -
602 sizeof(struct ipv6hdr
) -
603 sizeof(struct tcphdr
) -
604 40 /* max tcp options */;
607 cfg_payload_len
= max_payload_len
;
609 while ((c
= getopt(argc
, argv
, "46c:C:D:i:mp:rs:S:t:vz")) != -1) {
612 if (cfg_family
!= PF_UNSPEC
)
613 error(1, 0, "Pass one of -4 or -6");
614 cfg_family
= PF_INET
;
615 cfg_alen
= sizeof(struct sockaddr_in
);
618 if (cfg_family
!= PF_UNSPEC
)
619 error(1, 0, "Pass one of -4 or -6");
620 cfg_family
= PF_INET6
;
621 cfg_alen
= sizeof(struct sockaddr_in6
);
624 cfg_cork
= strtol(optarg
, NULL
, 0);
627 cfg_cpu
= strtol(optarg
, NULL
, 0);
630 setup_sockaddr(cfg_family
, optarg
, &cfg_dst_addr
);
633 cfg_ifindex
= if_nametoindex(optarg
);
634 if (cfg_ifindex
== 0)
635 error(1, errno
, "invalid iface: %s", optarg
);
638 cfg_cork_mixed
= true;
641 cfg_port
= htons(strtoul(optarg
, NULL
, 0));
647 cfg_payload_len
= strtoul(optarg
, NULL
, 0);
650 setup_sockaddr(cfg_family
, optarg
, &cfg_src_addr
);
653 cfg_runtime_ms
= 200 + strtoul(optarg
, NULL
, 10) * 1000;
664 if (cfg_payload_len
> max_payload_len
)
665 error(1, 0, "-s: payload exceeds max (%d)", max_payload_len
);
666 if (cfg_cork_mixed
&& (!cfg_zerocopy
|| !cfg_cork
))
667 error(1, 0, "-m: cork_mixed requires corking and zerocopy");
669 if (optind
!= argc
- 1)
673 int main(int argc
, char **argv
)
675 const char *cfg_test
;
677 parse_opts(argc
, argv
);
679 cfg_test
= argv
[argc
- 1];
681 if (!strcmp(cfg_test
, "packet"))
682 do_test(PF_PACKET
, SOCK_RAW
, 0);
683 else if (!strcmp(cfg_test
, "packet_dgram"))
684 do_test(PF_PACKET
, SOCK_DGRAM
, 0);
685 else if (!strcmp(cfg_test
, "raw"))
686 do_test(cfg_family
, SOCK_RAW
, IPPROTO_EGP
);
687 else if (!strcmp(cfg_test
, "raw_hdrincl"))
688 do_test(cfg_family
, SOCK_RAW
, IPPROTO_RAW
);
689 else if (!strcmp(cfg_test
, "tcp"))
690 do_test(cfg_family
, SOCK_STREAM
, 0);
691 else if (!strcmp(cfg_test
, "udp"))
692 do_test(cfg_family
, SOCK_DGRAM
, 0);
694 error(1, 0, "unknown cfg_test %s", cfg_test
);