4 * Copyright(c) 2016 Intel Corporation. All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 #include <rte_atomic.h>
35 #include <rte_branch_prediction.h>
36 #include <rte_common.h>
38 #include <rte_ethdev.h>
39 #include <rte_ethdev_vdev.h>
40 #include <rte_malloc.h>
42 #include <rte_kvargs.h>
45 #include <sys/types.h>
47 #include <sys/socket.h>
48 #include <sys/ioctl.h>
49 #include <sys/utsname.h>
56 #include <arpa/inet.h>
58 #include <linux/if_tun.h>
59 #include <linux/if_ether.h>
60 #include <linux/version.h>
63 #include <rte_eth_tap.h>
65 #include <tap_netlink.h>
66 #include <tap_tcmsgs.h>
68 /* Linux based path to the TUN device */
69 #define TUN_TAP_DEV_PATH "/dev/net/tun"
70 #define DEFAULT_TAP_NAME "dtap"
72 #define ETH_TAP_IFACE_ARG "iface"
73 #define ETH_TAP_SPEED_ARG "speed"
74 #define ETH_TAP_REMOTE_ARG "remote"
76 #define FLOWER_KERNEL_VERSION KERNEL_VERSION(4, 2, 0)
77 #define FLOWER_VLAN_KERNEL_VERSION KERNEL_VERSION(4, 9, 0)
79 static struct rte_vdev_driver pmd_tap_drv
;
81 static const char *valid_arguments
[] = {
90 static volatile uint32_t tap_trigger
; /* Rx trigger */
92 static struct rte_eth_link pmd_link
= {
93 .link_speed
= ETH_SPEED_NUM_10G
,
94 .link_duplex
= ETH_LINK_FULL_DUPLEX
,
95 .link_status
= ETH_LINK_DOWN
,
96 .link_autoneg
= ETH_LINK_SPEED_AUTONEG
100 tap_trigger_cb(int sig __rte_unused
)
102 /* Valid trigger values are nonzero */
103 tap_trigger
= (tap_trigger
+ 1) | 0x80000000;
106 /* Specifies on what netdevices the ioctl should be applied */
114 tap_ioctl(struct pmd_internals
*pmd
, unsigned long request
,
115 struct ifreq
*ifr
, int set
, enum ioctl_mode mode
);
117 static int tap_intr_handle_set(struct rte_eth_dev
*dev
, int set
);
119 /* Tun/Tap allocation routine
121 * name is the number of the interface to use, unless NULL to take the host
125 tun_alloc(struct pmd_internals
*pmd
, uint16_t qid
)
128 #ifdef IFF_MULTI_QUEUE
129 unsigned int features
;
133 memset(&ifr
, 0, sizeof(struct ifreq
));
136 * Do not set IFF_NO_PI as packet information header will be needed
137 * to check if a received packet has been truncated.
139 ifr
.ifr_flags
= IFF_TAP
;
140 snprintf(ifr
.ifr_name
, IFNAMSIZ
, "%s", pmd
->name
);
142 RTE_LOG(DEBUG
, PMD
, "ifr_name '%s'\n", ifr
.ifr_name
);
144 fd
= open(TUN_TAP_DEV_PATH
, O_RDWR
);
146 RTE_LOG(ERR
, PMD
, "Unable to create TAP interface");
150 #ifdef IFF_MULTI_QUEUE
151 /* Grab the TUN features to verify we can work multi-queue */
152 if (ioctl(fd
, TUNGETFEATURES
, &features
) < 0) {
153 RTE_LOG(ERR
, PMD
, "TAP unable to get TUN/TAP features\n");
156 RTE_LOG(DEBUG
, PMD
, " TAP Features %08x\n", features
);
158 if (features
& IFF_MULTI_QUEUE
) {
159 RTE_LOG(DEBUG
, PMD
, " Multi-queue support for %d queues\n",
160 RTE_PMD_TAP_MAX_QUEUES
);
161 ifr
.ifr_flags
|= IFF_MULTI_QUEUE
;
165 ifr
.ifr_flags
|= IFF_ONE_QUEUE
;
166 RTE_LOG(DEBUG
, PMD
, " Single queue only support\n");
169 /* Set the TUN/TAP configuration and set the name if needed */
170 if (ioctl(fd
, TUNSETIFF
, (void *)&ifr
) < 0) {
171 RTE_LOG(WARNING
, PMD
,
172 "Unable to set TUNSETIFF for %s\n",
178 /* Always set the file descriptor to non-blocking */
179 if (fcntl(fd
, F_SETFL
, O_NONBLOCK
) < 0) {
180 RTE_LOG(WARNING
, PMD
,
181 "Unable to set %s to nonblocking\n",
183 perror("F_SETFL, NONBLOCK");
187 /* Set up trigger to optimize empty Rx bursts */
191 int flags
= fcntl(fd
, F_GETFL
);
193 if (flags
== -1 || sigaction(SIGIO
, NULL
, &sa
) == -1)
195 if (sa
.sa_handler
!= tap_trigger_cb
) {
197 * Make sure SIGIO is not already taken. This is done
198 * as late as possible to leave the application a
199 * chance to set up its own signal handler first.
201 if (sa
.sa_handler
!= SIG_IGN
&&
202 sa
.sa_handler
!= SIG_DFL
) {
206 sa
= (struct sigaction
){
207 .sa_flags
= SA_RESTART
,
208 .sa_handler
= tap_trigger_cb
,
210 if (sigaction(SIGIO
, &sa
, NULL
) == -1)
213 /* Enable SIGIO on file descriptor */
214 fcntl(fd
, F_SETFL
, flags
| O_ASYNC
);
215 fcntl(fd
, F_SETOWN
, getpid());
218 /* Disable trigger globally in case of error */
220 RTE_LOG(WARNING
, PMD
, "Rx trigger disabled: %s\n",
228 * pmd->eth_addr contains the desired MAC, either from remote
229 * or from a random assignment. Sync it with the tap netdevice.
231 ifr
.ifr_hwaddr
.sa_family
= AF_LOCAL
;
232 rte_memcpy(ifr
.ifr_hwaddr
.sa_data
, &pmd
->eth_addr
,
234 if (tap_ioctl(pmd
, SIOCSIFHWADDR
, &ifr
, 0, LOCAL_ONLY
) < 0)
237 pmd
->if_index
= if_nametoindex(pmd
->name
);
238 if (!pmd
->if_index
) {
240 "Could not find ifindex for %s: rte_flow won't be usable.\n",
244 if (!pmd
->flower_support
)
246 if (qdisc_create_multiq(pmd
->nlsk_fd
, pmd
->if_index
) < 0) {
248 "Could not create multiq qdisc for %s: rte_flow won't be usable.\n",
252 if (qdisc_create_ingress(pmd
->nlsk_fd
, pmd
->if_index
) < 0) {
254 "Could not create multiq qdisc for %s: rte_flow won't be usable.\n",
258 if (pmd
->remote_if_index
) {
260 * Flush usually returns negative value because it tries
261 * to delete every QDISC (and on a running device, one
262 * QDISC at least is needed). Ignore negative return
265 qdisc_flush(pmd
->nlsk_fd
, pmd
->remote_if_index
);
266 if (qdisc_create_ingress(pmd
->nlsk_fd
,
267 pmd
->remote_if_index
) < 0)
269 LIST_INIT(&pmd
->implicit_flows
);
270 if (tap_flow_implicit_create(
271 pmd
, TAP_REMOTE_LOCAL_MAC
) < 0)
273 if (tap_flow_implicit_create(
274 pmd
, TAP_REMOTE_BROADCAST
) < 0)
276 if (tap_flow_implicit_create(
277 pmd
, TAP_REMOTE_BROADCASTV6
) < 0)
279 if (tap_flow_implicit_create(
280 pmd
, TAP_REMOTE_TX
) < 0)
289 "Could not set up remote flow rules for %s: remote disabled.\n",
291 pmd
->remote_if_index
= 0;
292 tap_flow_implicit_flush(pmd
, NULL
);
301 /* Callback to handle the rx burst of packets to the correct interface and
302 * file descriptor(s) in a multi-queue setup.
305 pmd_rx_burst(void *queue
, struct rte_mbuf
**bufs
, uint16_t nb_pkts
)
307 struct rx_queue
*rxq
= queue
;
309 unsigned long num_rx_bytes
= 0;
310 uint32_t trigger
= tap_trigger
;
312 if (trigger
== rxq
->trigger_seen
)
315 rxq
->trigger_seen
= trigger
;
316 rte_compiler_barrier();
317 for (num_rx
= 0; num_rx
< nb_pkts
; ) {
318 struct rte_mbuf
*mbuf
= rxq
->pool
;
319 struct rte_mbuf
*seg
= NULL
;
320 struct rte_mbuf
*new_tail
= NULL
;
321 uint16_t data_off
= rte_pktmbuf_headroom(mbuf
);
324 len
= readv(rxq
->fd
, *rxq
->iovecs
,
325 1 + (rxq
->rxmode
->enable_scatter
?
326 rxq
->nb_rx_desc
: 1));
327 if (len
< (int)sizeof(struct tun_pi
))
330 /* Packet couldn't fit in the provided mbuf */
331 if (unlikely(rxq
->pi
.flags
& TUN_PKT_STRIP
)) {
332 rxq
->stats
.ierrors
++;
336 len
-= sizeof(struct tun_pi
);
339 mbuf
->port
= rxq
->in_port
;
341 struct rte_mbuf
*buf
= rte_pktmbuf_alloc(rxq
->mp
);
343 if (unlikely(!buf
)) {
344 rxq
->stats
.rx_nombuf
++;
345 /* No new buf has been allocated: do nothing */
346 if (!new_tail
|| !seg
)
350 rte_pktmbuf_free(mbuf
);
354 seg
= seg
? seg
->next
: mbuf
;
355 if (rxq
->pool
== mbuf
)
358 new_tail
->next
= buf
;
360 new_tail
->next
= seg
->next
;
362 /* iovecs[0] is reserved for packet info (pi) */
363 (*rxq
->iovecs
)[mbuf
->nb_segs
].iov_len
=
364 buf
->buf_len
- data_off
;
365 (*rxq
->iovecs
)[mbuf
->nb_segs
].iov_base
=
366 (char *)buf
->buf_addr
+ data_off
;
368 seg
->data_len
= RTE_MIN(seg
->buf_len
- data_off
, len
);
369 seg
->data_off
= data_off
;
371 len
-= seg
->data_len
;
375 /* First segment has headroom, not the others */
379 mbuf
->packet_type
= rte_net_get_ptype(mbuf
, NULL
,
382 /* account for the receive frame */
383 bufs
[num_rx
++] = mbuf
;
384 num_rx_bytes
+= mbuf
->pkt_len
;
387 rxq
->stats
.ipackets
+= num_rx
;
388 rxq
->stats
.ibytes
+= num_rx_bytes
;
393 /* Callback to handle sending packets from the tap interface
396 pmd_tx_burst(void *queue
, struct rte_mbuf
**bufs
, uint16_t nb_pkts
)
398 struct tx_queue
*txq
= queue
;
400 unsigned long num_tx_bytes
= 0;
404 if (unlikely(nb_pkts
== 0))
407 max_size
= *txq
->mtu
+ (ETHER_HDR_LEN
+ ETHER_CRC_LEN
+ 4);
408 for (i
= 0; i
< nb_pkts
; i
++) {
409 struct rte_mbuf
*mbuf
= bufs
[num_tx
];
410 struct iovec iovecs
[mbuf
->nb_segs
+ 1];
411 struct tun_pi pi
= { .flags
= 0 };
412 struct rte_mbuf
*seg
= mbuf
;
416 /* stats.errs will be incremented */
417 if (rte_pktmbuf_pkt_len(mbuf
) > max_size
)
420 iovecs
[0].iov_base
= &pi
;
421 iovecs
[0].iov_len
= sizeof(pi
);
422 for (j
= 1; j
<= mbuf
->nb_segs
; j
++) {
423 iovecs
[j
].iov_len
= rte_pktmbuf_data_len(seg
);
425 rte_pktmbuf_mtod(seg
, void *);
428 /* copy the tx frame data */
429 n
= writev(txq
->fd
, iovecs
, mbuf
->nb_segs
+ 1);
434 num_tx_bytes
+= mbuf
->pkt_len
;
435 rte_pktmbuf_free(mbuf
);
438 txq
->stats
.opackets
+= num_tx
;
439 txq
->stats
.errs
+= nb_pkts
- num_tx
;
440 txq
->stats
.obytes
+= num_tx_bytes
;
446 tap_ioctl(struct pmd_internals
*pmd
, unsigned long request
,
447 struct ifreq
*ifr
, int set
, enum ioctl_mode mode
)
449 short req_flags
= ifr
->ifr_flags
;
450 int remote
= pmd
->remote_if_index
&&
451 (mode
== REMOTE_ONLY
|| mode
== LOCAL_AND_REMOTE
);
453 if (!pmd
->remote_if_index
&& mode
== REMOTE_ONLY
)
456 * If there is a remote netdevice, apply ioctl on it, then apply it on
461 snprintf(ifr
->ifr_name
, IFNAMSIZ
, "%s", pmd
->remote_iface
);
462 else if (mode
== LOCAL_ONLY
|| mode
== LOCAL_AND_REMOTE
)
463 snprintf(ifr
->ifr_name
, IFNAMSIZ
, "%s", pmd
->name
);
466 /* fetch current flags to leave other flags untouched */
467 if (ioctl(pmd
->ioctl_sock
, SIOCGIFFLAGS
, ifr
) < 0)
470 ifr
->ifr_flags
|= req_flags
;
472 ifr
->ifr_flags
&= ~req_flags
;
480 RTE_LOG(WARNING
, PMD
, "%s: ioctl() called with wrong arg\n",
484 if (ioctl(pmd
->ioctl_sock
, request
, ifr
) < 0)
486 if (remote
-- && mode
== LOCAL_AND_REMOTE
)
491 RTE_LOG(ERR
, PMD
, "%s: ioctl(%lu) failed with error: %s\n",
492 ifr
->ifr_name
, request
, strerror(errno
));
497 tap_link_set_down(struct rte_eth_dev
*dev
)
499 struct pmd_internals
*pmd
= dev
->data
->dev_private
;
500 struct ifreq ifr
= { .ifr_flags
= IFF_UP
};
502 dev
->data
->dev_link
.link_status
= ETH_LINK_DOWN
;
503 return tap_ioctl(pmd
, SIOCSIFFLAGS
, &ifr
, 0, LOCAL_AND_REMOTE
);
507 tap_link_set_up(struct rte_eth_dev
*dev
)
509 struct pmd_internals
*pmd
= dev
->data
->dev_private
;
510 struct ifreq ifr
= { .ifr_flags
= IFF_UP
};
512 dev
->data
->dev_link
.link_status
= ETH_LINK_UP
;
513 return tap_ioctl(pmd
, SIOCSIFFLAGS
, &ifr
, 1, LOCAL_AND_REMOTE
);
/*
 * Device start hook: arm the LSC interrupt handler (netlink based),
 * then bring the link up.
 *
 * Returns 0 on success, a negative value otherwise.
 */
static int
tap_dev_start(struct rte_eth_dev *dev)
{
	int err;

	err = tap_intr_handle_set(dev, 1);
	if (err)
		return err;
	return tap_link_set_up(dev);
}
/* This function gets called when the current port gets stopped.
 * It disarms the LSC interrupt handler and brings the link down.
 */
static void
tap_dev_stop(struct rte_eth_dev *dev)
{
	tap_intr_handle_set(dev, 0);
	tap_link_set_down(dev);
}
537 tap_dev_configure(struct rte_eth_dev
*dev __rte_unused
)
543 tap_dev_speed_capa(void)
545 uint32_t speed
= pmd_link
.link_speed
;
548 if (speed
>= ETH_SPEED_NUM_10M
)
549 capa
|= ETH_LINK_SPEED_10M
;
550 if (speed
>= ETH_SPEED_NUM_100M
)
551 capa
|= ETH_LINK_SPEED_100M
;
552 if (speed
>= ETH_SPEED_NUM_1G
)
553 capa
|= ETH_LINK_SPEED_1G
;
554 if (speed
>= ETH_SPEED_NUM_5G
)
555 capa
|= ETH_LINK_SPEED_2_5G
;
556 if (speed
>= ETH_SPEED_NUM_5G
)
557 capa
|= ETH_LINK_SPEED_5G
;
558 if (speed
>= ETH_SPEED_NUM_10G
)
559 capa
|= ETH_LINK_SPEED_10G
;
560 if (speed
>= ETH_SPEED_NUM_20G
)
561 capa
|= ETH_LINK_SPEED_20G
;
562 if (speed
>= ETH_SPEED_NUM_25G
)
563 capa
|= ETH_LINK_SPEED_25G
;
564 if (speed
>= ETH_SPEED_NUM_40G
)
565 capa
|= ETH_LINK_SPEED_40G
;
566 if (speed
>= ETH_SPEED_NUM_50G
)
567 capa
|= ETH_LINK_SPEED_50G
;
568 if (speed
>= ETH_SPEED_NUM_56G
)
569 capa
|= ETH_LINK_SPEED_56G
;
570 if (speed
>= ETH_SPEED_NUM_100G
)
571 capa
|= ETH_LINK_SPEED_100G
;
577 tap_dev_info(struct rte_eth_dev
*dev
, struct rte_eth_dev_info
*dev_info
)
579 struct pmd_internals
*internals
= dev
->data
->dev_private
;
581 dev_info
->if_index
= internals
->if_index
;
582 dev_info
->max_mac_addrs
= 1;
583 dev_info
->max_rx_pktlen
= (uint32_t)ETHER_MAX_VLAN_FRAME_LEN
;
584 dev_info
->max_rx_queues
= internals
->nb_queues
;
585 dev_info
->max_tx_queues
= internals
->nb_queues
;
586 dev_info
->min_rx_bufsize
= 0;
587 dev_info
->pci_dev
= NULL
;
588 dev_info
->speed_capa
= tap_dev_speed_capa();
592 tap_stats_get(struct rte_eth_dev
*dev
, struct rte_eth_stats
*tap_stats
)
594 unsigned int i
, imax
;
595 unsigned long rx_total
= 0, tx_total
= 0, tx_err_total
= 0;
596 unsigned long rx_bytes_total
= 0, tx_bytes_total
= 0;
597 unsigned long rx_nombuf
= 0, ierrors
= 0;
598 const struct pmd_internals
*pmd
= dev
->data
->dev_private
;
600 imax
= (pmd
->nb_queues
< RTE_ETHDEV_QUEUE_STAT_CNTRS
) ?
601 pmd
->nb_queues
: RTE_ETHDEV_QUEUE_STAT_CNTRS
;
603 for (i
= 0; i
< imax
; i
++) {
604 tap_stats
->q_ipackets
[i
] = pmd
->rxq
[i
].stats
.ipackets
;
605 tap_stats
->q_ibytes
[i
] = pmd
->rxq
[i
].stats
.ibytes
;
606 rx_total
+= tap_stats
->q_ipackets
[i
];
607 rx_bytes_total
+= tap_stats
->q_ibytes
[i
];
608 rx_nombuf
+= pmd
->rxq
[i
].stats
.rx_nombuf
;
609 ierrors
+= pmd
->rxq
[i
].stats
.ierrors
;
611 tap_stats
->q_opackets
[i
] = pmd
->txq
[i
].stats
.opackets
;
612 tap_stats
->q_errors
[i
] = pmd
->txq
[i
].stats
.errs
;
613 tap_stats
->q_obytes
[i
] = pmd
->txq
[i
].stats
.obytes
;
614 tx_total
+= tap_stats
->q_opackets
[i
];
615 tx_err_total
+= tap_stats
->q_errors
[i
];
616 tx_bytes_total
+= tap_stats
->q_obytes
[i
];
619 tap_stats
->ipackets
= rx_total
;
620 tap_stats
->ibytes
= rx_bytes_total
;
621 tap_stats
->ierrors
= ierrors
;
622 tap_stats
->rx_nombuf
= rx_nombuf
;
623 tap_stats
->opackets
= tx_total
;
624 tap_stats
->oerrors
= tx_err_total
;
625 tap_stats
->obytes
= tx_bytes_total
;
629 tap_stats_reset(struct rte_eth_dev
*dev
)
632 struct pmd_internals
*pmd
= dev
->data
->dev_private
;
634 for (i
= 0; i
< pmd
->nb_queues
; i
++) {
635 pmd
->rxq
[i
].stats
.ipackets
= 0;
636 pmd
->rxq
[i
].stats
.ibytes
= 0;
637 pmd
->rxq
[i
].stats
.ierrors
= 0;
638 pmd
->rxq
[i
].stats
.rx_nombuf
= 0;
640 pmd
->txq
[i
].stats
.opackets
= 0;
641 pmd
->txq
[i
].stats
.errs
= 0;
642 pmd
->txq
[i
].stats
.obytes
= 0;
647 tap_dev_close(struct rte_eth_dev
*dev __rte_unused
)
650 struct pmd_internals
*internals
= dev
->data
->dev_private
;
652 tap_link_set_down(dev
);
653 tap_flow_flush(dev
, NULL
);
654 tap_flow_implicit_flush(internals
, NULL
);
656 for (i
= 0; i
< internals
->nb_queues
; i
++) {
657 if (internals
->rxq
[i
].fd
!= -1)
658 close(internals
->rxq
[i
].fd
);
659 internals
->rxq
[i
].fd
= -1;
660 internals
->txq
[i
].fd
= -1;
665 tap_rx_queue_release(void *queue
)
667 struct rx_queue
*rxq
= queue
;
669 if (rxq
&& (rxq
->fd
> 0)) {
672 rte_pktmbuf_free(rxq
->pool
);
673 rte_free(rxq
->iovecs
);
680 tap_tx_queue_release(void *queue
)
682 struct tx_queue
*txq
= queue
;
684 if (txq
&& (txq
->fd
> 0)) {
691 tap_link_update(struct rte_eth_dev
*dev
, int wait_to_complete __rte_unused
)
693 struct rte_eth_link
*dev_link
= &dev
->data
->dev_link
;
694 struct pmd_internals
*pmd
= dev
->data
->dev_private
;
695 struct ifreq ifr
= { .ifr_flags
= 0 };
697 if (pmd
->remote_if_index
) {
698 tap_ioctl(pmd
, SIOCGIFFLAGS
, &ifr
, 0, REMOTE_ONLY
);
699 if (!(ifr
.ifr_flags
& IFF_UP
) ||
700 !(ifr
.ifr_flags
& IFF_RUNNING
)) {
701 dev_link
->link_status
= ETH_LINK_DOWN
;
705 tap_ioctl(pmd
, SIOCGIFFLAGS
, &ifr
, 0, LOCAL_ONLY
);
706 dev_link
->link_status
=
707 ((ifr
.ifr_flags
& IFF_UP
) && (ifr
.ifr_flags
& IFF_RUNNING
) ?
714 tap_promisc_enable(struct rte_eth_dev
*dev
)
716 struct pmd_internals
*pmd
= dev
->data
->dev_private
;
717 struct ifreq ifr
= { .ifr_flags
= IFF_PROMISC
};
719 dev
->data
->promiscuous
= 1;
720 tap_ioctl(pmd
, SIOCSIFFLAGS
, &ifr
, 1, LOCAL_AND_REMOTE
);
721 if (pmd
->remote_if_index
)
722 tap_flow_implicit_create(pmd
, TAP_REMOTE_PROMISC
);
726 tap_promisc_disable(struct rte_eth_dev
*dev
)
728 struct pmd_internals
*pmd
= dev
->data
->dev_private
;
729 struct ifreq ifr
= { .ifr_flags
= IFF_PROMISC
};
731 dev
->data
->promiscuous
= 0;
732 tap_ioctl(pmd
, SIOCSIFFLAGS
, &ifr
, 0, LOCAL_AND_REMOTE
);
733 if (pmd
->remote_if_index
)
734 tap_flow_implicit_destroy(pmd
, TAP_REMOTE_PROMISC
);
738 tap_allmulti_enable(struct rte_eth_dev
*dev
)
740 struct pmd_internals
*pmd
= dev
->data
->dev_private
;
741 struct ifreq ifr
= { .ifr_flags
= IFF_ALLMULTI
};
743 dev
->data
->all_multicast
= 1;
744 tap_ioctl(pmd
, SIOCSIFFLAGS
, &ifr
, 1, LOCAL_AND_REMOTE
);
745 if (pmd
->remote_if_index
)
746 tap_flow_implicit_create(pmd
, TAP_REMOTE_ALLMULTI
);
750 tap_allmulti_disable(struct rte_eth_dev
*dev
)
752 struct pmd_internals
*pmd
= dev
->data
->dev_private
;
753 struct ifreq ifr
= { .ifr_flags
= IFF_ALLMULTI
};
755 dev
->data
->all_multicast
= 0;
756 tap_ioctl(pmd
, SIOCSIFFLAGS
, &ifr
, 0, LOCAL_AND_REMOTE
);
757 if (pmd
->remote_if_index
)
758 tap_flow_implicit_destroy(pmd
, TAP_REMOTE_ALLMULTI
);
763 tap_mac_set(struct rte_eth_dev
*dev
, struct ether_addr
*mac_addr
)
765 struct pmd_internals
*pmd
= dev
->data
->dev_private
;
768 if (is_zero_ether_addr(mac_addr
)) {
769 RTE_LOG(ERR
, PMD
, "%s: can't set an empty MAC address\n",
773 /* Check the actual current MAC address on the tap netdevice */
774 if (tap_ioctl(pmd
, SIOCGIFHWADDR
, &ifr
, 0, LOCAL_ONLY
) != 0) {
776 "%s: couldn't check current tap MAC address\n",
780 if (is_same_ether_addr((struct ether_addr
*)&ifr
.ifr_hwaddr
.sa_data
,
784 ifr
.ifr_hwaddr
.sa_family
= AF_LOCAL
;
785 rte_memcpy(ifr
.ifr_hwaddr
.sa_data
, mac_addr
, ETHER_ADDR_LEN
);
786 if (tap_ioctl(pmd
, SIOCSIFHWADDR
, &ifr
, 1, LOCAL_AND_REMOTE
) < 0)
788 rte_memcpy(&pmd
->eth_addr
, mac_addr
, ETHER_ADDR_LEN
);
789 if (pmd
->remote_if_index
) {
790 /* Replace MAC redirection rule after a MAC change */
791 if (tap_flow_implicit_destroy(pmd
, TAP_REMOTE_LOCAL_MAC
) < 0) {
793 "%s: Couldn't delete MAC redirection rule\n",
797 if (tap_flow_implicit_create(pmd
, TAP_REMOTE_LOCAL_MAC
) < 0)
799 "%s: Couldn't add MAC redirection rule\n",
805 tap_setup_queue(struct rte_eth_dev
*dev
,
806 struct pmd_internals
*internals
,
809 struct pmd_internals
*pmd
= dev
->data
->dev_private
;
810 struct rx_queue
*rx
= &internals
->rxq
[qid
];
811 struct tx_queue
*tx
= &internals
->txq
[qid
];
818 RTE_LOG(INFO
, PMD
, "Add queue to TAP %s for qid %d\n",
820 fd
= tun_alloc(pmd
, qid
);
822 RTE_LOG(ERR
, PMD
, "tun_alloc(%s, %d) failed\n",
829 ifr
.ifr_mtu
= dev
->data
->mtu
;
830 if (tap_ioctl(pmd
, SIOCSIFMTU
, &ifr
, 1,
831 LOCAL_AND_REMOTE
) < 0) {
841 tx
->mtu
= &dev
->data
->mtu
;
842 rx
->rxmode
= &dev
->data
->dev_conf
.rxmode
;
848 rx_setup_queue(struct rte_eth_dev
*dev
,
849 struct pmd_internals
*internals
,
852 dev
->data
->rx_queues
[qid
] = &internals
->rxq
[qid
];
854 return tap_setup_queue(dev
, internals
, qid
);
858 tx_setup_queue(struct rte_eth_dev
*dev
,
859 struct pmd_internals
*internals
,
862 dev
->data
->tx_queues
[qid
] = &internals
->txq
[qid
];
864 return tap_setup_queue(dev
, internals
, qid
);
868 tap_rx_queue_setup(struct rte_eth_dev
*dev
,
869 uint16_t rx_queue_id
,
871 unsigned int socket_id
,
872 const struct rte_eth_rxconf
*rx_conf __rte_unused
,
873 struct rte_mempool
*mp
)
875 struct pmd_internals
*internals
= dev
->data
->dev_private
;
876 struct rx_queue
*rxq
= &internals
->rxq
[rx_queue_id
];
877 struct rte_mbuf
**tmp
= &rxq
->pool
;
878 long iov_max
= sysconf(_SC_IOV_MAX
);
879 uint16_t nb_desc
= RTE_MIN(nb_rx_desc
, iov_max
- 1);
880 struct iovec (*iovecs
)[nb_desc
+ 1];
881 int data_off
= RTE_PKTMBUF_HEADROOM
;
886 if ((rx_queue_id
>= internals
->nb_queues
) || !mp
) {
887 RTE_LOG(WARNING
, PMD
,
888 "nb_queues %d too small or mempool NULL\n",
889 internals
->nb_queues
);
894 rxq
->trigger_seen
= 1; /* force initial burst */
895 rxq
->in_port
= dev
->data
->port_id
;
896 rxq
->nb_rx_desc
= nb_desc
;
897 iovecs
= rte_zmalloc_socket(dev
->data
->name
, sizeof(*iovecs
), 0,
900 RTE_LOG(WARNING
, PMD
,
901 "%s: Couldn't allocate %d RX descriptors\n",
902 dev
->data
->name
, nb_desc
);
905 rxq
->iovecs
= iovecs
;
907 fd
= rx_setup_queue(dev
, internals
, rx_queue_id
);
913 (*rxq
->iovecs
)[0].iov_len
= sizeof(struct tun_pi
);
914 (*rxq
->iovecs
)[0].iov_base
= &rxq
->pi
;
916 for (i
= 1; i
<= nb_desc
; i
++) {
917 *tmp
= rte_pktmbuf_alloc(rxq
->mp
);
919 RTE_LOG(WARNING
, PMD
,
920 "%s: couldn't allocate memory for queue %d\n",
921 dev
->data
->name
, rx_queue_id
);
925 (*rxq
->iovecs
)[i
].iov_len
= (*tmp
)->buf_len
- data_off
;
926 (*rxq
->iovecs
)[i
].iov_base
=
927 (char *)(*tmp
)->buf_addr
+ data_off
;
932 RTE_LOG(DEBUG
, PMD
, " RX TAP device name %s, qid %d on fd %d\n",
933 internals
->name
, rx_queue_id
, internals
->rxq
[rx_queue_id
].fd
);
938 rte_pktmbuf_free(rxq
->pool
);
940 rte_free(rxq
->iovecs
);
946 tap_tx_queue_setup(struct rte_eth_dev
*dev
,
947 uint16_t tx_queue_id
,
948 uint16_t nb_tx_desc __rte_unused
,
949 unsigned int socket_id __rte_unused
,
950 const struct rte_eth_txconf
*tx_conf __rte_unused
)
952 struct pmd_internals
*internals
= dev
->data
->dev_private
;
955 if (tx_queue_id
>= internals
->nb_queues
)
958 ret
= tx_setup_queue(dev
, internals
, tx_queue_id
);
962 RTE_LOG(DEBUG
, PMD
, " TX TAP device name %s, qid %d on fd %d\n",
963 internals
->name
, tx_queue_id
, internals
->txq
[tx_queue_id
].fd
);
969 tap_mtu_set(struct rte_eth_dev
*dev
, uint16_t mtu
)
971 struct pmd_internals
*pmd
= dev
->data
->dev_private
;
972 struct ifreq ifr
= { .ifr_mtu
= mtu
};
975 err
= tap_ioctl(pmd
, SIOCSIFMTU
, &ifr
, 1, LOCAL_AND_REMOTE
);
977 dev
->data
->mtu
= mtu
;
983 tap_set_mc_addr_list(struct rte_eth_dev
*dev __rte_unused
,
984 struct ether_addr
*mc_addr_set __rte_unused
,
985 uint32_t nb_mc_addr __rte_unused
)
988 * Nothing to do actually: the tap has no filtering whatsoever, every
989 * packet is received.
995 tap_nl_msg_handler(struct nlmsghdr
*nh
, void *arg
)
997 struct rte_eth_dev
*dev
= arg
;
998 struct pmd_internals
*pmd
= dev
->data
->dev_private
;
999 struct ifinfomsg
*info
= NLMSG_DATA(nh
);
1001 if (nh
->nlmsg_type
!= RTM_NEWLINK
||
1002 (info
->ifi_index
!= pmd
->if_index
&&
1003 info
->ifi_index
!= pmd
->remote_if_index
))
1005 return tap_link_update(dev
, 0);
1009 tap_dev_intr_handler(void *cb_arg
)
1011 struct rte_eth_dev
*dev
= cb_arg
;
1012 struct pmd_internals
*pmd
= dev
->data
->dev_private
;
1014 nl_recv(pmd
->intr_handle
.fd
, tap_nl_msg_handler
, dev
);
1018 tap_intr_handle_set(struct rte_eth_dev
*dev
, int set
)
1020 struct pmd_internals
*pmd
= dev
->data
->dev_private
;
1022 /* In any case, disable interrupt if the conf is no longer there. */
1023 if (!dev
->data
->dev_conf
.intr_conf
.lsc
) {
1024 if (pmd
->intr_handle
.fd
!= -1)
1025 nl_final(pmd
->intr_handle
.fd
);
1026 rte_intr_callback_unregister(
1027 &pmd
->intr_handle
, tap_dev_intr_handler
, dev
);
1031 pmd
->intr_handle
.fd
= nl_init(RTMGRP_LINK
);
1032 if (unlikely(pmd
->intr_handle
.fd
== -1))
1034 return rte_intr_callback_register(
1035 &pmd
->intr_handle
, tap_dev_intr_handler
, dev
);
1037 nl_final(pmd
->intr_handle
.fd
);
1038 return rte_intr_callback_unregister(&pmd
->intr_handle
,
1039 tap_dev_intr_handler
, dev
);
1042 static const uint32_t*
1043 tap_dev_supported_ptypes_get(struct rte_eth_dev
*dev __rte_unused
)
1045 static const uint32_t ptypes
[] = {
1046 RTE_PTYPE_INNER_L2_ETHER
,
1047 RTE_PTYPE_INNER_L2_ETHER_VLAN
,
1048 RTE_PTYPE_INNER_L2_ETHER_QINQ
,
1049 RTE_PTYPE_INNER_L3_IPV4
,
1050 RTE_PTYPE_INNER_L3_IPV4_EXT
,
1051 RTE_PTYPE_INNER_L3_IPV6
,
1052 RTE_PTYPE_INNER_L3_IPV6_EXT
,
1053 RTE_PTYPE_INNER_L4_FRAG
,
1054 RTE_PTYPE_INNER_L4_UDP
,
1055 RTE_PTYPE_INNER_L4_TCP
,
1056 RTE_PTYPE_INNER_L4_SCTP
,
1058 RTE_PTYPE_L2_ETHER_VLAN
,
1059 RTE_PTYPE_L2_ETHER_QINQ
,
1061 RTE_PTYPE_L3_IPV4_EXT
,
1062 RTE_PTYPE_L3_IPV6_EXT
,
1074 tap_flow_ctrl_get(struct rte_eth_dev
*dev __rte_unused
,
1075 struct rte_eth_fc_conf
*fc_conf
)
1077 fc_conf
->mode
= RTE_FC_NONE
;
1082 tap_flow_ctrl_set(struct rte_eth_dev
*dev __rte_unused
,
1083 struct rte_eth_fc_conf
*fc_conf
)
1085 if (fc_conf
->mode
!= RTE_FC_NONE
)
1090 static const struct eth_dev_ops ops
= {
1091 .dev_start
= tap_dev_start
,
1092 .dev_stop
= tap_dev_stop
,
1093 .dev_close
= tap_dev_close
,
1094 .dev_configure
= tap_dev_configure
,
1095 .dev_infos_get
= tap_dev_info
,
1096 .rx_queue_setup
= tap_rx_queue_setup
,
1097 .tx_queue_setup
= tap_tx_queue_setup
,
1098 .rx_queue_release
= tap_rx_queue_release
,
1099 .tx_queue_release
= tap_tx_queue_release
,
1100 .flow_ctrl_get
= tap_flow_ctrl_get
,
1101 .flow_ctrl_set
= tap_flow_ctrl_set
,
1102 .link_update
= tap_link_update
,
1103 .dev_set_link_up
= tap_link_set_up
,
1104 .dev_set_link_down
= tap_link_set_down
,
1105 .promiscuous_enable
= tap_promisc_enable
,
1106 .promiscuous_disable
= tap_promisc_disable
,
1107 .allmulticast_enable
= tap_allmulti_enable
,
1108 .allmulticast_disable
= tap_allmulti_disable
,
1109 .mac_addr_set
= tap_mac_set
,
1110 .mtu_set
= tap_mtu_set
,
1111 .set_mc_addr_list
= tap_set_mc_addr_list
,
1112 .stats_get
= tap_stats_get
,
1113 .stats_reset
= tap_stats_reset
,
1114 .dev_supported_ptypes_get
= tap_dev_supported_ptypes_get
,
1115 .filter_ctrl
= tap_dev_filter_ctrl
,
1119 tap_kernel_support(struct pmd_internals
*pmd
)
1121 struct utsname utsname
;
1124 if (uname(&utsname
) == -1 ||
1125 sscanf(utsname
.release
, "%d.%d.%d",
1126 &ver
[0], &ver
[1], &ver
[2]) != 3)
1128 if (KERNEL_VERSION(ver
[0], ver
[1], ver
[2]) >= FLOWER_KERNEL_VERSION
)
1129 pmd
->flower_support
= 1;
1130 if (KERNEL_VERSION(ver
[0], ver
[1], ver
[2]) >=
1131 FLOWER_VLAN_KERNEL_VERSION
)
1132 pmd
->flower_vlan_support
= 1;
1137 eth_dev_tap_create(struct rte_vdev_device
*vdev
, char *tap_name
,
1140 int numa_node
= rte_socket_id();
1141 struct rte_eth_dev
*dev
;
1142 struct pmd_internals
*pmd
;
1143 struct rte_eth_dev_data
*data
;
1146 RTE_LOG(DEBUG
, PMD
, " TAP device on numa %u\n", rte_socket_id());
1148 data
= rte_zmalloc_socket(tap_name
, sizeof(*data
), 0, numa_node
);
1150 RTE_LOG(ERR
, PMD
, "TAP Failed to allocate data\n");
1154 dev
= rte_eth_vdev_allocate(vdev
, sizeof(*pmd
));
1156 RTE_LOG(ERR
, PMD
, "TAP Unable to allocate device struct\n");
1160 pmd
= dev
->data
->dev_private
;
1161 snprintf(pmd
->name
, sizeof(pmd
->name
), "%s", tap_name
);
1162 pmd
->nb_queues
= RTE_PMD_TAP_MAX_QUEUES
;
1164 pmd
->ioctl_sock
= socket(AF_INET
, SOCK_DGRAM
, 0);
1165 if (pmd
->ioctl_sock
== -1) {
1167 "TAP Unable to get a socket for management: %s\n",
1172 /* Setup some default values */
1173 rte_memcpy(data
, dev
->data
, sizeof(*data
));
1174 data
->dev_private
= pmd
;
1175 data
->dev_flags
= RTE_ETH_DEV_DETACHABLE
| RTE_ETH_DEV_INTR_LSC
;
1176 data
->numa_node
= numa_node
;
1177 data
->drv_name
= pmd_tap_drv
.driver
.name
;
1179 data
->dev_link
= pmd_link
;
1180 data
->mac_addrs
= &pmd
->eth_addr
;
1181 data
->nb_rx_queues
= pmd
->nb_queues
;
1182 data
->nb_tx_queues
= pmd
->nb_queues
;
1185 dev
->dev_ops
= &ops
;
1186 dev
->rx_pkt_burst
= pmd_rx_burst
;
1187 dev
->tx_pkt_burst
= pmd_tx_burst
;
1189 pmd
->intr_handle
.type
= RTE_INTR_HANDLE_EXT
;
1190 pmd
->intr_handle
.fd
= -1;
1192 /* Presetup the fds to -1 as being not valid */
1193 for (i
= 0; i
< RTE_PMD_TAP_MAX_QUEUES
; i
++) {
1194 pmd
->rxq
[i
].fd
= -1;
1195 pmd
->txq
[i
].fd
= -1;
1198 tap_kernel_support(pmd
);
1199 if (!pmd
->flower_support
)
1201 LIST_INIT(&pmd
->flows
);
1203 * If no netlink socket can be created, then it will fail when
1204 * creating/destroying flow rules.
1206 pmd
->nlsk_fd
= nl_init(0);
1207 if (strlen(remote_iface
)) {
1210 pmd
->remote_if_index
= if_nametoindex(remote_iface
);
1211 snprintf(pmd
->remote_iface
, RTE_ETH_NAME_MAX_LEN
,
1212 "%s", remote_iface
);
1213 if (!pmd
->remote_if_index
) {
1214 RTE_LOG(ERR
, PMD
, "Could not find %s ifindex: "
1215 "remote interface will remain unconfigured\n",
1219 if (tap_ioctl(pmd
, SIOCGIFHWADDR
, &ifr
, 0, REMOTE_ONLY
) < 0) {
1220 RTE_LOG(ERR
, PMD
, "Could not get remote MAC address\n");
1223 rte_memcpy(&pmd
->eth_addr
, ifr
.ifr_hwaddr
.sa_data
,
1226 eth_random_addr((uint8_t *)&pmd
->eth_addr
);
1232 RTE_LOG(DEBUG
, PMD
, "TAP Unable to initialize %s\n",
1233 rte_vdev_device_name(vdev
));
1240 set_interface_name(const char *key __rte_unused
,
1244 char *name
= (char *)extra_args
;
1247 snprintf(name
, RTE_ETH_NAME_MAX_LEN
- 1, "%s", value
);
1249 snprintf(name
, RTE_ETH_NAME_MAX_LEN
- 1, "%s%d",
1250 DEFAULT_TAP_NAME
, (tap_unit
- 1));
1256 set_interface_speed(const char *key __rte_unused
,
1260 *(int *)extra_args
= (value
) ? atoi(value
) : ETH_SPEED_NUM_10G
;
1266 set_remote_iface(const char *key __rte_unused
,
1270 char *name
= (char *)extra_args
;
1273 snprintf(name
, RTE_ETH_NAME_MAX_LEN
, "%s", value
);
1278 /* Open a TAP interface device.
1281 rte_pmd_tap_probe(struct rte_vdev_device
*dev
)
1283 const char *name
, *params
;
1285 struct rte_kvargs
*kvlist
= NULL
;
1287 char tap_name
[RTE_ETH_NAME_MAX_LEN
];
1288 char remote_iface
[RTE_ETH_NAME_MAX_LEN
];
1290 name
= rte_vdev_device_name(dev
);
1291 params
= rte_vdev_device_args(dev
);
1293 speed
= ETH_SPEED_NUM_10G
;
1294 snprintf(tap_name
, sizeof(tap_name
), "%s%d",
1295 DEFAULT_TAP_NAME
, tap_unit
++);
1296 memset(remote_iface
, 0, RTE_ETH_NAME_MAX_LEN
);
1298 if (params
&& (params
[0] != '\0')) {
1299 RTE_LOG(DEBUG
, PMD
, "paramaters (%s)\n", params
);
1301 kvlist
= rte_kvargs_parse(params
, valid_arguments
);
1303 if (rte_kvargs_count(kvlist
, ETH_TAP_SPEED_ARG
) == 1) {
1304 ret
= rte_kvargs_process(kvlist
,
1306 &set_interface_speed
,
1312 if (rte_kvargs_count(kvlist
, ETH_TAP_IFACE_ARG
) == 1) {
1313 ret
= rte_kvargs_process(kvlist
,
1315 &set_interface_name
,
1321 if (rte_kvargs_count(kvlist
, ETH_TAP_REMOTE_ARG
) == 1) {
1322 ret
= rte_kvargs_process(kvlist
,
1331 pmd_link
.link_speed
= speed
;
1333 RTE_LOG(NOTICE
, PMD
, "Initializing pmd_tap for %s as %s\n",
1336 ret
= eth_dev_tap_create(dev
, tap_name
, remote_iface
);
1340 RTE_LOG(ERR
, PMD
, "Failed to create pmd for %s as %s\n",
1342 tap_unit
--; /* Restore the unit number */
1344 rte_kvargs_free(kvlist
);
1349 /* detach a TAP device.
1352 rte_pmd_tap_remove(struct rte_vdev_device
*dev
)
1354 struct rte_eth_dev
*eth_dev
= NULL
;
1355 struct pmd_internals
*internals
;
1358 RTE_LOG(DEBUG
, PMD
, "Closing TUN/TAP Ethernet device on numa %u\n",
1361 /* find the ethdev entry */
1362 eth_dev
= rte_eth_dev_allocated(rte_vdev_device_name(dev
));
1366 internals
= eth_dev
->data
->dev_private
;
1367 if (internals
->flower_support
&& internals
->nlsk_fd
) {
1368 tap_flow_flush(eth_dev
, NULL
);
1369 tap_flow_implicit_flush(internals
, NULL
);
1370 nl_final(internals
->nlsk_fd
);
1372 for (i
= 0; i
< internals
->nb_queues
; i
++)
1373 if (internals
->rxq
[i
].fd
!= -1)
1374 close(internals
->rxq
[i
].fd
);
1376 close(internals
->ioctl_sock
);
1377 rte_free(eth_dev
->data
->dev_private
);
1378 rte_free(eth_dev
->data
);
1380 rte_eth_dev_release_port(eth_dev
);
1385 static struct rte_vdev_driver pmd_tap_drv
= {
1386 .probe
= rte_pmd_tap_probe
,
1387 .remove
= rte_pmd_tap_remove
,
1389 RTE_PMD_REGISTER_VDEV(net_tap
, pmd_tap_drv
);
1390 RTE_PMD_REGISTER_ALIAS(net_tap
, eth_tap
);
1391 RTE_PMD_REGISTER_PARAM_STRING(net_tap
,
1392 ETH_TAP_IFACE_ARG
"=<string> "
1393 ETH_TAP_SPEED_ARG
"=<int> "
1394 ETH_TAP_REMOTE_ARG
"=<string>");