/*
 * VXLAN: Virtual eXtensible Local Area Network
 *
 * Copyright (c) 2012-2013 Vyatta Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/skbuff.h>
#include <linux/rculist.h>
#include <linux/netdevice.h>
#include <linux/udp.h>
#include <linux/igmp.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/hash.h>
#include <linux/ethtool.h>
#include <linux/netdev_features.h>

#include <net/ndisc.h>
#include <net/ip_tunnels.h>
#include <net/udp_tunnel.h>
#include <net/rtnetlink.h>
#include <net/route.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/vxlan.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/addrconf.h>
#include <net/ip6_tunnel.h>
#include <net/ip6_checksum.h>
#endif
#include <net/dst_metadata.h>

#ifndef HAVE_METADATA_DST
#include "vport-netdev.h"
#endif
#define VXLAN_VERSION	"0.1"

#define PORT_HASH_BITS	8
#define PORT_HASH_SIZE  (1 << PORT_HASH_BITS)
#define FDB_AGE_DEFAULT 300 /* 5 min */
#define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */
/* UDP port for VXLAN traffic.
 * The IANA assigned port is 4789, but the Linux default is 8472
 * for compatibility with early adopters.
 */
static unsigned short vxlan_port __read_mostly = 8472;
module_param_named(udp_port, vxlan_port, ushort, 0444);
MODULE_PARM_DESC(udp_port, "Destination UDP port");
static int vxlan_net_id;
static struct rtnl_link_ops vxlan_link_ops;

static const u8 all_zeros_mac[ETH_ALEN];

static struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
					 bool no_share, u32 flags);
/* per-network namespace private data for this module */
struct vxlan_net {
	struct list_head  vxlan_list;
	struct hlist_head sock_list[PORT_HASH_SIZE];
	spinlock_t	  sock_lock;
};
/* Forwarding table entry */
struct vxlan_fdb {
	struct hlist_node hlist;	/* linked list of entries */
	struct rcu_head	  rcu;
	unsigned long	  updated;	/* jiffies */
	unsigned long	  used;
	struct list_head  remotes;
	u8		  eth_addr[ETH_ALEN];
	u16		  state;	/* see ndm_state */
	u8		  flags;	/* see ndm_flags */
};
/* salt for hash table */
static u32 vxlan_salt __read_mostly;
static struct workqueue_struct *vxlan_wq;
static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
{
	return vs->flags & VXLAN_F_COLLECT_METADATA ||
	       ip_tunnel_collect_metadata();
}
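/* "Collect metadata" (flow-based) operation: rather than one netdev per VNI,
 * a single device accepts every VNI and hands the outer header fields to the
 * caller as tunnel metadata (a dst_metadata/ip_tunnel_info attached to the
 * skb).  This is the mode Open vSwitch uses; the receive path below maps all
 * packets to VNI 0 and builds the metadata dst from the outer headers.
 */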
#if IS_ENABLED(CONFIG_IPV6)
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
	if (a->sa.sa_family != b->sa.sa_family)
		return false;
	if (a->sa.sa_family == AF_INET6)
		return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
	else
		return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
}

static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
{
	if (ipa->sa.sa_family == AF_INET6)
		return ipv6_addr_any(&ipa->sin6.sin6_addr);
	else
		return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
}

static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
{
	if (ipa->sa.sa_family == AF_INET6)
		return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr);
	else
		return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
}
#else /* !CONFIG_IPV6 */

static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
	return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
}

static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
{
	return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
}

static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
{
	return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
}

#endif
/* Virtual Network hash table head */
static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id)
{
	return &vs->vni_list[hash_32(id, VNI_HASH_BITS)];
}

/* Socket hash table head */
static inline struct hlist_head *vs_head(struct net *net, __be16 port)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);

	return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
}
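/* Two levels of hashing are used on receive: each namespace keeps a small
 * table of open VXLAN sockets keyed by UDP port (vs_head), and each socket
 * keeps a table of devices keyed by VNI (vni_head).  A lookup is therefore
 * "find the socket for this port, then find the device for this VNI".
 */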
/* First remote destination for a forwarding entry.
 * Guaranteed to be non-NULL because remotes are never deleted.
 */
static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
{
	return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
}

static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
{
	return list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
}
/* Find VXLAN socket based on network namespace, address family and UDP port
 * and enabled unshareable flags.
 */
static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
					  __be16 port, u32 flags)
{
	struct vxlan_sock *vs;

	flags &= VXLAN_F_RCV_FLAGS;

	hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
		if (inet_sport(vs->sock->sk) == port &&
		    vxlan_get_sk_family(vs) == family &&
		    vs->flags == flags)
			return vs;
	}

	return NULL;
}
static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, u32 id)
{
	struct vxlan_dev *vxlan;

	hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) {
		if (vxlan->default_dst.remote_vni == id)
			return vxlan;
	}

	return NULL;
}
/* Look up VNI in a per net namespace table */
static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id,
					sa_family_t family, __be16 port,
					u32 flags)
{
	struct vxlan_sock *vs;

	vs = vxlan_find_sock(net, family, port, flags);
	if (!vs)
		return NULL;

	return vxlan_vs_find_vni(vs, id);
}
/* Fill in neighbour message in skbuff. */
static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
			  const struct vxlan_fdb *fdb,
			  u32 portid, u32 seq, int type, unsigned int flags,
			  const struct vxlan_rdst *rdst)
{
	return 0;
}
static inline size_t vxlan_nlmsg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct ndmsg))
		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
		+ nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */
		+ nla_total_size(sizeof(__be16)) /* NDA_PORT */
		+ nla_total_size(sizeof(__be32)) /* NDA_VNI */
		+ nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
		+ nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */
		+ nla_total_size(sizeof(struct nda_cacheinfo));
}
static void vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			     struct vxlan_rdst *rd, int type)
{
	struct net *net = dev_net(vxlan->dev);
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
	if (skb == NULL)
		goto errout;

	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}
/* Hash Ethernet address */
static u32 eth_hash(const unsigned char *addr)
{
	u64 value = get_unaligned((u64 *)addr);

	/* only want 6 bytes */
#ifdef __BIG_ENDIAN
	value >>= 16;
#else
	value <<= 16;
#endif
	return hash_64(value, FDB_HASH_BITS);
}
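/* The FDB is a hash table of FDB_HASH_SIZE buckets.  eth_hash() loads eight
 * bytes starting at the MAC address, discards the two bytes that are not part
 * of the 6-byte address (the shift direction depends on endianness) and folds
 * the rest with hash_64() down to FDB_HASH_BITS bits.
 */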
/* Hash chain to use given mac address */
static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
						const u8 *mac)
{
	return &vxlan->fdb_head[eth_hash(mac)];
}
/* Look up Ethernet address in forwarding table */
static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
					  const u8 *mac)
{
	struct hlist_head *head = vxlan_fdb_head(vxlan, mac);
	struct vxlan_fdb *f;

	hlist_for_each_entry_rcu(f, head, hlist) {
		if (ether_addr_equal(mac, f->eth_addr))
			return f;
	}

	return NULL;
}
static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
					const u8 *mac)
{
	struct vxlan_fdb *f;

	f = __vxlan_find_mac(vxlan, mac);
	if (f)
		f->used = jiffies;

	return f;
}
/* caller should hold vxlan->hash_lock */
static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
					      union vxlan_addr *ip, __be16 port,
					      __u32 vni, __u32 ifindex)
{
	struct vxlan_rdst *rd;

	list_for_each_entry(rd, &f->remotes, list) {
		if (vxlan_addr_equal(&rd->remote_ip, ip) &&
		    rd->remote_port == port &&
		    rd->remote_vni == vni &&
		    rd->remote_ifindex == ifindex)
			return rd;
	}

	return NULL;
}
/* Replace destination of unicast mac */
static int vxlan_fdb_replace(struct vxlan_fdb *f,
			     union vxlan_addr *ip, __be16 port, __u32 vni, __u32 ifindex)
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;

	rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
	if (!rd)
		return 0;
	rd->remote_ip = *ip;
	rd->remote_port = port;
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;
	return 1;
}
/* Add/update destinations for multicast */
static int vxlan_fdb_append(struct vxlan_fdb *f,
			    union vxlan_addr *ip, __be16 port, __u32 vni,
			    __u32 ifindex, struct vxlan_rdst **rdp)
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;

	rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
	if (rd == NULL)
		return -ENOBUFS;
	rd->remote_ip = *ip;
	rd->remote_port = port;
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;

	list_add_tail_rcu(&rd->list, &f->remotes);

	*rdp = rd;
	return 1;
}
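/* An FDB entry maps one MAC address to a list of remote destinations
 * (vxlan_rdst).  Unicast addresses keep a single remote that may be replaced
 * in place (vxlan_fdb_replace); multicast addresses and the all-zeros default
 * entry may accumulate several remotes (vxlan_fdb_append), and a transmitted
 * frame is then replicated to each of them.
 */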
394 #ifdef HAVE_UDP_OFFLOAD
395 #ifdef HAVE_NETIF_F_GSO_TUNNEL_REMCSUM
396 static struct vxlanhdr
*vxlan_gro_remcsum(struct sk_buff
*skb
,
398 struct vxlanhdr
*vh
, size_t hdrlen
,
399 u32 data
, struct gro_remcsum
*grc
,
402 size_t start
, offset
;
404 if (skb
->remcsum_offload
)
407 if (!NAPI_GRO_CB(skb
)->csum_valid
)
410 start
= (data
& VXLAN_RCO_MASK
) << VXLAN_RCO_SHIFT
;
411 offset
= start
+ ((data
& VXLAN_RCO_UDP
) ?
412 offsetof(struct udphdr
, check
) :
413 offsetof(struct tcphdr
, check
));
415 vh
= skb_gro_remcsum_process(skb
, (void *)vh
, off
, hdrlen
,
416 start
, offset
, grc
, nopartial
);
418 skb
->remcsum_offload
= 1;
423 static struct vxlanhdr
*vxlan_gro_remcsum(struct sk_buff
*skb
,
425 struct vxlanhdr
*vh
, size_t hdrlen
,
426 u32 data
, struct gro_remcsum
*grc
,
433 #ifndef HAVE_UDP_OFFLOAD_ARG_UOFF
434 static struct sk_buff
**vxlan_gro_receive(struct sk_buff
**head
,
437 static struct sk_buff
**vxlan_gro_receive(struct sk_buff
**head
,
439 struct udp_offload
*uoff
)
442 #ifdef HAVE_UDP_OFFLOAD_ARG_UOFF
443 struct vxlan_sock
*vs
= container_of(uoff
, struct vxlan_sock
,
446 struct vxlan_sock
*vs
= NULL
;
448 struct sk_buff
*p
, **pp
= NULL
;
449 struct vxlanhdr
*vh
, *vh2
;
450 unsigned int hlen
, off_vx
;
453 struct gro_remcsum grc
;
455 skb_gro_remcsum_init(&grc
);
457 off_vx
= skb_gro_offset(skb
);
458 hlen
= off_vx
+ sizeof(*vh
);
459 vh
= skb_gro_header_fast(skb
, off_vx
);
460 if (skb_gro_header_hard(skb
, hlen
)) {
461 vh
= skb_gro_header_slow(skb
, hlen
, off_vx
);
466 skb_gro_postpull_rcsum(skb
, vh
, sizeof(struct vxlanhdr
));
468 flags
= ntohl(vh
->vx_flags
);
470 if ((flags
& VXLAN_HF_RCO
) && vs
&& (vs
->flags
& VXLAN_F_REMCSUM_RX
)) {
472 vh
= vxlan_gro_remcsum(skb
, off_vx
, vh
, sizeof(struct vxlanhdr
),
473 ntohl(vh
->vx_vni
), &grc
,
475 VXLAN_F_REMCSUM_NOPARTIAL
));
481 skb_gro_pull(skb
, sizeof(struct vxlanhdr
)); /* pull vxlan header */
485 for (p
= *head
; p
; p
= p
->next
) {
486 if (!NAPI_GRO_CB(p
)->same_flow
)
489 vh2
= (struct vxlanhdr
*)(p
->data
+ off_vx
);
490 if (vh
->vx_flags
!= vh2
->vx_flags
||
491 vh
->vx_vni
!= vh2
->vx_vni
) {
492 NAPI_GRO_CB(p
)->same_flow
= 0;
497 pp
= eth_gro_receive(head
, skb
);
500 skb_gro_remcsum_cleanup(skb
, &grc
);
501 NAPI_GRO_CB(skb
)->flush
|= flush
;
506 #ifndef HAVE_UDP_OFFLOAD_ARG_UOFF
507 static int vxlan_gro_complete(struct sk_buff
*skb
, int nhoff
)
509 static int vxlan_gro_complete(struct sk_buff
*skb
, int nhoff
,
510 struct udp_offload
*uoff
)
513 udp_tunnel_gro_complete(skb
, nhoff
);
515 return eth_gro_complete(skb
, nhoff
+ sizeof(struct vxlanhdr
));
518 /* Notify netdevs that UDP port started listening */
519 static void vxlan_notify_add_rx_port(struct vxlan_sock
*vs
)
521 struct net_device
*dev
;
522 struct sock
*sk
= vs
->sock
->sk
;
523 struct net
*net
= sock_net(sk
);
524 sa_family_t sa_family
= vxlan_get_sk_family(vs
);
525 __be16 port
= inet_sk(sk
)->inet_sport
;
528 if (sa_family
== AF_INET
) {
529 err
= udp_add_offload(&vs
->udp_offloads
);
531 pr_warn("vxlan: udp_add_offload failed with status %d\n", err
);
535 for_each_netdev_rcu(net
, dev
) {
536 if (dev
->netdev_ops
->ndo_add_vxlan_port
)
537 dev
->netdev_ops
->ndo_add_vxlan_port(dev
, sa_family
,
543 /* Notify netdevs that UDP port is no more listening */
544 static void vxlan_notify_del_rx_port(struct vxlan_sock
*vs
)
546 struct net_device
*dev
;
547 struct sock
*sk
= vs
->sock
->sk
;
548 struct net
*net
= sock_net(sk
);
549 sa_family_t sa_family
= vxlan_get_sk_family(vs
);
550 __be16 port
= inet_sk(sk
)->inet_sport
;
553 for_each_netdev_rcu(net
, dev
) {
554 if (dev
->netdev_ops
->ndo_del_vxlan_port
)
555 dev
->netdev_ops
->ndo_del_vxlan_port(dev
, sa_family
,
560 if (sa_family
== AF_INET
)
561 udp_del_offload(&vs
->udp_offloads
);
565 /* Add new entry to forwarding table -- assumes lock held */
566 static int vxlan_fdb_create(struct vxlan_dev
*vxlan
,
567 const u8
*mac
, union vxlan_addr
*ip
,
568 __u16 state
, __u16 flags
,
569 __be16 port
, __u32 vni
, __u32 ifindex
,
572 struct vxlan_rdst
*rd
= NULL
;
576 f
= __vxlan_find_mac(vxlan
, mac
);
578 if (flags
& NLM_F_EXCL
) {
579 netdev_dbg(vxlan
->dev
,
580 "lost race to create %pM\n", mac
);
583 if (f
->state
!= state
) {
585 f
->updated
= jiffies
;
588 if (f
->flags
!= ndm_flags
) {
589 f
->flags
= ndm_flags
;
590 f
->updated
= jiffies
;
593 if ((flags
& NLM_F_REPLACE
)) {
594 /* Only change unicasts */
595 if (!(is_multicast_ether_addr(f
->eth_addr
) ||
596 is_zero_ether_addr(f
->eth_addr
))) {
597 notify
|= vxlan_fdb_replace(f
, ip
, port
, vni
,
602 if ((flags
& NLM_F_APPEND
) &&
603 (is_multicast_ether_addr(f
->eth_addr
) ||
604 is_zero_ether_addr(f
->eth_addr
))) {
605 int rc
= vxlan_fdb_append(f
, ip
, port
, vni
, ifindex
,
613 if (!(flags
& NLM_F_CREATE
))
616 if (vxlan
->cfg
.addrmax
&&
617 vxlan
->addrcnt
>= vxlan
->cfg
.addrmax
)
620 /* Disallow replace to add a multicast entry */
621 if ((flags
& NLM_F_REPLACE
) &&
622 (is_multicast_ether_addr(mac
) || is_zero_ether_addr(mac
)))
625 netdev_dbg(vxlan
->dev
, "add %pM -> %pIS\n", mac
, ip
);
626 f
= kmalloc(sizeof(*f
), GFP_ATOMIC
);
632 f
->flags
= ndm_flags
;
633 f
->updated
= f
->used
= jiffies
;
634 INIT_LIST_HEAD(&f
->remotes
);
635 memcpy(f
->eth_addr
, mac
, ETH_ALEN
);
637 vxlan_fdb_append(f
, ip
, port
, vni
, ifindex
, &rd
);
640 hlist_add_head_rcu(&f
->hlist
,
641 vxlan_fdb_head(vxlan
, mac
));
646 rd
= first_remote_rtnl(f
);
647 vxlan_fdb_notify(vxlan
, f
, rd
, RTM_NEWNEIGH
);
static void vxlan_fdb_free(struct rcu_head *head)
{
	struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu);
	struct vxlan_rdst *rd, *nd;

	list_for_each_entry_safe(rd, nd, &f->remotes, list)
		kfree(rd);
	kfree(f);
}
static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
{
	netdev_dbg(vxlan->dev,
		   "delete %pM\n", f->eth_addr);

	--vxlan->addrcnt;
	vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_DELNEIGH);

	hlist_del_rcu(&f->hlist);
	call_rcu(&f->rcu, vxlan_fdb_free);
}
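/* FDB entries are freed under RCU: the entry is unhashed with hlist_del_rcu()
 * while holding hash_lock, and the actual kfree() of the entry and its
 * remotes happens in vxlan_fdb_free() after a grace period, so lockless
 * readers walking the chain never see freed memory.
 */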
675 /* Watch incoming packets to learn mapping between Ethernet address
676 * and Tunnel endpoint.
677 * Return true if packet is bogus and should be dropped.
679 static bool vxlan_snoop(struct net_device
*dev
,
680 union vxlan_addr
*src_ip
, const u8
*src_mac
)
682 struct vxlan_dev
*vxlan
= netdev_priv(dev
);
685 f
= vxlan_find_mac(vxlan
, src_mac
);
687 struct vxlan_rdst
*rdst
= first_remote_rcu(f
);
689 if (likely(vxlan_addr_equal(&rdst
->remote_ip
, src_ip
)))
692 /* Don't migrate static entries, drop packets */
693 if (f
->state
& NUD_NOARP
)
698 "%pM migrated from %pIS to %pIS\n",
699 src_mac
, &rdst
->remote_ip
.sa
, &src_ip
->sa
);
701 rdst
->remote_ip
= *src_ip
;
702 f
->updated
= jiffies
;
703 vxlan_fdb_notify(vxlan
, f
, rdst
, RTM_NEWNEIGH
);
705 /* learned new entry */
706 spin_lock(&vxlan
->hash_lock
);
708 /* close off race between vxlan_flush and incoming packets */
709 if (netif_running(dev
))
710 vxlan_fdb_create(vxlan
, src_mac
, src_ip
,
712 NLM_F_EXCL
|NLM_F_CREATE
,
714 vxlan
->default_dst
.remote_vni
,
716 spin_unlock(&vxlan
->hash_lock
);
/* See if multicast group is already in use by other ID */
static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev)
{
	struct vxlan_dev *vxlan;

	/* The vxlan_sock is only used by dev, leaving group has
	 * no effect on other vxlan devices.
	 */
	if (atomic_read(&dev->vn_sock->refcnt) == 1)
		return false;

	list_for_each_entry(vxlan, &vn->vxlan_list, next) {
		if (!netif_running(vxlan->dev) || vxlan == dev)
			continue;

		if (vxlan->vn_sock != dev->vn_sock)
			continue;

		if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip,
				      &dev->default_dst.remote_ip))
			continue;

		if (vxlan->default_dst.remote_ifindex !=
		    dev->default_dst.remote_ifindex)
			continue;

		return true;
	}

	return false;
}
static void vxlan_sock_release(struct vxlan_sock *vs)
{
	struct sock *sk = vs->sock->sk;
	struct net *net = sock_net(sk);
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);

	if (!atomic_dec_and_test(&vs->refcnt))
		return;

	spin_lock(&vn->sock_lock);
	hlist_del_rcu(&vs->hlist);
#ifdef HAVE_UDP_OFFLOAD
	vxlan_notify_del_rx_port(vs);
#endif
	spin_unlock(&vn->sock_lock);

	queue_work(vxlan_wq, &vs->del_work);
}
/* Update multicast group membership when first VNI on
 * multicast address is brought up
 */
static int vxlan_igmp_join(struct vxlan_dev *vxlan)
{
	return -EINVAL;
}

/* Inverse of vxlan_igmp_join when last VNI is brought down */
static int vxlan_igmp_leave(struct vxlan_dev *vxlan)
{
	return -EINVAL;
}
787 #ifdef HAVE_VXLAN_HF_RCO
788 static struct vxlanhdr
*vxlan_remcsum(struct sk_buff
*skb
, struct vxlanhdr
*vh
,
789 size_t hdrlen
, u32 data
, bool nopartial
)
791 size_t start
, offset
, plen
;
793 if (skb
->remcsum_offload
)
796 start
= (data
& VXLAN_RCO_MASK
) << VXLAN_RCO_SHIFT
;
797 offset
= start
+ ((data
& VXLAN_RCO_UDP
) ?
798 offsetof(struct udphdr
, check
) :
799 offsetof(struct tcphdr
, check
));
801 plen
= hdrlen
+ offset
+ sizeof(u16
);
803 if (!pskb_may_pull(skb
, plen
))
806 vh
= (struct vxlanhdr
*)(udp_hdr(skb
) + 1);
808 skb_remcsum_process(skb
, (void *)vh
+ hdrlen
, start
, offset
,
815 static void vxlan_rcv(struct vxlan_sock
*vs
, struct sk_buff
*skb
,
816 struct vxlan_metadata
*md
, u32 vni
,
817 struct metadata_dst
*tun_dst
)
819 struct iphdr
*oip
= NULL
;
820 struct ipv6hdr
*oip6
= NULL
;
821 struct vxlan_dev
*vxlan
;
822 #ifdef HAVE_DEV_TSTATS
823 struct pcpu_sw_netstats
*stats
;
825 union vxlan_addr saddr
;
828 /* For flow based devices, map all packets to VNI 0 */
829 if (vs
->flags
& VXLAN_F_COLLECT_METADATA
)
832 /* Is this VNI defined? */
833 vxlan
= vxlan_vs_find_vni(vs
, vni
);
837 skb_reset_mac_header(skb
);
838 skb_scrub_packet(skb
, !net_eq(vxlan
->net
, dev_net(vxlan
->dev
)));
839 skb
->protocol
= eth_type_trans(skb
, vxlan
->dev
);
840 skb_postpull_rcsum(skb
, eth_hdr(skb
), ETH_HLEN
);
842 /* Ignore packet loops (and multicast echo) */
843 if (ether_addr_equal(eth_hdr(skb
)->h_source
, vxlan
->dev
->dev_addr
))
846 /* Get data from the outer IP header */
847 if (vxlan_get_sk_family(vs
) == AF_INET
) {
849 saddr
.sin
.sin_addr
.s_addr
= oip
->saddr
;
850 saddr
.sa
.sa_family
= AF_INET
;
851 #if IS_ENABLED(CONFIG_IPV6)
853 oip6
= ipv6_hdr(skb
);
854 saddr
.sin6
.sin6_addr
= oip6
->saddr
;
855 saddr
.sa
.sa_family
= AF_INET6
;
860 ovs_skb_dst_set(skb
, (struct dst_entry
*)tun_dst
);
866 if ((vxlan
->flags
& VXLAN_F_LEARN
) &&
867 vxlan_snoop(skb
->dev
, &saddr
, eth_hdr(skb
)->h_source
))
870 skb_reset_network_header(skb
);
871 /* In flow-based mode, GBP is carried in dst_metadata */
872 if (!(vs
->flags
& VXLAN_F_COLLECT_METADATA
))
876 err
= IP6_ECN_decapsulate(oip6
, skb
);
878 err
= IP_ECN_decapsulate(oip
, skb
);
882 ++vxlan
->dev
->stats
.rx_frame_errors
;
883 ++vxlan
->dev
->stats
.rx_errors
;
888 #ifdef HAVE_DEV_TSTATS
889 stats
= this_cpu_ptr((struct pcpu_sw_netstats __percpu
*)vxlan
->dev
->tstats
);
890 u64_stats_update_begin(&stats
->syncp
);
892 stats
->rx_bytes
+= skb
->len
;
893 u64_stats_update_end(&stats
->syncp
);
895 netdev_port_receive(skb
, skb_tunnel_info(skb
));
899 /* Consume bad packet */
903 /* Callback from net/ipv4/udp.c to receive packets */
904 static int vxlan_udp_encap_recv(struct sock
*sk
, struct sk_buff
*skb
)
906 struct vxlan_sock
*vs
;
907 struct vxlanhdr
*vxh
;
909 struct vxlan_metadata _md
;
910 struct vxlan_metadata
*md
= &_md
;
912 struct metadata_dst dst
;
913 char buf
[sizeof(struct metadata_dst
) + sizeof(*md
)];
916 /* Need Vxlan and inner Ethernet header to be present */
917 if (!pskb_may_pull(skb
, VXLAN_HLEN
))
920 vxh
= (struct vxlanhdr
*)(udp_hdr(skb
) + 1);
921 flags
= ntohl(vxh
->vx_flags
);
922 vni
= ntohl(vxh
->vx_vni
);
924 if (flags
& VXLAN_HF_VNI
) {
925 flags
&= ~VXLAN_HF_VNI
;
927 /* VNI flag always required to be set */
931 if (iptunnel_pull_header(skb
, VXLAN_HLEN
, htons(ETH_P_TEB
)))
933 vxh
= (struct vxlanhdr
*)(udp_hdr(skb
) + 1);
935 vs
= rcu_dereference_sk_user_data(sk
);
939 #ifdef HAVE_VXLAN_HF_RCO
940 if ((flags
& VXLAN_HF_RCO
) && (vs
->flags
& VXLAN_F_REMCSUM_RX
)) {
941 vxh
= vxlan_remcsum(skb
, vxh
, sizeof(struct vxlanhdr
), vni
,
942 !!(vs
->flags
& VXLAN_F_REMCSUM_NOPARTIAL
));
946 flags
&= ~VXLAN_HF_RCO
;
947 vni
&= VXLAN_VNI_MASK
;
951 if (vxlan_collect_metadata(vs
)) {
952 ovs_udp_tun_rx_dst(&buf
.dst
.u
.tun_info
, skb
, AF_INET
, TUNNEL_KEY
,
953 cpu_to_be64(vni
>> 8), sizeof(*md
));
955 md
= ip_tunnel_info_opts(&buf
.dst
.u
.tun_info
);
957 memset(md
, 0, sizeof(*md
));
960 /* For backwards compatibility, only allow reserved fields to be
961 * used by VXLAN extensions if explicitly requested.
963 if ((flags
& VXLAN_HF_GBP
) && (vs
->flags
& VXLAN_F_GBP
)) {
964 struct vxlanhdr_gbp
*gbp
;
966 gbp
= (struct vxlanhdr_gbp
*)vxh
;
967 md
->gbp
= ntohs(gbp
->policy_id
);
969 buf
.dst
.u
.tun_info
.key
.tun_flags
|= TUNNEL_VXLAN_OPT
;
972 md
->gbp
|= VXLAN_GBP_DONT_LEARN
;
974 if (gbp
->policy_applied
)
975 md
->gbp
|= VXLAN_GBP_POLICY_APPLIED
;
977 flags
&= ~VXLAN_GBP_USED_BITS
;
980 if (flags
|| vni
& ~VXLAN_VNI_MASK
) {
		/* If there are any unprocessed flags remaining treat
		 * this as a malformed packet. This behavior diverges from
		 * VXLAN RFC (RFC7348) which stipulates that bits in reserved
		 * fields are to be ignored. The approach here maintains
		 * compatibility with previous stack code, and also is more
		 * robust and provides a little more security in adding
		 * extensions to VXLAN.
		 */
993 vxlan_rcv(vs
, skb
, md
, vni
>> 8, &buf
.dst
);
997 /* Consume bad packet */
1002 netdev_dbg(skb
->dev
, "invalid vxlan flags=%#x vni=%#x\n",
1003 ntohl(vxh
->vx_flags
), ntohl(vxh
->vx_vni
));
1006 /* Return non vxlan pkt */
static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags,
				struct vxlan_metadata *md)
{
	struct vxlanhdr_gbp *gbp;

	if (!md->gbp)
		return;

	gbp = (struct vxlanhdr_gbp *)vxh;
	vxh->vx_flags |= htonl(VXLAN_HF_GBP);

	if (md->gbp & VXLAN_GBP_DONT_LEARN)
		gbp->dont_learn = 1;

	if (md->gbp & VXLAN_GBP_POLICY_APPLIED)
		gbp->policy_applied = 1;

	gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
}
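/* Group Based Policy (GBP) is a VXLAN extension (draft-smith-vxlan-group-
 * policy) that reuses reserved bits of the VXLAN header to carry a 16-bit
 * policy ID plus "don't learn" and "policy applied" flags.  On transmit the
 * bits come from md->gbp (fed from the tunnel metadata options or skb->mark
 * in the xmit path below); on receive they are decoded back into md->gbp.
 */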
1030 #if IS_ENABLED(CONFIG_IPV6)
1031 static int vxlan6_xmit_skb(struct dst_entry
*dst
, struct sock
*sk
,
1032 struct sk_buff
*skb
,
1033 struct net_device
*dev
, struct in6_addr
*saddr
,
1034 struct in6_addr
*daddr
, __u8 prio
, __u8 ttl
,
1035 __be16 src_port
, __be16 dst_port
, __be32 vni
,
1036 struct vxlan_metadata
*md
, bool xnet
, u32 vxflags
)
1038 struct vxlanhdr
*vxh
;
1041 bool udp_sum
= !(vxflags
& VXLAN_F_UDP_ZERO_CSUM6_TX
);
1044 if ((vxflags
& VXLAN_F_REMCSUM_TX
) &&
1045 skb
->ip_summed
== CHECKSUM_PARTIAL
) {
1046 int csum_start
= skb_checksum_start_offset(skb
);
1048 if (csum_start
<= VXLAN_MAX_REMCSUM_START
&&
1049 !(csum_start
& VXLAN_RCO_SHIFT_MASK
) &&
1050 (skb
->csum_offset
== offsetof(struct udphdr
, check
) ||
1051 skb
->csum_offset
== offsetof(struct tcphdr
, check
))) {
1053 type
|= SKB_GSO_TUNNEL_REMCSUM
;
1054 /* Add support for remote csum. */
1055 if (!SKB_GSO_TUNNEL_REMCSUM
) {
1063 skb_scrub_packet(skb
, xnet
);
1065 min_headroom
= LL_RESERVED_SPACE(dst
->dev
) + dst
->header_len
1066 + VXLAN_HLEN
+ sizeof(struct ipv6hdr
)
1067 + (skb_vlan_tag_present(skb
) ? VLAN_HLEN
: 0);
1069 /* Need space for new headers (invalidates iph ptr) */
1070 err
= skb_cow_head(skb
, min_headroom
);
1071 if (unlikely(err
)) {
1076 skb
= vlan_hwaccel_push_inside(skb
);
1077 if (WARN_ON(!skb
)) {
1082 skb
= udp_tunnel_handle_offloads(skb
, udp_sum
, type
, true);
1088 vxh
= (struct vxlanhdr
*) __skb_push(skb
, sizeof(*vxh
));
1089 vxh
->vx_flags
= htonl(VXLAN_HF_VNI
);
1092 if (type
& SKB_GSO_TUNNEL_REMCSUM
) {
1093 u16 hdrlen
= sizeof(struct vxlanhdr
);
1094 u32 data
= (skb_checksum_start_offset(skb
) - hdrlen
) >>
1097 if (skb
->csum_offset
== offsetof(struct udphdr
, check
))
1098 data
|= VXLAN_RCO_UDP
;
1100 vxh
->vx_vni
|= htonl(data
);
1101 vxh
->vx_flags
|= htonl(VXLAN_HF_RCO
);
1103 if (!skb_is_gso(skb
)) {
1104 skb
->ip_summed
= CHECKSUM_NONE
;
1105 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0)
1106 skb
->encapsulation
= 0;
1111 if (vxflags
& VXLAN_F_GBP
)
1112 vxlan_build_gbp_hdr(vxh
, vxflags
, md
);
1114 ovs_skb_set_inner_protocol(skb
, htons(ETH_P_TEB
));
1116 udp_tunnel6_xmit_skb(dst
, sk
, skb
, dev
, saddr
, daddr
, prio
,
1117 ttl
, src_port
, dst_port
,
1118 !!(vxflags
& VXLAN_F_UDP_ZERO_CSUM6_TX
));
1126 static int vxlan_xmit_skb(struct rtable
*rt
, struct sock
*sk
, struct sk_buff
*skb
,
1127 __be32 src
, __be32 dst
, __u8 tos
, __u8 ttl
, __be16 df
,
1128 __be16 src_port
, __be16 dst_port
, __be32 vni
,
1129 struct vxlan_metadata
*md
, bool xnet
, u32 vxflags
)
1131 struct vxlanhdr
*vxh
;
1134 bool udp_sum
= !!(vxflags
& VXLAN_F_UDP_CSUM
);
1137 if ((vxflags
& VXLAN_F_REMCSUM_TX
) &&
1138 skb
->ip_summed
== CHECKSUM_PARTIAL
) {
1139 int csum_start
= skb_checksum_start_offset(skb
);
1141 if (csum_start
<= VXLAN_MAX_REMCSUM_START
&&
1142 !(csum_start
& VXLAN_RCO_SHIFT_MASK
) &&
1143 (skb
->csum_offset
== offsetof(struct udphdr
, check
) ||
1144 skb
->csum_offset
== offsetof(struct tcphdr
, check
))) {
1146 type
|= SKB_GSO_TUNNEL_REMCSUM
;
1148 if (!SKB_GSO_TUNNEL_REMCSUM
) {
1155 min_headroom
= LL_RESERVED_SPACE(rt_dst(rt
).dev
) + rt_dst(rt
).header_len
1156 + VXLAN_HLEN
+ sizeof(struct iphdr
)
1157 + (skb_vlan_tag_present(skb
) ? VLAN_HLEN
: 0);
1159 /* Need space for new headers (invalidates iph ptr) */
1160 err
= skb_cow_head(skb
, min_headroom
);
1161 if (unlikely(err
)) {
1166 skb
= vlan_hwaccel_push_inside(skb
);
1170 skb
= udp_tunnel_handle_offloads(skb
, udp_sum
, type
, true);
1172 return PTR_ERR(skb
);
1174 vxh
= (struct vxlanhdr
*) __skb_push(skb
, sizeof(*vxh
));
1175 vxh
->vx_flags
= htonl(VXLAN_HF_VNI
);
1178 if (type
& SKB_GSO_TUNNEL_REMCSUM
) {
1179 u16 hdrlen
= sizeof(struct vxlanhdr
);
1180 u32 data
= (skb_checksum_start_offset(skb
) - hdrlen
) >>
1183 if (skb
->csum_offset
== offsetof(struct udphdr
, check
))
1184 data
|= VXLAN_RCO_UDP
;
1186 vxh
->vx_vni
|= htonl(data
);
1187 vxh
->vx_flags
|= htonl(VXLAN_HF_RCO
);
1189 if (!skb_is_gso(skb
)) {
1190 skb
->ip_summed
= CHECKSUM_NONE
;
1191 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0)
1192 skb
->encapsulation
= 0;
1196 if (vxflags
& VXLAN_F_GBP
)
1197 vxlan_build_gbp_hdr(vxh
, vxflags
, md
);
1199 ovs_skb_set_inner_protocol(skb
, htons(ETH_P_TEB
));
1201 return udp_tunnel_xmit_skb(rt
, sk
, skb
, src
, dst
, tos
,
1202 ttl
, df
, src_port
, dst_port
, xnet
,
1203 !(vxflags
& VXLAN_F_UDP_CSUM
));
1206 static void vxlan_xmit_one(struct sk_buff
*skb
, struct net_device
*dev
,
1207 struct vxlan_rdst
*rdst
, bool did_rsc
)
1209 struct ip_tunnel_info
*info
;
1210 struct vxlan_dev
*vxlan
= netdev_priv(dev
);
1211 struct sock
*sk
= vxlan
->vn_sock
->sock
->sk
;
1212 unsigned short family
= vxlan_get_sk_family(vxlan
->vn_sock
);
1213 struct rtable
*rt
= NULL
;
1214 const struct iphdr
*old_iph
;
1216 union vxlan_addr
*dst
;
1217 union vxlan_addr remote_ip
;
1218 struct vxlan_metadata _md
;
1219 struct vxlan_metadata
*md
= &_md
;
1220 __be16 src_port
= 0, dst_port
;
1225 u32 flags
= vxlan
->flags
;
1227 info
= skb_tunnel_info(skb
);
1230 dst_port
= rdst
->remote_port
? rdst
->remote_port
: vxlan
->cfg
.dst_port
;
1231 vni
= rdst
->remote_vni
;
1232 dst
= &rdst
->remote_ip
;
1235 WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
1239 if (family
!= ip_tunnel_info_af(info
))
1242 dst_port
= info
->key
.tp_dst
? : vxlan
->cfg
.dst_port
;
1243 vni
= be64_to_cpu(info
->key
.tun_id
);
1244 remote_ip
.sa
.sa_family
= family
;
1245 if (family
== AF_INET
)
1246 remote_ip
.sin
.sin_addr
.s_addr
= info
->key
.u
.ipv4
.dst
;
1248 remote_ip
.sin6
.sin6_addr
= info
->key
.u
.ipv6
.dst
;
1252 if (vxlan_addr_any(dst
)) {
1254 /* short-circuited back to local bridge */
1255 WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n",
1261 old_iph
= ip_hdr(skb
);
1263 ttl
= vxlan
->cfg
.ttl
;
1264 if (!ttl
&& vxlan_addr_multicast(dst
))
1267 tos
= vxlan
->cfg
.tos
;
1269 tos
= ip_tunnel_get_dsfield(old_iph
, skb
);
1271 src_port
= udp_flow_src_port(dev_net(dev
), skb
, vxlan
->cfg
.port_min
,
1272 vxlan
->cfg
.port_max
, true);
1275 if (info
->key
.tun_flags
& TUNNEL_CSUM
)
1276 flags
|= VXLAN_F_UDP_CSUM
;
1278 flags
&= ~VXLAN_F_UDP_CSUM
;
1280 ttl
= info
->key
.ttl
;
1281 tos
= info
->key
.tos
;
1283 if (info
->options_len
)
1284 md
= ip_tunnel_info_opts(info
);
1286 md
->gbp
= skb
->mark
;
1289 if (dst
->sa
.sa_family
== AF_INET
) {
1290 if (info
&& (info
->key
.tun_flags
& TUNNEL_DONT_FRAGMENT
))
1293 memset(&fl4
, 0, sizeof(fl4
));
1294 fl4
.flowi4_oif
= rdst
? rdst
->remote_ifindex
: 0;
1295 fl4
.flowi4_tos
= RT_TOS(tos
);
1296 fl4
.flowi4_mark
= skb
->mark
;
1297 fl4
.flowi4_proto
= IPPROTO_UDP
;
1298 fl4
.daddr
= dst
->sin
.sin_addr
.s_addr
;
1299 fl4
.saddr
= vxlan
->cfg
.saddr
.sin
.sin_addr
.s_addr
;
1301 rt
= ip_route_output_key(vxlan
->net
, &fl4
);
1303 netdev_dbg(dev
, "no route to %pI4\n",
1304 &dst
->sin
.sin_addr
.s_addr
);
1305 dev
->stats
.tx_carrier_errors
++;
1309 if (rt_dst(rt
).dev
== dev
) {
1310 netdev_dbg(dev
, "circular route to %pI4\n",
1311 &dst
->sin
.sin_addr
.s_addr
);
1312 dev
->stats
.collisions
++;
1316 /* Bypass encapsulation if the destination is local */
1317 if (rt
->rt_flags
& RTCF_LOCAL
&&
1318 !(rt
->rt_flags
& (RTCF_BROADCAST
| RTCF_MULTICAST
))) {
1319 struct vxlan_dev
*dst_vxlan
;
1322 dst_vxlan
= vxlan_find_vni(vxlan
->net
, vni
,
1323 dst
->sa
.sa_family
, dst_port
,
1327 WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n",
1332 tos
= ip_tunnel_ecn_encap(tos
, old_iph
, skb
);
1333 ttl
= ttl
? : ip4_dst_hoplimit(&rt_dst(rt
));
1334 err
= vxlan_xmit_skb(rt
, sk
, skb
, fl4
.saddr
,
1335 dst
->sin
.sin_addr
.s_addr
, tos
, ttl
, df
,
1336 src_port
, dst_port
, htonl(vni
<< 8), md
,
1337 !net_eq(vxlan
->net
, dev_net(vxlan
->dev
)),
1340 /* skb is already freed. */
1345 iptunnel_xmit_stats(err
, &dev
->stats
, (struct pcpu_sw_netstats __percpu
*)dev
->tstats
);
1346 #if IS_ENABLED(CONFIG_IPV6)
1348 struct dst_entry
*ndst
;
1352 memset(&fl6
, 0, sizeof(fl6
));
1353 fl6
.flowi6_oif
= rdst
? rdst
->remote_ifindex
: 0;
1354 fl6
.daddr
= dst
->sin6
.sin6_addr
;
1355 fl6
.saddr
= vxlan
->cfg
.saddr
.sin6
.sin6_addr
;
1356 fl6
.flowi6_mark
= skb
->mark
;
1357 fl6
.flowi6_proto
= IPPROTO_UDP
;
1359 #ifdef HAVE_IPV6_DST_LOOKUP_NET
1360 if (ipv6_stub
->ipv6_dst_lookup(vxlan
->net
, sk
, &ndst
, &fl6
)) {
1362 #ifdef HAVE_IPV6_STUB
1363 if (ipv6_stub
->ipv6_dst_lookup(sk
, &ndst
, &fl6
)) {
1365 ndst
= ip6_route_output(vxlan
->net
, sk
, &fl6
);
1369 netdev_dbg(dev
, "no route to %pI6\n",
1370 &dst
->sin6
.sin6_addr
);
1371 dev
->stats
.tx_carrier_errors
++;
1375 if (ndst
->dev
== dev
) {
1376 netdev_dbg(dev
, "circular route to %pI6\n",
1377 &dst
->sin6
.sin6_addr
);
1379 dev
->stats
.collisions
++;
1383 /* Bypass encapsulation if the destination is local */
1384 rt6i_flags
= ((struct rt6_info
*)ndst
)->rt6i_flags
;
1385 if (rt6i_flags
& RTF_LOCAL
&&
1386 !(rt6i_flags
& (RTCF_BROADCAST
| RTCF_MULTICAST
))) {
1387 struct vxlan_dev
*dst_vxlan
;
1390 dst_vxlan
= vxlan_find_vni(vxlan
->net
, vni
,
1391 dst
->sa
.sa_family
, dst_port
,
1395 WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n",
1400 ttl
= ttl
? : ip6_dst_hoplimit(ndst
);
1401 err
= vxlan6_xmit_skb(ndst
, sk
, skb
, dev
, &fl6
.saddr
, &fl6
.daddr
,
1402 0, ttl
, src_port
, dst_port
, htonl(vni
<< 8), md
,
1403 !net_eq(vxlan
->net
, dev_net(vxlan
->dev
)),
1411 dev
->stats
.tx_dropped
++;
1417 dev
->stats
.tx_errors
++;
1422 /* Transmit local packets over Vxlan
1424 * Outer IP header inherits ECN and DF from inner header.
1425 * Outer UDP destination is the VXLAN assigned port.
1426 * source port is based on hash of flow
1428 netdev_tx_t
rpl_vxlan_xmit(struct sk_buff
*skb
)
1430 struct net_device
*dev
= skb
->dev
;
1431 struct vxlan_dev
*vxlan
= netdev_priv(dev
);
1432 const struct ip_tunnel_info
*info
;
1434 info
= skb_tunnel_info(skb
);
1436 skb_reset_mac_header(skb
);
1438 if ((vxlan
->flags
& VXLAN_F_PROXY
))
1441 if (vxlan
->flags
& VXLAN_F_COLLECT_METADATA
&&
1442 info
&& info
->mode
& IP_TUNNEL_INFO_TX
) {
1443 vxlan_xmit_one(skb
, dev
, NULL
, false);
1444 return NETDEV_TX_OK
;
1447 pr_warn("vxlan: unsupported flag set %x", vxlan
->flags
);
1449 return NETDEV_TX_OK
;
1451 EXPORT_SYMBOL(rpl_vxlan_xmit
);
/* Walk the forwarding table and purge stale entries */
static void vxlan_cleanup(unsigned long arg)
{
	struct vxlan_dev *vxlan = (struct vxlan_dev *) arg;
	unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
	unsigned int h;

	if (!netif_running(vxlan->dev))
		return;

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct hlist_node *p, *n;

		spin_lock_bh(&vxlan->hash_lock);
		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
			struct vxlan_fdb *f
				= container_of(p, struct vxlan_fdb, hlist);
			unsigned long timeout;

			if (f->state & NUD_PERMANENT)
				continue;

			timeout = f->used + vxlan->cfg.age_interval * HZ;
			if (time_before_eq(timeout, jiffies)) {
				netdev_dbg(vxlan->dev,
					   "garbage collect %pM\n",
					   f->eth_addr);
				f->state = NUD_STALE;
				vxlan_fdb_destroy(vxlan, f);
			} else if (time_before(timeout, next_timer))
				next_timer = timeout;
		}
		spin_unlock_bh(&vxlan->hash_lock);
	}

	mod_timer(&vxlan->age_timer, next_timer);
}
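/* Ageing: each pass over the table destroys dynamic entries whose last use is
 * older than cfg.age_interval seconds (FDB_AGE_DEFAULT = 5 min) and re-arms
 * the timer for the earliest remaining expiry, capped at FDB_AGE_INTERVAL
 * (10 s) from now.  NUD_PERMANENT entries are never aged out.
 */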
static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan)
{
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
	__u32 vni = vxlan->default_dst.remote_vni;

	vxlan->vn_sock = vs;
	spin_lock(&vn->sock_lock);
	hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni));
	spin_unlock(&vn->sock_lock);
}
/* Setup stats when device is created */
#ifdef HAVE_DEV_TSTATS
static int vxlan_init(struct net_device *dev)
{
	dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}
#endif

static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan)
{
	struct vxlan_fdb *f;

	spin_lock_bh(&vxlan->hash_lock);
	f = __vxlan_find_mac(vxlan, all_zeros_mac);
	if (f)
		vxlan_fdb_destroy(vxlan, f);
	spin_unlock_bh(&vxlan->hash_lock);
}

#ifdef HAVE_DEV_TSTATS
static void vxlan_uninit(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

	vxlan_fdb_delete_default(vxlan);

	free_percpu(dev->tstats);
}
#endif
/* Start ageing timer and join group when device is brought up */
static int vxlan_open(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_sock *vs;
	int ret = 0;

	vs = vxlan_sock_add(vxlan->net, vxlan->cfg.dst_port,
			    vxlan->cfg.no_share, vxlan->flags);
	if (IS_ERR(vs))
		return PTR_ERR(vs);

	vxlan_vs_add_dev(vs, vxlan);

	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) {
		ret = vxlan_igmp_join(vxlan);
		if (ret == -EADDRINUSE)
			ret = 0;
		if (ret) {
			vxlan_sock_release(vs);
			return ret;
		}
	}

	if (vxlan->cfg.age_interval)
		mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);

	return 0;
}
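/* vxlan_sock_add() either takes a reference on an existing socket bound to
 * the same port/family (unless no_share is set) or creates a new one, so
 * many VXLAN devices can share one UDP socket; vxlan_stop() drops the
 * reference again via vxlan_sock_release().
 */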
/* Purge the forwarding table */
static void vxlan_flush(struct vxlan_dev *vxlan)
{
	unsigned int h;

	spin_lock_bh(&vxlan->hash_lock);
	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct hlist_node *p, *n;

		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
			struct vxlan_fdb *f
				= container_of(p, struct vxlan_fdb, hlist);
			/* the all_zeros_mac entry is deleted at vxlan_uninit */
			if (!is_zero_ether_addr(f->eth_addr))
				vxlan_fdb_destroy(vxlan, f);
		}
	}
	spin_unlock_bh(&vxlan->hash_lock);
}
/* Cleanup timer and forwarding table on shutdown */
static int vxlan_stop(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
	struct vxlan_sock *vs = vxlan->vn_sock;
	int ret = 0;

	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
	    !vxlan_group_used(vn, vxlan))
		ret = vxlan_igmp_leave(vxlan);

	del_timer_sync(&vxlan->age_timer);

	vxlan_flush(vxlan);
	vxlan_sock_release(vs);

	return ret;
}
/* Stub, nothing needs to be done. */
static void vxlan_set_multicast_list(struct net_device *dev)
{
}
static int __vxlan_change_mtu(struct net_device *dev,
			      struct net_device *lowerdev,
			      struct vxlan_rdst *dst, int new_mtu, bool strict)
{
	int max_mtu = IP_MAX_MTU;

	if (lowerdev)
		max_mtu = lowerdev->mtu;

	if (dst->remote_ip.sa.sa_family == AF_INET6)
		max_mtu -= VXLAN6_HEADROOM;
	else
		max_mtu -= VXLAN_HEADROOM;

	if (new_mtu < 68)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}

static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	struct net_device *lowerdev = __dev_get_by_index(vxlan->net,
							 dst->remote_ifindex);
	return __vxlan_change_mtu(dev, lowerdev, dst, new_mtu, true);
}
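/* The usable MTU is the lower device's MTU minus the encapsulation overhead:
 * VXLAN_HEADROOM for an IPv4 outer header (outer Ethernet + IPv4 + UDP +
 * VXLAN) and the larger VXLAN6_HEADROOM for IPv6 outer headers.  The same
 * constants size needed_headroom in vxlan_dev_configure().
 */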
static netdev_tx_t vxlan_dev_xmit(struct sk_buff *skb, struct net_device *dev)
{
	/* Drop all packets coming from networking stack. OVS-CB is
	 * not initialized for these packets.
	 */
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}
static const struct net_device_ops vxlan_netdev_ops = {
#ifdef HAVE_DEV_TSTATS
	.ndo_init		= vxlan_init,
	.ndo_uninit		= vxlan_uninit,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
#endif
	.ndo_open		= vxlan_open,
	.ndo_stop		= vxlan_stop,
	.ndo_start_xmit		= vxlan_dev_xmit,
	.ndo_set_rx_mode	= vxlan_set_multicast_list,
	.ndo_change_mtu		= vxlan_change_mtu,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_set_mac_address	= eth_mac_addr,
};
/* Info for udev, that this is a virtual tunnel endpoint */
static struct device_type vxlan_type = {
	.name = "vxlan",
};
/* Initialize the device structure. */
static void vxlan_setup(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	unsigned int h;

	eth_hw_addr_random(dev);
	ether_setup(dev);

	dev->netdev_ops = &vxlan_netdev_ops;
	dev->destructor = free_netdev;
	SET_NETDEV_DEVTYPE(dev, &vxlan_type);

	dev->features	|= NETIF_F_LLTX;
	dev->features	|= NETIF_F_SG | NETIF_F_HW_CSUM;
	dev->features	|= NETIF_F_RXCSUM;
	dev->features	|= NETIF_F_GSO_SOFTWARE;

	dev->vlan_features = dev->features;
	dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
	dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
	dev->hw_features |= NETIF_F_GSO_SOFTWARE;
	dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
#endif

	netif_keep_dst(dev);
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;

	INIT_LIST_HEAD(&vxlan->next);
	spin_lock_init(&vxlan->hash_lock);

	init_timer_deferrable(&vxlan->age_timer);
	vxlan->age_timer.function = vxlan_cleanup;
	vxlan->age_timer.data = (unsigned long) vxlan;

	vxlan->cfg.dst_port = htons(vxlan_port);

	vxlan->dev = dev;

	for (h = 0; h < FDB_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
}
static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
	[IFLA_VXLAN_PORT]	= { .type = NLA_U16 },
};
static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
			pr_debug("invalid link address (not ethernet)\n");
			return -EINVAL;
		}

		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
			pr_debug("invalid all zero ethernet address\n");
			return -EADDRNOTAVAIL;
		}
	}

	if (!data)
		return -EINVAL;

	if (data[IFLA_VXLAN_ID]) {
		__u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
		if (id >= VXLAN_VID_MASK)
			return -ERANGE;
	}

	if (data[IFLA_VXLAN_PORT_RANGE]) {
		const struct ifla_vxlan_port_range *p
			= nla_data(data[IFLA_VXLAN_PORT_RANGE]);

		if (ntohs(p->high) < ntohs(p->low)) {
			pr_debug("port range %u .. %u not valid\n",
				 ntohs(p->low), ntohs(p->high));
			return -EINVAL;
		}
	}

	return 0;
}
static void vxlan_get_drvinfo(struct net_device *netdev,
			      struct ethtool_drvinfo *drvinfo)
{
	strlcpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version));
	strlcpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver));
}

static const struct ethtool_ops vxlan_ethtool_ops = {
	.get_drvinfo	= vxlan_get_drvinfo,
	.get_link	= ethtool_op_get_link,
};
static void free_vs_rcu(struct rcu_head *rcu)
{
	struct vxlan_sock *vs = container_of(rcu, struct vxlan_sock, rcu);

	kfree(vs);
}

static void vxlan_del_work(struct work_struct *work)
{
	struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work);

	udp_tunnel_sock_release(vs->sock);
	call_rcu(&vs->rcu, free_vs_rcu);
}
static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
					__be16 port, u32 flags)
{
	struct socket *sock;
	struct udp_port_cfg udp_conf;
	int err;

	memset(&udp_conf, 0, sizeof(udp_conf));

	if (ipv6) {
		udp_conf.family = AF_INET6;
		udp_conf.use_udp6_rx_checksums =
		    !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
		udp_conf.ipv6_v6only = 1;
	} else {
		udp_conf.family = AF_INET;
	}

	udp_conf.local_udp_port = port;

	/* Open UDP socket */
	err = udp_sock_create(net, &udp_conf, &sock);
	if (err < 0)
		return ERR_PTR(err);

	return sock;
}
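/* The kernel's udp_tunnel helpers do the actual socket setup: udp_sock_create()
 * binds the kernel UDP socket, and setup_udp_tunnel_sock() (called from
 * vxlan_socket_create() below) marks it as an encapsulation socket so that
 * vxlan_udp_encap_recv() is invoked for every datagram received on it.
 */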
1821 /* Create new listen socket if needed */
1822 static struct vxlan_sock
*vxlan_socket_create(struct net
*net
, __be16 port
,
1825 struct vxlan_net
*vn
= net_generic(net
, vxlan_net_id
);
1826 struct vxlan_sock
*vs
;
1827 struct socket
*sock
;
1829 bool ipv6
= !!(flags
& VXLAN_F_IPV6
);
1830 struct udp_tunnel_sock_cfg tunnel_cfg
;
1832 vs
= kzalloc(sizeof(*vs
), GFP_KERNEL
);
1834 return ERR_PTR(-ENOMEM
);
1836 for (h
= 0; h
< VNI_HASH_SIZE
; ++h
)
1837 INIT_HLIST_HEAD(&vs
->vni_list
[h
]);
1839 INIT_WORK(&vs
->del_work
, vxlan_del_work
);
1841 sock
= vxlan_create_sock(net
, ipv6
, port
, flags
);
1843 pr_info("Cannot bind port %d, err=%ld\n", ntohs(port
),
1846 return ERR_CAST(sock
);
1850 atomic_set(&vs
->refcnt
, 1);
1851 vs
->flags
= (flags
& VXLAN_F_RCV_FLAGS
);
1853 /* Initialize the vxlan udp offloads structure */
1854 #ifdef HAVE_UDP_OFFLOAD
1855 vs
->udp_offloads
.port
= port
;
1856 vs
->udp_offloads
.callbacks
.gro_receive
= vxlan_gro_receive
;
1857 vs
->udp_offloads
.callbacks
.gro_complete
= vxlan_gro_complete
;
1858 vxlan_notify_add_rx_port(vs
);
1861 spin_lock(&vn
->sock_lock
);
1862 hlist_add_head_rcu(&vs
->hlist
, vs_head(net
, port
));
1863 spin_unlock(&vn
->sock_lock
);
1865 /* Mark socket as an encapsulation socket. */
1866 tunnel_cfg
.sk_user_data
= vs
;
1867 tunnel_cfg
.encap_type
= 1;
1868 tunnel_cfg
.encap_rcv
= vxlan_udp_encap_recv
;
1869 tunnel_cfg
.encap_destroy
= NULL
;
1871 setup_udp_tunnel_sock(net
, sock
, &tunnel_cfg
);
1876 static struct vxlan_sock
*vxlan_sock_add(struct net
*net
, __be16 port
,
1877 bool no_share
, u32 flags
)
1879 struct vxlan_net
*vn
= net_generic(net
, vxlan_net_id
);
1880 struct vxlan_sock
*vs
;
1881 bool ipv6
= flags
& VXLAN_F_IPV6
;
1884 spin_lock(&vn
->sock_lock
);
1885 vs
= vxlan_find_sock(net
, ipv6
? AF_INET6
: AF_INET
, port
,
1888 if (!atomic_add_unless(&vs
->refcnt
, 1, 0))
1889 vs
= ERR_PTR(-EBUSY
);
1890 spin_unlock(&vn
->sock_lock
);
1893 spin_unlock(&vn
->sock_lock
);
1896 return vxlan_socket_create(net
, port
, flags
);
1899 static int vxlan_dev_configure(struct net
*src_net
, struct net_device
*dev
,
1900 struct vxlan_config
*conf
)
1902 struct vxlan_net
*vn
= net_generic(src_net
, vxlan_net_id
);
1903 struct vxlan_dev
*vxlan
= netdev_priv(dev
);
1904 struct vxlan_rdst
*dst
= &vxlan
->default_dst
;
1906 bool use_ipv6
= false;
1907 __be16 default_port
= vxlan
->cfg
.dst_port
;
1908 struct net_device
*lowerdev
= NULL
;
1910 vxlan
->net
= src_net
;
1912 dst
->remote_vni
= conf
->vni
;
1914 memcpy(&dst
->remote_ip
, &conf
->remote_ip
, sizeof(conf
->remote_ip
));
1916 /* Unless IPv6 is explicitly requested, assume IPv4 */
1917 if (!dst
->remote_ip
.sa
.sa_family
)
1918 dst
->remote_ip
.sa
.sa_family
= AF_INET
;
1920 if (dst
->remote_ip
.sa
.sa_family
== AF_INET6
||
1921 vxlan
->cfg
.saddr
.sa
.sa_family
== AF_INET6
) {
1922 if (!IS_ENABLED(CONFIG_IPV6
))
1923 return -EPFNOSUPPORT
;
1927 if (conf
->remote_ifindex
) {
1928 lowerdev
= __dev_get_by_index(src_net
, conf
->remote_ifindex
);
1929 dst
->remote_ifindex
= conf
->remote_ifindex
;
1932 pr_info("ifindex %d does not exist\n", dst
->remote_ifindex
);
1936 #if IS_ENABLED(CONFIG_IPV6)
1938 struct inet6_dev
*idev
= __in6_dev_get(lowerdev
);
1939 if (idev
&& idev
->cnf
.disable_ipv6
) {
1940 pr_info("IPv6 is disabled via sysctl\n");
1943 vxlan
->flags
|= VXLAN_F_IPV6
;
1948 dev
->mtu
= lowerdev
->mtu
- (use_ipv6
? VXLAN6_HEADROOM
: VXLAN_HEADROOM
);
1950 dev
->needed_headroom
= lowerdev
->hard_header_len
+
1951 (use_ipv6
? VXLAN6_HEADROOM
: VXLAN_HEADROOM
);
1952 } else if (use_ipv6
) {
1953 vxlan
->flags
|= VXLAN_F_IPV6
;
1954 dev
->needed_headroom
= ETH_HLEN
+ VXLAN6_HEADROOM
;
1956 dev
->needed_headroom
= ETH_HLEN
+ VXLAN_HEADROOM
;
1960 err
= __vxlan_change_mtu(dev
, lowerdev
, dst
, conf
->mtu
, false);
1965 memcpy(&vxlan
->cfg
, conf
, sizeof(*conf
));
1966 if (!vxlan
->cfg
.dst_port
)
1967 vxlan
->cfg
.dst_port
= default_port
;
1968 vxlan
->flags
|= conf
->flags
;
1970 if (!vxlan
->cfg
.age_interval
)
1971 vxlan
->cfg
.age_interval
= FDB_AGE_DEFAULT
;
1973 if (vxlan_find_vni(src_net
, conf
->vni
, use_ipv6
? AF_INET6
: AF_INET
,
1974 vxlan
->cfg
.dst_port
, vxlan
->flags
))
1977 dev
->ethtool_ops
= &vxlan_ethtool_ops
;
1979 /* create an fdb entry for a valid default destination */
1980 if (!vxlan_addr_any(&vxlan
->default_dst
.remote_ip
)) {
1981 err
= vxlan_fdb_create(vxlan
, all_zeros_mac
,
1982 &vxlan
->default_dst
.remote_ip
,
1983 NUD_REACHABLE
|NUD_PERMANENT
,
1984 NLM_F_EXCL
|NLM_F_CREATE
,
1985 vxlan
->cfg
.dst_port
,
1986 vxlan
->default_dst
.remote_vni
,
1987 vxlan
->default_dst
.remote_ifindex
,
1993 err
= register_netdevice(dev
);
1995 vxlan_fdb_delete_default(vxlan
);
1999 list_add(&vxlan
->next
, &vn
->vxlan_list
);
struct net_device *rpl_vxlan_dev_create(struct net *net, const char *name,
					u8 name_assign_type, struct vxlan_config *conf)
{
	struct nlattr *tb[IFLA_MAX + 1];
	struct net_device *dev;
	int err;

	memset(&tb, 0, sizeof(tb));

	dev = rtnl_create_link(net, (char *)name, name_assign_type,
			       &vxlan_link_ops, tb);
	if (IS_ERR(dev))
		return dev;

	err = vxlan_dev_configure(net, dev, conf);
	if (err < 0) {
		free_netdev(dev);
		return ERR_PTR(err);
	}

	return dev;
}
EXPORT_SYMBOL_GPL(rpl_vxlan_dev_create);
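/* rpl_vxlan_dev_create() is the entry point used instead of rtnl newlink: the
 * caller fills a struct vxlan_config and gets back a registered net_device.
 * A minimal sketch of a caller (the device name and flag combination are
 * illustrative of this file's cfg usage, not a stable API):
 *
 *	struct vxlan_config conf = {
 *		.dst_port = htons(4789),
 *		.flags    = VXLAN_F_COLLECT_METADATA |
 *			    VXLAN_F_UDP_ZERO_CSUM6_RX,
 *	};
 *	struct net_device *dev;
 *
 *	dev = rpl_vxlan_dev_create(net, "vxlan_sys_4789",
 *				   NET_NAME_USER, &conf);
 *	if (IS_ERR(dev))
 *		return PTR_ERR(dev);
 *	(and then dev_open(dev) under RTNL to bring it up)
 */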
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
static int vxlan_newlink(struct net *src_net, struct net_device *dev,
			 struct nlattr *tb[], struct nlattr *data[])
#else
static int vxlan_newlink(struct net_device *dev,
			 struct nlattr *tb[], struct nlattr *data[])
#endif
{
	return -EINVAL;
}

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
static void vxlan_dellink(struct net_device *dev, struct list_head *head)
#else
static void vxlan_dellink(struct net_device *dev)
#endif
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);

	spin_lock(&vn->sock_lock);
	if (!hlist_unhashed(&vxlan->hlist))
		hlist_del_rcu(&vxlan->hlist);
	spin_unlock(&vn->sock_lock);

	list_del(&vxlan->next);
	unregister_netdevice_queue(dev, head);
}
static size_t vxlan_get_size(const struct net_device *dev)
{
	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L2MISS */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L3MISS */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_COLLECT_METADATA */
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_AGEING */
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LIMIT */
		nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
		nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */
		0;
}
static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	const struct vxlan_dev *vxlan = netdev_priv(dev);

	if (nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

#ifdef HAVE_GET_LINK_NET
static struct net *vxlan_get_link_net(const struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

	return vxlan->net;
}
#endif
static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
	.kind		= "ovs_vxlan",
	.maxtype	= IFLA_VXLAN_MAX,
	.policy		= vxlan_policy,
	.priv_size	= sizeof(struct vxlan_dev),
	.setup		= vxlan_setup,
	.validate	= vxlan_validate,
	.newlink	= vxlan_newlink,
	.dellink	= vxlan_dellink,
	.get_size	= vxlan_get_size,
	.fill_info	= vxlan_fill_info,
#ifdef HAVE_GET_LINK_NET
	.get_link_net	= vxlan_get_link_net,
#endif
};
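/* The rtnl kind is "ovs_vxlan" rather than "vxlan" so this compat module can
 * coexist with the in-tree vxlan driver.  Devices are created through
 * rpl_vxlan_dev_create() above (which still goes through rtnl_create_link()
 * with these ops), not through "ip link add".
 */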
static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
					     struct net_device *dev)
{
	struct vxlan_dev *vxlan, *next;
	LIST_HEAD(list_kill);

	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
		struct vxlan_rdst *dst = &vxlan->default_dst;

		/* In case we created vxlan device with carrier
		 * and we lose the carrier due to module unload
		 * we also need to remove vxlan device. In other
		 * cases, it's not necessary and remote_ifindex
		 * is 0 here, so no matches.
		 */
		if (dst->remote_ifindex == dev->ifindex)
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
			vxlan_dellink(vxlan->dev, &list_kill);
#else
			vxlan_dellink(vxlan->dev);
#endif
	}

	unregister_netdevice_many(&list_kill);
}

static int vxlan_lowerdev_event(struct notifier_block *unused,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);

	if (event == NETDEV_UNREGISTER)
		vxlan_handle_lowerdev_unregister(vn, dev);

	return NOTIFY_DONE;
}

static struct notifier_block vxlan_notifier_block __read_mostly = {
	.notifier_call = vxlan_lowerdev_event,
};
static __net_init int vxlan_init_net(struct net *net)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	unsigned int h;

	INIT_LIST_HEAD(&vn->vxlan_list);
	spin_lock_init(&vn->sock_lock);

	for (h = 0; h < PORT_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vn->sock_list[h]);

	return 0;
}

static void __net_exit vxlan_exit_net(struct net *net)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_dev *vxlan, *next;
	struct net_device *dev, *aux;
	LIST_HEAD(list);

	rtnl_lock();
	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == &vxlan_link_ops)
			unregister_netdevice_queue(dev, &list);

	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
		/* If vxlan->dev is in the same netns, it has already been added
		 * to the list by the previous loop.
		 */
		if (!net_eq(dev_net(vxlan->dev), net))
			unregister_netdevice_queue(vxlan->dev, &list);
	}

	unregister_netdevice_many(&list);
	rtnl_unlock();
}
static struct pernet_operations vxlan_net_ops = {
	.init = vxlan_init_net,
	.exit = vxlan_exit_net,
	.id   = &vxlan_net_id,
	.size = sizeof(struct vxlan_net),
};

DEFINE_COMPAT_PNET_REG_FUNC(device)
int rpl_vxlan_init_module(void)
{
	int rc;

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
	vxlan_wq = create_workqueue("vxlan");
#else
	vxlan_wq = alloc_workqueue("vxlan", 0, 0);
#endif
	if (!vxlan_wq)
		return -ENOMEM;

	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));

	rc = register_pernet_subsys(&vxlan_net_ops);
	if (rc)
		goto out1;

	rc = register_netdevice_notifier(&vxlan_notifier_block);
	if (rc)
		goto out2;

	rc = rtnl_link_register(&vxlan_link_ops);
	if (rc)
		goto out3;

	pr_info("VxLAN tunneling driver\n");
	return 0;
out3:
	unregister_netdevice_notifier(&vxlan_notifier_block);
out2:
	unregister_pernet_subsys(&vxlan_net_ops);
out1:
	destroy_workqueue(vxlan_wq);
	return rc;
}

void rpl_vxlan_cleanup_module(void)
{
	rtnl_link_unregister(&vxlan_link_ops);
	unregister_netdevice_notifier(&vxlan_notifier_block);
	destroy_workqueue(vxlan_wq);
	unregister_pernet_subsys(&vxlan_net_ops);
	/* rcu_barrier() is called by netns */
}