/*
 * Copyright (c) 2013,2018 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
17 #include <linux/capability.h>
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/slab.h>
22 #include <linux/uaccess.h>
23 #include <linux/skbuff.h>
24 #include <linux/netdevice.h>
26 #include <linux/tcp.h>
27 #include <linux/udp.h>
28 #include <linux/if_arp.h>
29 #include <linux/mroute.h>
30 #include <linux/init.h>
31 #include <linux/in6.h>
32 #include <linux/inetdevice.h>
33 #include <linux/igmp.h>
34 #include <linux/netfilter_ipv4.h>
35 #include <linux/etherdevice.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/rculist.h>
39 #include <linux/err.h>
44 #include <net/protocol.h>
45 #include <net/ip_tunnels.h>
47 #include <net/checksum.h>
48 #include <net/dsfield.h>
49 #include <net/inet_ecn.h>
51 #include <net/net_namespace.h>
52 #include <net/netns/generic.h>
53 #include <net/rtnetlink.h>
55 #if IS_ENABLED(CONFIG_IPV6)
57 #include <net/ip6_fib.h>
58 #include <net/ip6_route.h>
63 #ifndef USE_UPSTREAM_TUNNEL
64 const struct ip_tunnel_encap_ops __rcu
*
65 rpl_iptun_encaps
[MAX_IPTUN_ENCAP_OPS
] __read_mostly
;
67 static unsigned int rpl_ip_tunnel_hash(__be32 key
, __be32 remote
)
69 return hash_32((__force u32
)key
^ (__force u32
)remote
,
73 static bool rpl_ip_tunnel_key_match(const struct ip_tunnel_parm
*p
,
74 __be16 flags
, __be32 key
)
76 if (p
->i_flags
& TUNNEL_KEY
) {
77 if (flags
& TUNNEL_KEY
)
78 return key
== p
->i_key
;
80 /* key expected, none present */
83 return !(flags
& TUNNEL_KEY
);
86 static struct hlist_head
*ip_bucket(struct ip_tunnel_net
*itn
,
87 struct ip_tunnel_parm
*parms
)
91 __be32 i_key
= parms
->i_key
;
93 if (parms
->iph
.daddr
&& !ipv4_is_multicast(parms
->iph
.daddr
))
94 remote
= parms
->iph
.daddr
;
98 if (!(parms
->i_flags
& TUNNEL_KEY
) && (parms
->i_flags
& VTI_ISVTI
))
101 h
= rpl_ip_tunnel_hash(i_key
, remote
);
102 return &itn
->tunnels
[h
];
105 static void ip_tunnel_add(struct ip_tunnel_net
*itn
, struct ip_tunnel
*t
)
107 struct hlist_head
*head
= ip_bucket(itn
, &t
->parms
);
110 rcu_assign_pointer(itn
->collect_md_tun
, t
);
111 hlist_add_head_rcu(&t
->hash_node
, head
);
114 static void ip_tunnel_del(struct ip_tunnel_net
*itn
, struct ip_tunnel
*t
)
117 rcu_assign_pointer(itn
->collect_md_tun
, NULL
);
118 hlist_del_init_rcu(&t
->hash_node
);
121 static struct net_device
*__ip_tunnel_create(struct net
*net
,
122 const struct rtnl_link_ops
*ops
,
123 struct ip_tunnel_parm
*parms
)
126 struct ip_tunnel
*tunnel
;
127 struct net_device
*dev
;
131 strlcpy(name
, parms
->name
, IFNAMSIZ
);
133 if (strlen(ops
->kind
) > (IFNAMSIZ
- 3)) {
137 strlcpy(name
, ops
->kind
, IFNAMSIZ
);
138 strncat(name
, "%d", 2);
142 dev
= alloc_netdev(ops
->priv_size
, name
, NET_NAME_UNKNOWN
, ops
->setup
);
147 dev_net_set(dev
, net
);
149 dev
->rtnl_link_ops
= ops
;
151 tunnel
= netdev_priv(dev
);
152 tunnel
->parms
= *parms
;
155 err
= register_netdevice(dev
);
167 static inline void init_tunnel_flow(struct flowi4
*fl4
,
169 __be32 daddr
, __be32 saddr
,
170 __be32 key
, __u8 tos
, int oif
)
172 memset(fl4
, 0, sizeof(*fl4
));
173 fl4
->flowi4_oif
= oif
;
176 fl4
->flowi4_tos
= tos
;
177 fl4
->flowi4_proto
= proto
;
178 fl4
->fl4_gre_key
= key
;
181 static int ip_tunnel_bind_dev(struct net_device
*dev
)
183 struct net_device
*tdev
= NULL
;
184 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
185 const struct iphdr
*iph
;
186 int hlen
= LL_MAX_HEADER
;
187 int mtu
= ETH_DATA_LEN
;
188 int t_hlen
= tunnel
->hlen
+ sizeof(struct iphdr
);
190 iph
= &tunnel
->parms
.iph
;
192 /* Guess output device to choose reasonable mtu and needed_headroom */
197 init_tunnel_flow(&fl4
, iph
->protocol
, iph
->daddr
,
198 iph
->saddr
, tunnel
->parms
.o_key
,
199 RT_TOS(iph
->tos
), tunnel
->parms
.link
);
200 rt
= ip_route_output_key(tunnel
->net
, &fl4
);
206 if (dev
->type
!= ARPHRD_ETHER
)
207 dev
->flags
|= IFF_POINTOPOINT
;
209 dst_cache_reset(&tunnel
->dst_cache
);
212 if (!tdev
&& tunnel
->parms
.link
)
213 tdev
= __dev_get_by_index(tunnel
->net
, tunnel
->parms
.link
);
216 hlen
= tdev
->hard_header_len
+ tdev
->needed_headroom
;
220 dev
->needed_headroom
= t_hlen
+ hlen
;
221 mtu
-= (dev
->hard_header_len
+ t_hlen
);
229 int rpl___ip_tunnel_change_mtu(struct net_device
*dev
, int new_mtu
, bool strict
)
231 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
232 int t_hlen
= tunnel
->hlen
+ sizeof(struct iphdr
);
233 int max_mtu
= 0xFFF8 - dev
->hard_header_len
- t_hlen
;
238 if (new_mtu
> max_mtu
) {
249 int rpl_ip_tunnel_change_mtu(struct net_device
*dev
, int new_mtu
)
251 return rpl___ip_tunnel_change_mtu(dev
, new_mtu
, true);
254 static int rpl_tnl_update_pmtu(struct net_device
*dev
, struct sk_buff
*skb
,
255 struct rtable
*rt
, __be16 df
,
256 const struct iphdr
*inner_iph
)
258 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
259 int pkt_size
= skb
->len
- tunnel
->hlen
- dev
->hard_header_len
;
263 mtu
= dst_mtu(&rt
->dst
) - dev
->hard_header_len
264 - sizeof(struct iphdr
) - tunnel
->hlen
;
266 mtu
= skb_dst(skb
) ? dst_mtu(skb_dst(skb
)) : dev
->mtu
;
269 skb_dst(skb
)->ops
->update_pmtu(skb_dst(skb
), NULL
, skb
, mtu
);
271 if (skb
->protocol
== htons(ETH_P_IP
)) {
272 if (!skb_is_gso(skb
) &&
273 (inner_iph
->frag_off
& htons(IP_DF
)) &&
275 memset(IPCB(skb
), 0, sizeof(*IPCB(skb
)));
276 icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_FRAG_NEEDED
, htonl(mtu
));
280 #if IS_ENABLED(CONFIG_IPV6)
281 else if (skb
->protocol
== htons(ETH_P_IPV6
)) {
282 struct rt6_info
*rt6
= (struct rt6_info
*)skb_dst(skb
);
284 if (rt6
&& mtu
< dst_mtu(skb_dst(skb
)) &&
285 mtu
>= IPV6_MIN_MTU
) {
286 if ((tunnel
->parms
.iph
.daddr
&&
287 !ipv4_is_multicast(tunnel
->parms
.iph
.daddr
)) ||
288 rt6
->rt6i_dst
.plen
== 128) {
289 rt6
->rt6i_flags
|= RTF_MODIFIED
;
290 dst_metric_set(skb_dst(skb
), RTAX_MTU
, mtu
);
294 if (!skb_is_gso(skb
) && mtu
>= IPV6_MIN_MTU
&&
296 icmpv6_send(skb
, ICMPV6_PKT_TOOBIG
, 0, mtu
);
304 void rpl_ip_tunnel_xmit(struct sk_buff
*skb
, struct net_device
*dev
,
305 const struct iphdr
*tnl_params
, const u8 protocol
)
307 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
308 const struct iphdr
*inner_iph
;
312 struct rtable
*rt
; /* Route to the other host */
313 unsigned int max_headroom
; /* The extra header space needed */
317 inner_iph
= (const struct iphdr
*)skb_inner_network_header(skb
);
318 connected
= (tunnel
->parms
.iph
.daddr
!= 0);
320 dst
= tnl_params
->daddr
;
324 if (skb_dst(skb
) == NULL
) {
325 dev
->stats
.tx_fifo_errors
++;
329 if (skb
->protocol
== htons(ETH_P_IP
)) {
330 rt
= skb_rtable(skb
);
331 dst
= rt_nexthop(rt
, inner_iph
->daddr
);
333 #if IS_ENABLED(CONFIG_IPV6)
334 else if (skb
->protocol
== htons(ETH_P_IPV6
)) {
335 const struct in6_addr
*addr6
;
336 struct neighbour
*neigh
;
337 bool do_tx_error_icmp
;
340 neigh
= dst_neigh_lookup(skb_dst(skb
),
341 &ipv6_hdr(skb
)->daddr
);
345 addr6
= (const struct in6_addr
*)&neigh
->primary_key
;
346 addr_type
= ipv6_addr_type(addr6
);
348 if (addr_type
== IPV6_ADDR_ANY
) {
349 addr6
= &ipv6_hdr(skb
)->daddr
;
350 addr_type
= ipv6_addr_type(addr6
);
353 if ((addr_type
& IPV6_ADDR_COMPATv4
) == 0)
354 do_tx_error_icmp
= true;
356 do_tx_error_icmp
= false;
357 dst
= addr6
->s6_addr32
[3];
359 neigh_release(neigh
);
360 if (do_tx_error_icmp
)
370 tos
= tnl_params
->tos
;
373 if (skb
->protocol
== htons(ETH_P_IP
)) {
374 tos
= inner_iph
->tos
;
376 } else if (skb
->protocol
== htons(ETH_P_IPV6
)) {
377 tos
= ipv6_get_dsfield((const struct ipv6hdr
*)inner_iph
);
382 init_tunnel_flow(&fl4
, protocol
, dst
, tnl_params
->saddr
,
383 tunnel
->parms
.o_key
, RT_TOS(tos
), tunnel
->parms
.link
);
385 if (ovs_ip_tunnel_encap(skb
, tunnel
, &protocol
, &fl4
) < 0)
388 rt
= connected
? dst_cache_get_ip4(&tunnel
->dst_cache
, &fl4
.saddr
) :
392 rt
= ip_route_output_key(tunnel
->net
, &fl4
);
395 dev
->stats
.tx_carrier_errors
++;
399 dst_cache_set_ip4(&tunnel
->dst_cache
, &rt
->dst
,
403 if (rt
->dst
.dev
== dev
) {
405 dev
->stats
.collisions
++;
409 if (rpl_tnl_update_pmtu(dev
, skb
, rt
,
410 tnl_params
->frag_off
, inner_iph
)) {
415 if (tunnel
->err_count
> 0) {
416 if (time_before(jiffies
,
417 tunnel
->err_time
+ IPTUNNEL_ERR_TIMEO
)) {
420 memset(IPCB(skb
), 0, sizeof(*IPCB(skb
)));
421 dst_link_failure(skb
);
423 tunnel
->err_count
= 0;
426 tos
= ip_tunnel_ecn_encap(tos
, inner_iph
, skb
);
427 ttl
= tnl_params
->ttl
;
429 if (skb
->protocol
== htons(ETH_P_IP
))
430 ttl
= inner_iph
->ttl
;
431 #if IS_ENABLED(CONFIG_IPV6)
432 else if (skb
->protocol
== htons(ETH_P_IPV6
))
433 ttl
= ((const struct ipv6hdr
*)inner_iph
)->hop_limit
;
436 ttl
= ip4_dst_hoplimit(&rt
->dst
);
439 df
= tnl_params
->frag_off
;
440 if (skb
->protocol
== htons(ETH_P_IP
))
441 df
|= (inner_iph
->frag_off
&htons(IP_DF
));
443 max_headroom
= LL_RESERVED_SPACE(rt
->dst
.dev
) + sizeof(struct iphdr
)
444 + rt
->dst
.header_len
;
445 if (max_headroom
> dev
->needed_headroom
)
446 dev
->needed_headroom
= max_headroom
;
448 if (skb_cow_head(skb
, dev
->needed_headroom
)) {
450 dev
->stats
.tx_dropped
++;
455 iptunnel_xmit(skb
->sk
, rt
, skb
, fl4
.saddr
, fl4
.daddr
, protocol
,
456 tos
, ttl
, df
, !net_eq(tunnel
->net
, dev_net(dev
)));
460 #if IS_ENABLED(CONFIG_IPV6)
462 dst_link_failure(skb
);
465 dev
->stats
.tx_errors
++;
468 EXPORT_SYMBOL_GPL(rpl_ip_tunnel_xmit
);
470 static void ip_tunnel_dev_free(struct net_device
*dev
)
472 free_percpu(dev
->tstats
);
476 void rpl_ip_tunnel_dellink(struct net_device
*dev
, struct list_head
*head
)
478 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
479 struct ip_tunnel_net
*itn
;
481 itn
= net_generic(tunnel
->net
, tunnel
->ip_tnl_net_id
);
483 ip_tunnel_del(itn
, netdev_priv(dev
));
484 unregister_netdevice_queue(dev
, head
);
487 int rpl_ip_tunnel_init_net(struct net
*net
, int ip_tnl_net_id
,
488 struct rtnl_link_ops
*ops
, char *devname
)
490 struct ip_tunnel_net
*itn
= net_generic(net
, ip_tnl_net_id
);
491 struct ip_tunnel_parm parms
;
494 for (i
= 0; i
< IP_TNL_HASH_SIZE
; i
++)
495 INIT_HLIST_HEAD(&itn
->tunnels
[i
]);
498 itn
->fb_tunnel_dev
= NULL
;
502 memset(&parms
, 0, sizeof(parms
));
504 strlcpy(parms
.name
, devname
, IFNAMSIZ
);
507 itn
->fb_tunnel_dev
= __ip_tunnel_create(net
, ops
, &parms
);
508 /* FB netdevice is special: we have one, and only one per netns.
509 * * Allowing to move it to another netns is clearly unsafe.
511 if (!IS_ERR(itn
->fb_tunnel_dev
)) {
512 itn
->fb_tunnel_dev
->features
|= NETIF_F_NETNS_LOCAL
;
513 itn
->fb_tunnel_dev
->mtu
= ip_tunnel_bind_dev(itn
->fb_tunnel_dev
);
514 ip_tunnel_add(itn
, netdev_priv(itn
->fb_tunnel_dev
));
518 return PTR_ERR_OR_ZERO(itn
->fb_tunnel_dev
);
521 static void ip_tunnel_destroy(struct ip_tunnel_net
*itn
, struct list_head
*head
,
522 struct rtnl_link_ops
*ops
)
524 struct net
*net
= dev_net(itn
->fb_tunnel_dev
);
525 struct net_device
*dev
, *aux
;
528 for_each_netdev_safe(net
, dev
, aux
)
529 if (dev
->rtnl_link_ops
== ops
)
530 unregister_netdevice_queue(dev
, head
);
532 for (h
= 0; h
< IP_TNL_HASH_SIZE
; h
++) {
534 struct hlist_node
*n
;
535 struct hlist_head
*thead
= &itn
->tunnels
[h
];
537 hlist_for_each_entry_safe(t
, n
, thead
, hash_node
)
538 /* If dev is in the same netns, it has already
539 * been added to the list by the previous loop.
541 if (!net_eq(dev_net(t
->dev
), net
))
542 unregister_netdevice_queue(t
->dev
, head
);
546 void rpl_ip_tunnel_delete_net(struct ip_tunnel_net
*itn
,
547 struct rtnl_link_ops
*ops
)
552 ip_tunnel_destroy(itn
, &list
, ops
);
553 unregister_netdevice_many(&list
);
557 int rpl_ip_tunnel_newlink(struct net_device
*dev
, struct nlattr
*tb
[],
558 struct ip_tunnel_parm
*p
)
560 struct ip_tunnel
*nt
;
561 struct net
*net
= dev_net(dev
);
562 struct ip_tunnel_net
*itn
;
566 nt
= netdev_priv(dev
);
567 itn
= net_generic(net
, nt
->ip_tnl_net_id
);
569 if (nt
->collect_md
) {
570 if (rtnl_dereference(itn
->collect_md_tun
))
578 err
= register_netdevice(dev
);
582 if (dev
->type
== ARPHRD_ETHER
&& !tb
[IFLA_ADDRESS
])
583 eth_hw_addr_random(dev
);
585 mtu
= ip_tunnel_bind_dev(dev
);
589 ip_tunnel_add(itn
, nt
);
594 int rpl_ip_tunnel_init(struct net_device
*dev
)
596 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
597 struct iphdr
*iph
= &tunnel
->parms
.iph
;
600 #ifndef HAVE_NEEDS_FREE_NETDEV
601 dev
->destructor
= ip_tunnel_dev_free
;
603 dev
->needs_free_netdev
= true;
604 dev
->priv_destructor
= ip_tunnel_dev_free
;
606 dev
->tstats
= netdev_alloc_pcpu_stats(struct pcpu_sw_netstats
);
610 err
= dst_cache_init(&tunnel
->dst_cache
, GFP_KERNEL
);
612 free_percpu(dev
->tstats
);
616 err
= gro_cells_init(&tunnel
->gro_cells
, dev
);
618 dst_cache_destroy(&tunnel
->dst_cache
);
619 free_percpu(dev
->tstats
);
624 tunnel
->net
= dev_net(dev
);
625 strcpy(tunnel
->parms
.name
, dev
->name
);
629 if (tunnel
->collect_md
) {
630 dev
->features
|= NETIF_F_NETNS_LOCAL
;
636 void rpl_ip_tunnel_uninit(struct net_device
*dev
)
638 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
639 struct net
*net
= tunnel
->net
;
640 struct ip_tunnel_net
*itn
;
642 itn
= net_generic(net
, tunnel
->ip_tnl_net_id
);
643 ip_tunnel_del(itn
, netdev_priv(dev
));
646 /* Do least required initialization, rest of init is done in tunnel_init call */
647 void rpl_ip_tunnel_setup(struct net_device
*dev
, int net_id
)
649 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
651 tunnel
->ip_tnl_net_id
= net_id
;
654 int rpl_ip_tunnel_get_iflink(const struct net_device
*dev
)
656 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
658 return tunnel
->parms
.link
;
661 struct net
*rpl_ip_tunnel_get_link_net(const struct net_device
*dev
)
663 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
668 struct ip_tunnel
*rpl_ip_tunnel_lookup(struct ip_tunnel_net
*itn
,
669 int link
, __be16 flags
,
670 __be32 remote
, __be32 local
,
674 struct ip_tunnel
*t
, *cand
= NULL
;
675 struct hlist_head
*head
;
677 hash
= rpl_ip_tunnel_hash(key
, remote
);
678 head
= &itn
->tunnels
[hash
];
680 hlist_for_each_entry_rcu(t
, head
, hash_node
) {
681 if (local
!= t
->parms
.iph
.saddr
||
682 remote
!= t
->parms
.iph
.daddr
||
683 !(t
->dev
->flags
& IFF_UP
))
686 if (!rpl_ip_tunnel_key_match(&t
->parms
, flags
, key
))
689 if (t
->parms
.link
== link
)
695 hlist_for_each_entry_rcu(t
, head
, hash_node
) {
696 if (remote
!= t
->parms
.iph
.daddr
||
697 t
->parms
.iph
.saddr
!= 0 ||
698 !(t
->dev
->flags
& IFF_UP
))
701 if (!rpl_ip_tunnel_key_match(&t
->parms
, flags
, key
))
704 if (t
->parms
.link
== link
)
710 hash
= rpl_ip_tunnel_hash(key
, 0);
711 head
= &itn
->tunnels
[hash
];
713 hlist_for_each_entry_rcu(t
, head
, hash_node
) {
714 if ((local
!= t
->parms
.iph
.saddr
|| t
->parms
.iph
.daddr
!= 0) &&
715 (local
!= t
->parms
.iph
.daddr
|| !ipv4_is_multicast(local
)))
718 if (!(t
->dev
->flags
& IFF_UP
))
721 if (!rpl_ip_tunnel_key_match(&t
->parms
, flags
, key
))
724 if (t
->parms
.link
== link
)
730 if (flags
& TUNNEL_NO_KEY
)
731 goto skip_key_lookup
;
733 hlist_for_each_entry_rcu(t
, head
, hash_node
) {
734 if (t
->parms
.i_key
!= key
||
735 t
->parms
.iph
.saddr
!= 0 ||
736 t
->parms
.iph
.daddr
!= 0 ||
737 !(t
->dev
->flags
& IFF_UP
))
740 if (t
->parms
.link
== link
)
750 if (itn
->fb_tunnel_dev
&& itn
->fb_tunnel_dev
->flags
& IFF_UP
)
751 return netdev_priv(itn
->fb_tunnel_dev
);
756 EXPORT_SYMBOL_GPL(rpl_ip_tunnel_lookup
);