2 * Copyright (c) 2010 Nicira Networks.
3 * Distributed under the terms of the GNU GPL version 2.
5 * Significant portions of this file may be copied from parts of the Linux
6 * kernel, by Linus Torvalds and others.
9 #include <linux/if_arp.h>
10 #include <linux/if_ether.h>
12 #include <linux/if_tunnel.h>
13 #include <linux/if_vlan.h>
15 #include <linux/in_route.h>
16 #include <linux/jhash.h>
17 #include <linux/kernel.h>
18 #include <linux/version.h>
20 #include <net/dsfield.h>
23 #include <net/inet_ecn.h>
25 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
28 #include <net/protocol.h>
29 #include <net/route.h>
34 #include "openvswitch/gre.h"
37 #include "vport-generic.h"
39 /* The absolute minimum fragment size. Note that there are many other
40 * definitions of the minimum MTU. */
/*
 * A GRE header is laid out as a sequence of sections: one base section
 * followed by a variable number of option sections.  This is the length, in
 * bytes, that the header-length computations add for each section.
 */
#define GRE_HEADER_SECTION 4
47 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
48 #define rt_dst(rt) (rt->dst)
50 #define rt_dst(rt) (rt->u.dst)
58 struct mutable_config
{
61 unsigned char eth_addr
[ETH_ALEN
];
63 struct gre_port_config port_config
;
65 int tunnel_hlen
; /* Tunnel header length. */
70 struct tbl_node tbl_node
;
74 /* Protected by RCU. */
75 struct mutable_config
*mutable;
/* Table of GRE ports.  The pointer itself is protected by RCU. */
static struct tbl *port_table;

/*
 * Number of ports in each lookup category (see find_port_pool()).  They exist
 * purely as an optimization and need no synchronization of any kind: a reader
 * that races with a port change could just as easily have read the value
 * before the change happened.
 */
static unsigned int key_local_remote_ports;
static unsigned int key_remote_ports;
static unsigned int local_remote_ports;
static unsigned int remote_ports;
/* Converts a generic vport into its GRE state, as handed out by vport_priv(). */
static inline struct gre_vport *gre_vport_priv(const struct vport *vport)
{
	struct gre_vport *priv = vport_priv(vport);

	return priv;
}
/* Inverse of gre_vport_priv(): maps GRE state back to its vport via
 * vport_from_priv(). */
static inline struct vport *gre_vport_to_vport(const struct gre_vport *gre_vport)
{
	struct vport *owner = vport_from_priv(gre_vport);

	return owner;
}
99 static inline struct gre_vport
*gre_vport_table_cast(const struct tbl_node
*node
)
101 return container_of(node
, struct gre_vport
, tbl_node
);
105 static void free_config(struct rcu_head
*rcu
)
107 struct mutable_config
*c
= container_of(rcu
, struct mutable_config
, rcu
);
111 static void assign_config_rcu(struct vport
*vport
,
112 struct mutable_config
*new_config
)
114 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
115 struct mutable_config
*old_config
;
117 old_config
= rcu_dereference(gre_vport
->mutable);
118 rcu_assign_pointer(gre_vport
->mutable, new_config
);
119 call_rcu(&old_config
->rcu
, free_config
);
122 static unsigned int *find_port_pool(const struct mutable_config
*mutable)
124 if (mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
) {
125 if (mutable->port_config
.saddr
)
126 return &local_remote_ports
;
128 return &remote_ports
;
130 if (mutable->port_config
.saddr
)
131 return &key_local_remote_ports
;
133 return &key_remote_ports
;
144 struct port_lookup_key
{
145 u32 vals
[4]; /* Contains enum lookup_key keys. */
146 const struct mutable_config
*mutable;
149 /* Modifies 'target' to store the rcu_dereferenced pointer that was used to do
150 * the comparision. */
151 static int port_cmp(const struct tbl_node
*node
, void *target
)
153 const struct gre_vport
*gre_vport
= gre_vport_table_cast(node
);
154 struct port_lookup_key
*lookup
= target
;
156 lookup
->mutable = rcu_dereference(gre_vport
->mutable);
158 return ((lookup
->mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
) ==
159 lookup
->vals
[LOOKUP_KEY_MATCH
]) &&
160 lookup
->mutable->port_config
.daddr
== lookup
->vals
[LOOKUP_DADDR
] &&
161 lookup
->mutable->port_config
.in_key
== lookup
->vals
[LOOKUP_KEY
] &&
162 lookup
->mutable->port_config
.saddr
== lookup
->vals
[LOOKUP_SADDR
];
165 static u32
port_hash(struct port_lookup_key
*lookup
)
167 return jhash2(lookup
->vals
, ARRAY_SIZE(lookup
->vals
), 0);
170 static int add_port(struct vport
*vport
)
172 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
173 struct port_lookup_key lookup
;
177 struct tbl
*new_table
;
179 new_table
= tbl_create(0);
183 rcu_assign_pointer(port_table
, new_table
);
185 } else if (tbl_count(port_table
) > tbl_n_buckets(port_table
)) {
186 struct tbl
*old_table
= port_table
;
187 struct tbl
*new_table
;
189 new_table
= tbl_expand(old_table
);
190 if (IS_ERR(new_table
))
191 return PTR_ERR(new_table
);
193 rcu_assign_pointer(port_table
, new_table
);
194 tbl_deferred_destroy(old_table
, NULL
);
197 lookup
.vals
[LOOKUP_SADDR
] = gre_vport
->mutable->port_config
.saddr
;
198 lookup
.vals
[LOOKUP_DADDR
] = gre_vport
->mutable->port_config
.daddr
;
199 lookup
.vals
[LOOKUP_KEY
] = gre_vport
->mutable->port_config
.in_key
;
200 lookup
.vals
[LOOKUP_KEY_MATCH
] = gre_vport
->mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
;
202 err
= tbl_insert(port_table
, &gre_vport
->tbl_node
, port_hash(&lookup
));
206 (*find_port_pool(gre_vport
->mutable))++;
211 static int del_port(struct vport
*vport
)
213 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
216 err
= tbl_remove(port_table
, &gre_vport
->tbl_node
);
220 (*find_port_pool(gre_vport
->mutable))--;
/* Bit flags selecting which kinds of port find_port() searches for. */
#define FIND_PORT_KEY	(1 << 0)	/* Look up using the supplied key. */
#define FIND_PORT_MATCH	(1 << 1)	/* Look up GRE_F_IN_KEY_MATCH ports. */
#define FIND_PORT_ANY	(FIND_PORT_KEY | FIND_PORT_MATCH)
229 static struct vport
*find_port(__be32 saddr
, __be32 daddr
, __be32 key
,
231 const struct mutable_config
**mutable)
233 struct port_lookup_key lookup
;
234 struct tbl
*table
= rcu_dereference(port_table
);
235 struct tbl_node
*tbl_node
;
240 lookup
.vals
[LOOKUP_SADDR
] = saddr
;
241 lookup
.vals
[LOOKUP_DADDR
] = daddr
;
243 if (port_type
& FIND_PORT_KEY
) {
244 lookup
.vals
[LOOKUP_KEY
] = key
;
245 lookup
.vals
[LOOKUP_KEY_MATCH
] = 0;
247 if (key_local_remote_ports
) {
248 tbl_node
= tbl_lookup(table
, &lookup
, port_hash(&lookup
), port_cmp
);
253 if (key_remote_ports
) {
254 lookup
.vals
[LOOKUP_SADDR
] = 0;
256 tbl_node
= tbl_lookup(table
, &lookup
, port_hash(&lookup
), port_cmp
);
260 lookup
.vals
[LOOKUP_SADDR
] = saddr
;
264 if (port_type
& FIND_PORT_MATCH
) {
265 lookup
.vals
[LOOKUP_KEY
] = 0;
266 lookup
.vals
[LOOKUP_KEY_MATCH
] = GRE_F_IN_KEY_MATCH
;
268 if (local_remote_ports
) {
269 tbl_node
= tbl_lookup(table
, &lookup
, port_hash(&lookup
), port_cmp
);
275 lookup
.vals
[LOOKUP_SADDR
] = 0;
277 tbl_node
= tbl_lookup(table
, &lookup
, port_hash(&lookup
), port_cmp
);
286 *mutable = lookup
.mutable;
287 return gre_vport_to_vport(gre_vport_table_cast(tbl_node
));
290 static bool check_ipv4_address(__be32 addr
)
292 if (ipv4_is_multicast(addr
) || ipv4_is_lbcast(addr
)
293 || ipv4_is_loopback(addr
) || ipv4_is_zeronet(addr
))
299 static bool ipv4_should_icmp(struct sk_buff
*skb
)
301 struct iphdr
*old_iph
= ip_hdr(skb
);
303 /* Don't respond to L2 broadcast. */
304 if (is_multicast_ether_addr(eth_hdr(skb
)->h_dest
))
307 /* Don't respond to L3 broadcast or invalid addresses. */
308 if (!check_ipv4_address(old_iph
->daddr
) ||
309 !check_ipv4_address(old_iph
->saddr
))
312 /* Only respond to the first fragment. */
313 if (old_iph
->frag_off
& htons(IP_OFFSET
))
316 /* Don't respond to ICMP error messages. */
317 if (old_iph
->protocol
== IPPROTO_ICMP
) {
318 u8 icmp_type
, *icmp_typep
;
320 icmp_typep
= skb_header_pointer(skb
, (u8
*)old_iph
+
321 (old_iph
->ihl
<< 2) +
322 offsetof(struct icmphdr
, type
) -
323 skb
->data
, sizeof(icmp_type
),
329 if (*icmp_typep
> NR_ICMP_TYPES
330 || (*icmp_typep
<= ICMP_PARAMETERPROB
331 && *icmp_typep
!= ICMP_ECHOREPLY
332 && *icmp_typep
!= ICMP_ECHO
))
339 static void ipv4_build_icmp(struct sk_buff
*skb
, struct sk_buff
*nskb
,
340 unsigned int mtu
, unsigned int payload_length
)
342 struct iphdr
*iph
, *old_iph
= ip_hdr(skb
);
343 struct icmphdr
*icmph
;
346 iph
= (struct iphdr
*)skb_put(nskb
, sizeof(struct iphdr
));
347 icmph
= (struct icmphdr
*)skb_put(nskb
, sizeof(struct icmphdr
));
348 payload
= skb_put(nskb
, payload_length
);
352 iph
->ihl
= sizeof(struct iphdr
) >> 2;
353 iph
->tos
= (old_iph
->tos
& IPTOS_TOS_MASK
) |
354 IPTOS_PREC_INTERNETCONTROL
;
355 iph
->tot_len
= htons(sizeof(struct iphdr
)
356 + sizeof(struct icmphdr
)
358 get_random_bytes(&iph
->id
, sizeof(iph
->id
));
361 iph
->protocol
= IPPROTO_ICMP
;
362 iph
->daddr
= old_iph
->saddr
;
363 iph
->saddr
= old_iph
->daddr
;
368 icmph
->type
= ICMP_DEST_UNREACH
;
369 icmph
->code
= ICMP_FRAG_NEEDED
;
370 icmph
->un
.gateway
= htonl(mtu
);
373 nskb
->csum
= csum_partial((u8
*)icmph
, sizeof(struct icmphdr
), 0);
374 nskb
->csum
= skb_copy_and_csum_bits(skb
, (u8
*)old_iph
- skb
->data
,
375 payload
, payload_length
,
377 icmph
->checksum
= csum_fold(nskb
->csum
);
380 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
381 static bool ipv6_should_icmp(struct sk_buff
*skb
)
383 struct ipv6hdr
*old_ipv6h
= ipv6_hdr(skb
);
385 int payload_off
= (u8
*)(old_ipv6h
+ 1) - skb
->data
;
386 u8 nexthdr
= ipv6_hdr(skb
)->nexthdr
;
388 /* Check source address is valid. */
389 addr_type
= ipv6_addr_type(&old_ipv6h
->saddr
);
390 if (addr_type
& IPV6_ADDR_MULTICAST
|| addr_type
== IPV6_ADDR_ANY
)
393 /* Don't reply to unspecified addresses. */
394 if (ipv6_addr_type(&old_ipv6h
->daddr
) == IPV6_ADDR_ANY
)
397 /* Don't respond to ICMP error messages. */
398 payload_off
= ipv6_skip_exthdr(skb
, payload_off
, &nexthdr
);
402 if (nexthdr
== NEXTHDR_ICMP
) {
403 u8 icmp_type
, *icmp_typep
;
405 icmp_typep
= skb_header_pointer(skb
, payload_off
+
406 offsetof(struct icmp6hdr
,
408 sizeof(icmp_type
), &icmp_type
);
410 if (!icmp_typep
|| !(*icmp_typep
& ICMPV6_INFOMSG_MASK
))
417 static void ipv6_build_icmp(struct sk_buff
*skb
, struct sk_buff
*nskb
,
418 unsigned int mtu
, unsigned int payload_length
)
420 struct ipv6hdr
*ipv6h
, *old_ipv6h
= ipv6_hdr(skb
);
421 struct icmp6hdr
*icmp6h
;
424 ipv6h
= (struct ipv6hdr
*)skb_put(nskb
, sizeof(struct ipv6hdr
));
425 icmp6h
= (struct icmp6hdr
*)skb_put(nskb
, sizeof(struct icmp6hdr
));
426 payload
= skb_put(nskb
, payload_length
);
431 memset(&ipv6h
->flow_lbl
, 0, sizeof(ipv6h
->flow_lbl
));
432 ipv6h
->payload_len
= htons(sizeof(struct icmp6hdr
)
434 ipv6h
->nexthdr
= NEXTHDR_ICMP
;
435 ipv6h
->hop_limit
= IPV6_DEFAULT_HOPLIMIT
;
436 ipv6_addr_copy(&ipv6h
->daddr
, &old_ipv6h
->saddr
);
437 ipv6_addr_copy(&ipv6h
->saddr
, &old_ipv6h
->daddr
);
440 icmp6h
->icmp6_type
= ICMPV6_PKT_TOOBIG
;
441 icmp6h
->icmp6_code
= 0;
442 icmp6h
->icmp6_cksum
= 0;
443 icmp6h
->icmp6_mtu
= htonl(mtu
);
445 nskb
->csum
= csum_partial((u8
*)icmp6h
, sizeof(struct icmp6hdr
), 0);
446 nskb
->csum
= skb_copy_and_csum_bits(skb
, (u8
*)old_ipv6h
- skb
->data
,
447 payload
, payload_length
,
449 icmp6h
->icmp6_cksum
= csum_ipv6_magic(&ipv6h
->saddr
, &ipv6h
->daddr
,
450 sizeof(struct icmp6hdr
)
452 ipv6h
->nexthdr
, nskb
->csum
);
456 static bool send_frag_needed(struct vport
*vport
,
457 const struct mutable_config
*mutable,
458 struct sk_buff
*skb
, unsigned int mtu
,
461 unsigned int eth_hdr_len
= ETH_HLEN
;
462 unsigned int total_length
= 0, header_length
= 0, payload_length
;
463 struct ethhdr
*eh
, *old_eh
= eth_hdr(skb
);
464 struct sk_buff
*nskb
;
467 if (skb
->protocol
== htons(ETH_P_IP
)) {
468 if (mtu
< IP_MIN_MTU
)
471 if (!ipv4_should_icmp(skb
))
474 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
475 else if (skb
->protocol
== htons(ETH_P_IPV6
)) {
476 if (mtu
< IPV6_MIN_MTU
)
479 /* In theory we should do PMTUD on IPv6 multicast messages but
480 * we don't have an address to send from so just fragment. */
481 if (ipv6_addr_type(&ipv6_hdr(skb
)->daddr
) & IPV6_ADDR_MULTICAST
)
484 if (!ipv6_should_icmp(skb
))
492 if (old_eh
->h_proto
== htons(ETH_P_8021Q
))
493 eth_hdr_len
= VLAN_ETH_HLEN
;
495 payload_length
= skb
->len
- eth_hdr_len
;
496 if (skb
->protocol
== htons(ETH_P_IP
)) {
497 header_length
= sizeof(struct iphdr
) + sizeof(struct icmphdr
);
498 total_length
= min_t(unsigned int, header_length
+
499 payload_length
, 576);
501 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
503 header_length
= sizeof(struct ipv6hdr
) +
504 sizeof(struct icmp6hdr
);
505 total_length
= min_t(unsigned int, header_length
+
506 payload_length
, IPV6_MIN_MTU
);
510 total_length
= min(total_length
, mutable->mtu
);
511 payload_length
= total_length
- header_length
;
513 nskb
= dev_alloc_skb(NET_IP_ALIGN
+ eth_hdr_len
+ header_length
+
518 skb_reserve(nskb
, NET_IP_ALIGN
);
520 /* Ethernet / VLAN */
521 eh
= (struct ethhdr
*)skb_put(nskb
, eth_hdr_len
);
522 memcpy(eh
->h_dest
, old_eh
->h_source
, ETH_ALEN
);
523 memcpy(eh
->h_source
, mutable->eth_addr
, ETH_ALEN
);
524 nskb
->protocol
= eh
->h_proto
= old_eh
->h_proto
;
525 if (old_eh
->h_proto
== htons(ETH_P_8021Q
)) {
526 struct vlan_ethhdr
*vh
= (struct vlan_ethhdr
*)eh
;
528 vh
->h_vlan_TCI
= vlan_eth_hdr(skb
)->h_vlan_TCI
;
529 vh
->h_vlan_encapsulated_proto
= skb
->protocol
;
531 skb_reset_mac_header(nskb
);
534 if (skb
->protocol
== htons(ETH_P_IP
))
535 ipv4_build_icmp(skb
, nskb
, mtu
, payload_length
);
536 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
538 ipv6_build_icmp(skb
, nskb
, mtu
, payload_length
);
541 /* Assume that flow based keys are symmetric with respect to input
542 * and output and use the key that we were going to put on the
543 * outgoing packet for the fake received packet. If the keys are
544 * not symmetric then PMTUD needs to be disabled since we won't have
545 * any way of synthesizing packets. */
546 if (mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
&&
547 mutable->port_config
.flags
& GRE_F_OUT_KEY_ACTION
)
548 OVS_CB(nskb
)->tun_id
= flow_key
;
550 compute_ip_summed(nskb
, false);
551 vport_receive(vport
, nskb
);
556 static struct sk_buff
*check_headroom(struct sk_buff
*skb
, int headroom
)
558 if (skb_headroom(skb
) < headroom
|| skb_header_cloned(skb
)) {
559 struct sk_buff
*nskb
= skb_realloc_headroom(skb
, headroom
+ 16);
562 return ERR_PTR(-ENOMEM
);
565 set_skb_csum_bits(skb
, nskb
);
568 skb_set_owner_w(nskb
, skb
->sk
);
577 static void create_gre_header(struct sk_buff
*skb
,
578 const struct mutable_config
*mutable)
580 struct iphdr
*iph
= ip_hdr(skb
);
581 struct gre_base_hdr
*greh
= (struct gre_base_hdr
*)(iph
+ 1);
582 __be32
*options
= (__be32
*)((u8
*)iph
+ mutable->tunnel_hlen
583 - GRE_HEADER_SECTION
);
585 greh
->protocol
= htons(ETH_P_TEB
);
588 /* Work backwards over the options so the checksum is last. */
589 if (mutable->port_config
.out_key
||
590 mutable->port_config
.flags
& GRE_F_OUT_KEY_ACTION
) {
591 greh
->flags
|= GRE_KEY
;
593 if (mutable->port_config
.flags
& GRE_F_OUT_KEY_ACTION
)
594 *options
= OVS_CB(skb
)->tun_id
;
596 *options
= mutable->port_config
.out_key
;
601 if (mutable->port_config
.flags
& GRE_F_CSUM
) {
602 greh
->flags
|= GRE_CSUM
;
605 *(__sum16
*)options
= csum_fold(skb_checksum(skb
,
606 sizeof(struct iphdr
),
607 skb
->len
- sizeof(struct iphdr
),
612 static int check_checksum(struct sk_buff
*skb
)
614 struct iphdr
*iph
= ip_hdr(skb
);
615 __be16 flags
= *(__be16
*)(iph
+ 1);
618 if (flags
& GRE_CSUM
) {
619 switch (skb
->ip_summed
) {
620 case CHECKSUM_COMPLETE
:
621 csum
= csum_fold(skb
->csum
);
629 csum
= __skb_checksum_complete(skb
);
630 skb
->ip_summed
= CHECKSUM_COMPLETE
;
638 static int parse_gre_header(struct iphdr
*iph
, __be16
*flags
, __be32
*key
)
640 /* IP and ICMP protocol handlers check that the IHL is valid. */
641 struct gre_base_hdr
*greh
= (struct gre_base_hdr
*)((u8
*)iph
+ (iph
->ihl
<< 2));
642 __be32
*options
= (__be32
*)(greh
+ 1);
645 *flags
= greh
->flags
;
647 if (greh
->flags
& (GRE_VERSION
| GRE_ROUTING
))
650 if (greh
->protocol
!= htons(ETH_P_TEB
))
653 hdr_len
= GRE_HEADER_SECTION
;
655 if (greh
->flags
& GRE_CSUM
) {
656 hdr_len
+= GRE_HEADER_SECTION
;
660 if (greh
->flags
& GRE_KEY
) {
661 hdr_len
+= GRE_HEADER_SECTION
;
668 if (greh
->flags
& GRE_SEQ
)
669 hdr_len
+= GRE_HEADER_SECTION
;
674 static inline u8
ecn_encapsulate(u8 tos
, struct sk_buff
*skb
)
678 if (skb
->protocol
== htons(ETH_P_IP
))
679 inner
= ((struct iphdr
*)skb_network_header(skb
))->tos
;
680 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
681 else if (skb
->protocol
== htons(ETH_P_IPV6
))
682 inner
= ipv6_get_dsfield((struct ipv6hdr
*)skb_network_header(skb
));
687 return INET_ECN_encapsulate(tos
, inner
);
690 static inline void ecn_decapsulate(u8 tos
, struct sk_buff
*skb
)
692 if (INET_ECN_is_ce(tos
)) {
693 __be16 protocol
= skb
->protocol
;
694 unsigned int nw_header
= skb_network_header(skb
) - skb
->data
;
696 if (skb
->protocol
== htons(ETH_P_8021Q
)) {
697 if (unlikely(!pskb_may_pull(skb
, VLAN_ETH_HLEN
)))
700 protocol
= vlan_eth_hdr(skb
)->h_vlan_encapsulated_proto
;
701 nw_header
+= VLAN_HLEN
;
704 if (protocol
== htons(ETH_P_IP
)) {
705 if (unlikely(!pskb_may_pull(skb
, nw_header
706 + sizeof(struct iphdr
))))
709 IP_ECN_set_ce((struct iphdr
*)(nw_header
+ skb
->data
));
711 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
712 else if (protocol
== htons(ETH_P_IPV6
)) {
713 if (unlikely(!pskb_may_pull(skb
, nw_header
714 + sizeof(struct ipv6hdr
))))
717 IP6_ECN_set_ce((struct ipv6hdr
*)(nw_header
724 static struct sk_buff
*handle_gso(struct sk_buff
*skb
)
726 if (skb_is_gso(skb
)) {
727 struct sk_buff
*nskb
= skb_gso_segment(skb
, 0);
736 static int handle_csum_offload(struct sk_buff
*skb
)
738 if (skb
->ip_summed
== CHECKSUM_PARTIAL
)
739 return skb_checksum_help(skb
);
741 skb
->ip_summed
= CHECKSUM_NONE
;
746 /* Called with rcu_read_lock. */
747 static void gre_err(struct sk_buff
*skb
, u32 info
)
750 const struct mutable_config
*mutable;
751 const int type
= icmp_hdr(skb
)->type
;
752 const int code
= icmp_hdr(skb
)->code
;
753 int mtu
= ntohs(icmp_hdr(skb
)->un
.frag
.mtu
);
758 int tunnel_hdr_len
, tot_hdr_len
;
759 unsigned int orig_mac_header
;
760 unsigned int orig_nw_header
;
762 if (type
!= ICMP_DEST_UNREACH
|| code
!= ICMP_FRAG_NEEDED
)
765 /* The minimum size packet that we would actually be able to process:
766 * encapsulating IP header, minimum GRE header, Ethernet header,
767 * inner IPv4 header. */
768 if (!pskb_may_pull(skb
, sizeof(struct iphdr
) + GRE_HEADER_SECTION
+
769 ETH_HLEN
+ sizeof(struct iphdr
)))
772 iph
= (struct iphdr
*)skb
->data
;
774 tunnel_hdr_len
= parse_gre_header(iph
, &flags
, &key
);
775 if (tunnel_hdr_len
< 0)
778 vport
= find_port(iph
->saddr
, iph
->daddr
, key
, FIND_PORT_ANY
, &mutable);
782 /* Packets received by this function were previously sent by us, so
783 * any comparisons should be to the output values, not the input.
784 * However, it's not really worth it to have a hash table based on
785 * output keys (especially since ICMP error handling of tunneled packets
786 * isn't that reliable anyways). Therefore, we do a lookup based on the
787 * out key as if it were the in key and then check to see if the input
788 * and output keys are the same. */
789 if (mutable->port_config
.in_key
!= mutable->port_config
.out_key
)
792 if (!!(mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
) !=
793 !!(mutable->port_config
.flags
& GRE_F_OUT_KEY_ACTION
))
796 if ((mutable->port_config
.flags
& GRE_F_CSUM
) && !(flags
& GRE_CSUM
))
799 tunnel_hdr_len
+= iph
->ihl
<< 2;
801 orig_mac_header
= skb_mac_header(skb
) - skb
->data
;
802 orig_nw_header
= skb_network_header(skb
) - skb
->data
;
803 skb_set_mac_header(skb
, tunnel_hdr_len
);
805 tot_hdr_len
= tunnel_hdr_len
+ ETH_HLEN
;
807 skb
->protocol
= eth_hdr(skb
)->h_proto
;
808 if (skb
->protocol
== htons(ETH_P_8021Q
)) {
809 tot_hdr_len
+= VLAN_HLEN
;
810 skb
->protocol
= vlan_eth_hdr(skb
)->h_vlan_encapsulated_proto
;
813 skb_set_network_header(skb
, tot_hdr_len
);
816 if (skb
->protocol
== htons(ETH_P_IP
))
817 tot_hdr_len
+= sizeof(struct iphdr
);
818 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
819 else if (skb
->protocol
== htons(ETH_P_IPV6
))
820 tot_hdr_len
+= sizeof(struct ipv6hdr
);
825 if (!pskb_may_pull(skb
, tot_hdr_len
))
828 if (skb
->protocol
== htons(ETH_P_IP
)) {
829 if (mtu
< IP_MIN_MTU
) {
830 if (ntohs(ip_hdr(skb
)->tot_len
) >= IP_MIN_MTU
)
837 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
838 else if (skb
->protocol
== htons(ETH_P_IPV6
)) {
839 if (mtu
< IPV6_MIN_MTU
) {
840 unsigned int packet_length
= sizeof(struct ipv6hdr
) +
841 ntohs(ipv6_hdr(skb
)->payload_len
);
843 if (packet_length
>= IPV6_MIN_MTU
844 || ntohs(ipv6_hdr(skb
)->payload_len
) == 0)
852 __pskb_pull(skb
, tunnel_hdr_len
);
853 send_frag_needed(vport
, mutable, skb
, mtu
, key
);
854 skb_push(skb
, tunnel_hdr_len
);
857 skb_set_mac_header(skb
, orig_mac_header
);
858 skb_set_network_header(skb
, orig_nw_header
);
859 skb
->protocol
= htons(ETH_P_IP
);
862 /* Called with rcu_read_lock. */
863 static int gre_rcv(struct sk_buff
*skb
)
866 const struct mutable_config
*mutable;
872 if (!pskb_may_pull(skb
, GRE_HEADER_SECTION
+ ETH_HLEN
))
875 if (!check_checksum(skb
))
880 hdr_len
= parse_gre_header(iph
, &flags
, &key
);
884 vport
= find_port(iph
->daddr
, iph
->saddr
, key
, FIND_PORT_ANY
, &mutable);
886 icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_PORT_UNREACH
, 0);
890 if (!pskb_pull(skb
, hdr_len
) || !pskb_may_pull(skb
, ETH_HLEN
)) {
891 vport_record_error(vport
, VPORT_E_RX_ERROR
);
895 skb
->pkt_type
= PACKET_HOST
;
896 skb
->protocol
= eth_type_trans(skb
, skb
->dev
);
897 skb_postpull_rcsum(skb
, skb_transport_header(skb
), hdr_len
+ ETH_HLEN
);
902 skb_reset_network_header(skb
);
904 ecn_decapsulate(iph
->tos
, skb
);
906 if (mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
)
907 OVS_CB(skb
)->tun_id
= key
;
909 OVS_CB(skb
)->tun_id
= 0;
911 skb_push(skb
, ETH_HLEN
);
912 compute_ip_summed(skb
, false);
914 vport_receive(vport
, skb
);
923 static int build_packet(struct vport
*vport
, const struct mutable_config
*mutable,
924 struct iphdr
*iph
, struct rtable
*rt
, int max_headroom
,
925 int mtu
, struct sk_buff
*skb
)
928 struct iphdr
*new_iph
;
929 int orig_len
= skb
->len
;
930 __be16 frag_off
= iph
->frag_off
;
932 skb
= check_headroom(skb
, max_headroom
);
933 if (unlikely(IS_ERR(skb
)))
936 err
= handle_csum_offload(skb
);
940 if (skb
->protocol
== htons(ETH_P_IP
)) {
941 struct iphdr
*old_iph
= ip_hdr(skb
);
943 if ((old_iph
->frag_off
& htons(IP_DF
)) &&
944 mtu
< ntohs(old_iph
->tot_len
)) {
945 if (send_frag_needed(vport
, mutable, skb
, mtu
, OVS_CB(skb
)->tun_id
))
950 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
951 else if (skb
->protocol
== htons(ETH_P_IPV6
)) {
952 unsigned int packet_length
= skb
->len
- ETH_HLEN
953 - (eth_hdr(skb
)->h_proto
== htons(ETH_P_8021Q
) ? VLAN_HLEN
: 0);
955 /* IPv6 requires PMTUD if the packet is above the minimum MTU. */
956 if (packet_length
> IPV6_MIN_MTU
)
957 frag_off
= htons(IP_DF
);
959 if (mtu
< packet_length
) {
960 if (send_frag_needed(vport
, mutable, skb
, mtu
, OVS_CB(skb
)->tun_id
))
966 skb_reset_transport_header(skb
);
967 new_iph
= (struct iphdr
*)skb_push(skb
, mutable->tunnel_hlen
);
968 skb_reset_network_header(skb
);
970 memcpy(new_iph
, iph
, sizeof(struct iphdr
));
971 new_iph
->frag_off
= frag_off
;
972 ip_select_ident(new_iph
, &rt_dst(rt
), NULL
);
974 create_gre_header(skb
, mutable);
976 /* Allow our local IP stack to fragment the outer packet even if the
977 * DF bit is set as a last resort. */
980 memset(&(IPCB(skb
)->opt
), 0, sizeof(IPCB(skb
)->opt
));
981 IPCB(skb
)->flags
= 0;
983 err
= ip_local_out(skb
);
984 if (likely(net_xmit_eval(err
) == 0))
987 vport_record_error(vport
, VPORT_E_TX_ERROR
);
994 vport_record_error(vport
, VPORT_E_TX_DROPPED
);
999 static int gre_send(struct vport
*vport
, struct sk_buff
*skb
)
1001 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1002 const struct mutable_config
*mutable = rcu_dereference(gre_vport
->mutable);
1004 struct iphdr
*old_iph
;
1011 /* Validate the protocol headers before we try to use them. */
1012 if (skb
->protocol
== htons(ETH_P_8021Q
)) {
1013 if (unlikely(!pskb_may_pull(skb
, VLAN_ETH_HLEN
)))
1016 skb
->protocol
= vlan_eth_hdr(skb
)->h_vlan_encapsulated_proto
;
1017 skb_set_network_header(skb
, VLAN_ETH_HLEN
);
1020 if (skb
->protocol
== htons(ETH_P_IP
)) {
1021 if (unlikely(!pskb_may_pull(skb
, skb_network_header(skb
)
1022 + sizeof(struct iphdr
) - skb
->data
)))
1025 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1026 else if (skb
->protocol
== htons(ETH_P_IPV6
)) {
1027 if (unlikely(!pskb_may_pull(skb
, skb_network_header(skb
)
1028 + sizeof(struct ipv6hdr
) - skb
->data
)))
1032 old_iph
= ip_hdr(skb
);
1034 iph
.tos
= mutable->port_config
.tos
;
1035 if (mutable->port_config
.flags
& GRE_F_TOS_INHERIT
) {
1036 if (skb
->protocol
== htons(ETH_P_IP
))
1037 iph
.tos
= old_iph
->tos
;
1038 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1039 else if (skb
->protocol
== htons(ETH_P_IPV6
))
1040 iph
.tos
= ipv6_get_dsfield(ipv6_hdr(skb
));
1043 iph
.tos
= ecn_encapsulate(iph
.tos
, skb
);
1046 struct flowi fl
= { .nl_u
= { .ip4_u
=
1047 { .daddr
= mutable->port_config
.daddr
,
1048 .saddr
= mutable->port_config
.saddr
,
1049 .tos
= RT_TOS(iph
.tos
) } },
1050 .proto
= IPPROTO_GRE
};
1052 if (ip_route_output_key(&init_net
, &rt
, &fl
))
1056 iph
.ttl
= mutable->port_config
.ttl
;
1057 if (mutable->port_config
.flags
& GRE_F_TTL_INHERIT
) {
1058 if (skb
->protocol
== htons(ETH_P_IP
))
1059 iph
.ttl
= old_iph
->ttl
;
1060 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1061 else if (skb
->protocol
== htons(ETH_P_IPV6
))
1062 iph
.ttl
= ipv6_hdr(skb
)->hop_limit
;
1066 iph
.ttl
= dst_metric(&rt_dst(rt
), RTAX_HOPLIMIT
);
1068 iph
.frag_off
= (mutable->port_config
.flags
& GRE_F_PMTUD
) ? htons(IP_DF
) : 0;
1070 mtu
= dst_mtu(&rt_dst(rt
))
1072 - mutable->tunnel_hlen
1073 - (eth_hdr(skb
)->h_proto
== htons(ETH_P_8021Q
) ? VLAN_HLEN
: 0);
1077 if (skb
->protocol
== htons(ETH_P_IP
)) {
1078 iph
.frag_off
|= old_iph
->frag_off
& htons(IP_DF
);
1079 mtu
= max(mtu
, IP_MIN_MTU
);
1081 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1082 else if (skb
->protocol
== htons(ETH_P_IPV6
))
1083 mtu
= max(mtu
, IPV6_MIN_MTU
);
1087 iph
.ihl
= sizeof(struct iphdr
) >> 2;
1088 iph
.protocol
= IPPROTO_GRE
;
1089 iph
.daddr
= rt
->rt_dst
;
1090 iph
.saddr
= rt
->rt_src
;
1095 skb_dst_set(skb
, &rt_dst(rt
));
1097 /* If we are doing GSO on a pskb it is better to make sure that the
1098 * headroom is correct now. We will only have to copy the portion in
1099 * the linear data area and GSO will preserve headroom when it creates
1100 * the segments. This is particularly beneficial on Xen where we get
1101 * lots of GSO pskbs. Conversely, we delay copying if it is just to
1102 * get our own writable clone because GSO may do the copy for us. */
1103 max_headroom
= LL_RESERVED_SPACE(rt_dst(rt
).dev
) + rt_dst(rt
).header_len
1104 + mutable->tunnel_hlen
;
1106 if (skb_headroom(skb
) < max_headroom
) {
1107 skb
= check_headroom(skb
, max_headroom
);
1108 if (unlikely(IS_ERR(skb
))) {
1109 vport_record_error(vport
, VPORT_E_TX_DROPPED
);
1114 forward_ip_summed(skb
);
1116 if (unlikely(vswitch_skb_checksum_setup(skb
)))
1119 skb
= handle_gso(skb
);
1120 if (unlikely(IS_ERR(skb
))) {
1121 vport_record_error(vport
, VPORT_E_TX_DROPPED
);
1125 /* Process GSO segments. Try to do any work for the entire packet that
1126 * doesn't involve actually writing to it before this point. */
1129 struct sk_buff
*next_skb
= skb
->next
;
1132 orig_len
+= build_packet(vport
, mutable, &iph
, rt
, max_headroom
, mtu
, skb
);
1141 vport_record_error(vport
, VPORT_E_TX_ERROR
);
1146 static struct net_protocol gre_protocol_handlers
= {
1148 .err_handler
= gre_err
,
1151 static int gre_init(void)
1155 err
= inet_add_protocol(&gre_protocol_handlers
, IPPROTO_GRE
);
1157 printk(KERN_WARNING
"openvswitch: cannot register gre protocol handler\n");
1162 static void gre_exit(void)
1164 tbl_destroy(port_table
, NULL
);
1165 inet_del_protocol(&gre_protocol_handlers
, IPPROTO_GRE
);
1168 static int set_config(const struct vport
*cur_vport
,
1169 struct mutable_config
*mutable, const void __user
*uconfig
)
1171 const struct vport
*old_vport
;
1172 const struct mutable_config
*old_mutable
;
1175 if (copy_from_user(&mutable->port_config
, uconfig
, sizeof(struct gre_port_config
)))
1178 if (mutable->port_config
.daddr
== 0)
1181 if (mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
) {
1182 port_type
= FIND_PORT_MATCH
;
1183 mutable->port_config
.in_key
= 0;
1185 port_type
= FIND_PORT_KEY
;
1187 old_vport
= find_port(mutable->port_config
.saddr
,
1188 mutable->port_config
.daddr
,
1189 mutable->port_config
.in_key
, port_type
,
1192 if (old_vport
&& old_vport
!= cur_vport
)
1195 if (mutable->port_config
.flags
& GRE_F_OUT_KEY_ACTION
)
1196 mutable->port_config
.out_key
= 0;
1198 mutable->tunnel_hlen
= sizeof(struct iphdr
) + GRE_HEADER_SECTION
;
1200 if (mutable->port_config
.flags
& GRE_F_CSUM
)
1201 mutable->tunnel_hlen
+= GRE_HEADER_SECTION
;
1203 if (mutable->port_config
.out_key
||
1204 mutable->port_config
.flags
& GRE_F_OUT_KEY_ACTION
)
1205 mutable->tunnel_hlen
+= GRE_HEADER_SECTION
;
1210 static struct vport
*gre_create(const char *name
, const void __user
*config
)
1212 struct vport
*vport
;
1213 struct gre_vport
*gre_vport
;
1216 vport
= vport_alloc(sizeof(struct gre_vport
), &gre_vport_ops
);
1217 if (IS_ERR(vport
)) {
1218 err
= PTR_ERR(vport
);
1222 gre_vport
= gre_vport_priv(vport
);
1224 strcpy(gre_vport
->name
, name
);
1226 gre_vport
->mutable = kmalloc(sizeof(struct mutable_config
), GFP_KERNEL
);
1227 if (!gre_vport
->mutable) {
1229 goto error_free_vport
;
1232 vport_gen_rand_ether_addr(gre_vport
->mutable->eth_addr
);
1233 gre_vport
->mutable->mtu
= ETH_DATA_LEN
;
1235 err
= set_config(NULL
, gre_vport
->mutable, config
);
1237 goto error_free_mutable
;
1239 err
= add_port(vport
);
1241 goto error_free_mutable
;
1246 kfree(gre_vport
->mutable);
1250 return ERR_PTR(err
);
1253 static int gre_modify(struct vport
*vport
, const void __user
*config
)
1255 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1256 struct mutable_config
*mutable;
1258 int update_hash
= 0;
1260 mutable = kmemdup(gre_vport
->mutable, sizeof(struct mutable_config
), GFP_KERNEL
);
1266 err
= set_config(vport
, mutable, config
);
1270 /* Only remove the port from the hash table if something that would
1271 * affect the lookup has changed. */
1272 if (gre_vport
->mutable->port_config
.saddr
!= mutable->port_config
.saddr
||
1273 gre_vport
->mutable->port_config
.daddr
!= mutable->port_config
.daddr
||
1274 gre_vport
->mutable->port_config
.in_key
!= mutable->port_config
.in_key
||
1275 (gre_vport
->mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
) !=
1276 (mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
))
1280 /* This update is not atomic but the lookup uses the config, which
1281 * serves as an inherent double check. */
1283 err
= del_port(vport
);
1288 assign_config_rcu(vport
, mutable);
1291 err
= add_port(vport
);
1304 static void free_port(struct rcu_head
*rcu
)
1306 struct gre_vport
*gre_vport
= container_of(rcu
, struct gre_vport
, rcu
);
1308 kfree(gre_vport
->mutable);
1309 vport_free(gre_vport_to_vport(gre_vport
));
1312 static int gre_destroy(struct vport
*vport
)
1314 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1316 const struct mutable_config
*old_mutable
;
1318 /* Do a hash table lookup to make sure that the port exists. It should
1319 * exist but might not if a modify failed earlier. */
1320 if (gre_vport
->mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
)
1321 port_type
= FIND_PORT_MATCH
;
1323 port_type
= FIND_PORT_KEY
;
1325 if (vport
== find_port(gre_vport
->mutable->port_config
.saddr
,
1326 gre_vport
->mutable->port_config
.daddr
,
1327 gre_vport
->mutable->port_config
.in_key
, port_type
, &old_mutable
))
1330 call_rcu(&gre_vport
->rcu
, free_port
);
1335 static int gre_set_mtu(struct vport
*vport
, int mtu
)
1337 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1338 struct mutable_config
*mutable;
1340 mutable = kmemdup(gre_vport
->mutable, sizeof(struct mutable_config
), GFP_KERNEL
);
1345 assign_config_rcu(vport
, mutable);
1350 static int gre_set_addr(struct vport
*vport
, const unsigned char *addr
)
1352 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1353 struct mutable_config
*mutable;
1355 mutable = kmemdup(gre_vport
->mutable, sizeof(struct mutable_config
), GFP_KERNEL
);
1359 memcpy(mutable->eth_addr
, addr
, ETH_ALEN
);
1360 assign_config_rcu(vport
, mutable);
1366 static const char *gre_get_name(const struct vport
*vport
)
1368 const struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1369 return gre_vport
->name
;
1372 static const unsigned char *gre_get_addr(const struct vport
*vport
)
1374 const struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1375 return rcu_dereference(gre_vport
->mutable)->eth_addr
;
1378 static int gre_get_mtu(const struct vport
*vport
)
1380 const struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1381 return rcu_dereference(gre_vport
->mutable)->mtu
;
1384 struct vport_ops gre_vport_ops
= {
1386 .flags
= VPORT_F_GEN_STATS
| VPORT_F_TUN_ID
,
1389 .create
= gre_create
,
1390 .modify
= gre_modify
,
1391 .destroy
= gre_destroy
,
1392 .set_mtu
= gre_set_mtu
,
1393 .set_addr
= gre_set_addr
,
1394 .get_name
= gre_get_name
,
1395 .get_addr
= gre_get_addr
,
1396 .get_dev_flags
= vport_gen_get_dev_flags
,
1397 .is_running
= vport_gen_is_running
,
1398 .get_operstate
= vport_gen_get_operstate
,
1399 .get_mtu
= gre_get_mtu
,