2 * Copyright (c) 2010 Nicira Networks.
3 * Distributed under the terms of the GNU GPL version 2.
5 * Significant portions of this file may be copied from parts of the Linux
6 * kernel, by Linus Torvalds and others.
9 #include <linux/if_arp.h>
10 #include <linux/if_ether.h>
12 #include <linux/if_tunnel.h>
13 #include <linux/if_vlan.h>
15 #include <linux/in_route.h>
16 #include <linux/jhash.h>
17 #include <linux/kernel.h>
18 #include <linux/version.h>
20 #include <net/dsfield.h>
23 #include <net/inet_ecn.h>
26 #include <net/protocol.h>
27 #include <net/route.h>
32 #include "openvswitch/gre.h"
35 #include "vport-generic.h"
37 /* The absolute minimum fragment size. Note that there are many other
38 * definitions of the minimum MTU. */
41 /* The GRE header is composed of a series of sections: a base and then a variable
42 * number of options. */
43 #define GRE_HEADER_SECTION 4
/* Runtime-changeable per-port state.  Readers access it under RCU through
 * gre_vport->mutable; writers replace the whole structure at once (see
 * assign_config_rcu() below).  NOTE(review): this extract does not show all
 * members referenced elsewhere in the file (e.g. 'mtu', 'rcu') -- confirm
 * against the full source. */
45 struct mutable_config
{
/* Ethernet address reported for this vport (see gre_get_addr()). */
48 unsigned char eth_addr
[ETH_ALEN
];
/* User-supplied tunnel parameters: addresses, keys, flags. */
50 struct gre_port_config port_config
;
52 int tunnel_hlen
; /* Tunnel header length. */
56 struct tbl_node tbl_node
;
60 /* Protected by RCU. */
61 struct mutable_config
*mutable;
64 struct vport_ops gre_vport_ops
;
66 /* Protected by RCU. */
67 static struct tbl
*port_table
;
69 /* These are just used as an optimization: they don't require any kind of
70 * synchronization because we could have just as easily read the value before
71 * the port change happened. */
72 static unsigned int key_local_remote_ports
;
73 static unsigned int key_remote_ports
;
74 static unsigned int local_remote_ports
;
75 static unsigned int remote_ports
;
/* Returns the GRE-specific private area stored alongside the generic
 * vport structure. */
static inline struct gre_vport *gre_vport_priv(const struct vport *vport)
{
	return vport_priv(vport);
}
/* Inverse of gre_vport_priv(): recovers the generic vport from its
 * GRE private area. */
static inline struct vport *gre_vport_to_vport(const struct gre_vport *gre_vport)
{
	return vport_from_priv(gre_vport);
}
89 static inline struct gre_vport
*
90 gre_vport_table_cast(const struct tbl_node
*node
)
92 return container_of(node
, struct gre_vport
, tbl_node
);
/* RCU callback: runs after a grace period, once no reader can still hold a
 * reference to the retired configuration.  Recovers the enclosing
 * mutable_config from its embedded rcu_head.  NOTE(review): the kfree(c)
 * that should follow is not visible in this extract -- confirm against the
 * full source. */
97 free_config(struct rcu_head
*rcu
)
99 struct mutable_config
*c
= container_of(rcu
, struct mutable_config
, rcu
);
104 assign_config_rcu(struct vport
*vport
, struct mutable_config
*new_config
)
106 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
107 struct mutable_config
*old_config
;
109 old_config
= rcu_dereference(gre_vport
->mutable);
110 rcu_assign_pointer(gre_vport
->mutable, new_config
);
111 call_rcu(&old_config
->rcu
, free_config
);
114 static unsigned int *
115 find_port_pool(const struct mutable_config
*mutable)
117 if (mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
) {
118 if (mutable->port_config
.saddr
)
119 return &local_remote_ports
;
121 return &remote_ports
;
123 if (mutable->port_config
.saddr
)
124 return &key_local_remote_ports
;
126 return &key_remote_ports
;
137 struct port_lookup_key
{
138 u32 vals
[4]; /* Contains enum lookup_key keys. */
139 const struct mutable_config
*mutable;
142 /* Modifies 'target' to store the rcu_dereferenced pointer that was used to do
143 * the comparision. */
145 port_cmp(const struct tbl_node
*node
, void *target
)
147 const struct gre_vport
*gre_vport
= gre_vport_table_cast(node
);
148 struct port_lookup_key
*lookup
= target
;
150 lookup
->mutable = rcu_dereference(gre_vport
->mutable);
152 return ((lookup
->mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
) ==
153 lookup
->vals
[LOOKUP_KEY_MATCH
]) &&
154 lookup
->mutable->port_config
.daddr
== lookup
->vals
[LOOKUP_DADDR
] &&
155 lookup
->mutable->port_config
.in_key
== lookup
->vals
[LOOKUP_KEY
] &&
156 lookup
->mutable->port_config
.saddr
== lookup
->vals
[LOOKUP_SADDR
];
160 port_hash(struct port_lookup_key
*lookup
)
162 return jhash2(lookup
->vals
, ARRAY_SIZE(lookup
->vals
), 0);
/* Inserts 'vport' into the global port hash table, creating or expanding
 * the table as needed, and bumps the population counter for the port's
 * lookup class.  NOTE(review): several lines (err declaration, the
 * !port_table branch header, error-path checks and the final return) are
 * missing from this extract -- confirm against the full source. */
166 add_port(struct vport
*vport
)
168 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
169 struct port_lookup_key lookup
;
/* First port: create the table and publish it for RCU readers. */
173 struct tbl
*new_table
;
175 new_table
= tbl_create(0);
179 rcu_assign_pointer(port_table
, new_table
);
/* Load factor exceeded: expand into a new table, publish it, and defer
 * destruction of the old one until readers are done. */
181 } else if (tbl_count(port_table
) > tbl_n_buckets(port_table
)) {
182 struct tbl
*old_table
= port_table
;
183 struct tbl
*new_table
;
185 new_table
= tbl_expand(old_table
);
186 if (IS_ERR(new_table
))
187 return PTR_ERR(new_table
);
189 rcu_assign_pointer(port_table
, new_table
);
190 tbl_deferred_destroy(old_table
, NULL
);
/* Build the lookup tuple from the port's configuration; it must mirror
 * the layout used by port_cmp()/port_hash(). */
193 lookup
.vals
[LOOKUP_SADDR
] = gre_vport
->mutable->port_config
.saddr
;
194 lookup
.vals
[LOOKUP_DADDR
] = gre_vport
->mutable->port_config
.daddr
;
195 lookup
.vals
[LOOKUP_KEY
] = gre_vport
->mutable->port_config
.in_key
;
196 lookup
.vals
[LOOKUP_KEY_MATCH
] = gre_vport
->mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
;
198 err
= tbl_insert(port_table
, &gre_vport
->tbl_node
, port_hash(&lookup
));
/* Count the port in its lookup class (see find_port_pool()). */
202 (*find_port_pool(gre_vport
->mutable))++;
/* Removes 'vport' from the global port table and decrements the population
 * counter for its lookup class.  NOTE(review): the err declaration and its
 * error check after tbl_remove(), plus the final return, are missing from
 * this extract -- confirm against the full source. */
208 del_port(struct vport
*vport
)
210 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
213 err
= tbl_remove(port_table
, &gre_vport
->tbl_node
);
217 (*find_port_pool(gre_vport
->mutable))--;
222 #define FIND_PORT_KEY (1 << 0)
223 #define FIND_PORT_MATCH (1 << 1)
224 #define FIND_PORT_ANY (FIND_PORT_KEY | FIND_PORT_MATCH)
226 static struct vport
*
227 find_port(__be32 saddr
, __be32 daddr
, __be32 key
, int port_type
,
228 const struct mutable_config
**mutable)
230 struct port_lookup_key lookup
;
231 struct tbl
*table
= rcu_dereference(port_table
);
232 struct tbl_node
*tbl_node
;
237 lookup
.vals
[LOOKUP_SADDR
] = saddr
;
238 lookup
.vals
[LOOKUP_DADDR
] = daddr
;
240 if (port_type
& FIND_PORT_KEY
) {
241 lookup
.vals
[LOOKUP_KEY
] = key
;
242 lookup
.vals
[LOOKUP_KEY_MATCH
] = 0;
244 if (key_local_remote_ports
) {
245 tbl_node
= tbl_lookup(table
, &lookup
, port_hash(&lookup
), port_cmp
);
250 if (key_remote_ports
) {
251 lookup
.vals
[LOOKUP_SADDR
] = 0;
253 tbl_node
= tbl_lookup(table
, &lookup
, port_hash(&lookup
), port_cmp
);
257 lookup
.vals
[LOOKUP_SADDR
] = saddr
;
261 if (port_type
& FIND_PORT_MATCH
) {
262 lookup
.vals
[LOOKUP_KEY
] = 0;
263 lookup
.vals
[LOOKUP_KEY_MATCH
] = GRE_F_IN_KEY_MATCH
;
265 if (local_remote_ports
) {
266 tbl_node
= tbl_lookup(table
, &lookup
, port_hash(&lookup
), port_cmp
);
272 lookup
.vals
[LOOKUP_SADDR
] = 0;
274 tbl_node
= tbl_lookup(table
, &lookup
, port_hash(&lookup
), port_cmp
);
283 *mutable = lookup
.mutable;
284 return gre_vport_to_vport(gre_vport_table_cast(tbl_node
));
/* Validity filter for IPv4 addresses used by the PMTUD/ICMP reply path:
 * rejects multicast, limited-broadcast, loopback and zero-network
 * addresses.  NOTE(review): the return statements are missing from this
 * extract; callers (ipv4_should_icmp) treat a zero/false result as
 * "invalid" -- confirm against the full source. */
288 check_ipv4_address(__be32 addr
)
290 if (ipv4_is_multicast(addr
) || ipv4_is_lbcast(addr
)
291 || ipv4_is_loopback(addr
) || ipv4_is_zeronet(addr
))
298 ipv4_should_icmp(struct sk_buff
*skb
)
300 struct iphdr
*old_iph
= ip_hdr(skb
);
302 /* Don't respond to L2 broadcast. */
303 if (is_multicast_ether_addr(eth_hdr(skb
)->h_dest
))
306 /* Don't respond to L3 broadcast or invalid addresses. */
307 if (!check_ipv4_address(old_iph
->daddr
) ||
308 !check_ipv4_address(old_iph
->saddr
))
311 /* Only respond to the first fragment. */
312 if (old_iph
->frag_off
& htons(IP_OFFSET
))
315 /* Don't respond to ICMP error messages. */
316 if (old_iph
->protocol
== IPPROTO_ICMP
) {
317 u8 icmp_type
, *icmp_typep
;
319 icmp_typep
= skb_header_pointer(skb
, (u8
*)old_iph
+
320 (old_iph
->ihl
<< 2) +
321 offsetof(struct icmphdr
, type
) -
322 skb
->data
, sizeof(icmp_type
),
328 if (*icmp_typep
> NR_ICMP_TYPES
329 || (*icmp_typep
<= ICMP_PARAMETERPROB
330 && *icmp_typep
!= ICMP_ECHOREPLY
331 && *icmp_typep
!= ICMP_ECHO
))
339 ipv4_build_icmp(struct sk_buff
*skb
, struct sk_buff
*nskb
,
340 unsigned int mtu
, unsigned int payload_length
)
342 struct iphdr
*iph
, *old_iph
= ip_hdr(skb
);
343 struct icmphdr
*icmph
;
346 iph
= (struct iphdr
*)skb_put(nskb
, sizeof(struct iphdr
));
347 icmph
= (struct icmphdr
*)skb_put(nskb
, sizeof(struct icmphdr
));
348 payload
= skb_put(nskb
, payload_length
);
352 iph
->ihl
= sizeof(struct iphdr
) >> 2;
353 iph
->tos
= (old_iph
->tos
& IPTOS_TOS_MASK
) |
354 IPTOS_PREC_INTERNETCONTROL
;
355 iph
->tot_len
= htons(sizeof(struct iphdr
)
356 + sizeof(struct icmphdr
)
358 get_random_bytes(&iph
->id
, sizeof(iph
->id
));
361 iph
->protocol
= IPPROTO_ICMP
;
362 iph
->daddr
= old_iph
->saddr
;
363 iph
->saddr
= old_iph
->daddr
;
368 icmph
->type
= ICMP_DEST_UNREACH
;
369 icmph
->code
= ICMP_FRAG_NEEDED
;
370 icmph
->un
.gateway
= htonl(mtu
);
373 nskb
->csum
= csum_partial((u8
*)icmph
, sizeof(struct icmphdr
), 0);
374 nskb
->csum
= skb_copy_and_csum_bits(skb
, (u8
*)old_iph
- skb
->data
,
375 payload
, payload_length
,
377 icmph
->checksum
= csum_fold(nskb
->csum
);
381 ipv6_should_icmp(struct sk_buff
*skb
)
383 struct ipv6hdr
*old_ipv6h
= ipv6_hdr(skb
);
385 int payload_off
= (u8
*)(old_ipv6h
+ 1) - skb
->data
;
386 u8 nexthdr
= ipv6_hdr(skb
)->nexthdr
;
388 /* Check source address is valid. */
389 addr_type
= ipv6_addr_type(&old_ipv6h
->saddr
);
390 if (addr_type
& IPV6_ADDR_MULTICAST
|| addr_type
== IPV6_ADDR_ANY
)
393 /* Don't reply to unspecified addresses. */
394 if (ipv6_addr_type(&old_ipv6h
->daddr
) == IPV6_ADDR_ANY
)
397 /* Don't respond to ICMP error messages. */
398 payload_off
= ipv6_skip_exthdr(skb
, payload_off
, &nexthdr
);
402 if (nexthdr
== NEXTHDR_ICMP
) {
403 u8 icmp_type
, *icmp_typep
;
405 icmp_typep
= skb_header_pointer(skb
, payload_off
+
406 offsetof(struct icmp6hdr
,
408 sizeof(icmp_type
), &icmp_type
);
410 if (!icmp_typep
|| !(*icmp_typep
& ICMPV6_INFOMSG_MASK
))
418 ipv6_build_icmp(struct sk_buff
*skb
, struct sk_buff
*nskb
, unsigned int mtu
,
419 unsigned int payload_length
)
421 struct ipv6hdr
*ipv6h
, *old_ipv6h
= ipv6_hdr(skb
);
422 struct icmp6hdr
*icmp6h
;
425 ipv6h
= (struct ipv6hdr
*)skb_put(nskb
, sizeof(struct ipv6hdr
));
426 icmp6h
= (struct icmp6hdr
*)skb_put(nskb
, sizeof(struct icmp6hdr
));
427 payload
= skb_put(nskb
, payload_length
);
432 memset(&ipv6h
->flow_lbl
, 0, sizeof(ipv6h
->flow_lbl
));
433 ipv6h
->payload_len
= htons(sizeof(struct icmp6hdr
)
435 ipv6h
->nexthdr
= NEXTHDR_ICMP
;
436 ipv6h
->hop_limit
= IPV6_DEFAULT_HOPLIMIT
;
437 ipv6_addr_copy(&ipv6h
->daddr
, &old_ipv6h
->saddr
);
438 ipv6_addr_copy(&ipv6h
->saddr
, &old_ipv6h
->daddr
);
441 icmp6h
->icmp6_type
= ICMPV6_PKT_TOOBIG
;
442 icmp6h
->icmp6_code
= 0;
443 icmp6h
->icmp6_cksum
= 0;
444 icmp6h
->icmp6_mtu
= htonl(mtu
);
446 nskb
->csum
= csum_partial((u8
*)icmp6h
, sizeof(struct icmp6hdr
), 0);
447 nskb
->csum
= skb_copy_and_csum_bits(skb
, (u8
*)old_ipv6h
- skb
->data
,
448 payload
, payload_length
,
450 icmp6h
->icmp6_cksum
= csum_ipv6_magic(&ipv6h
->saddr
, &ipv6h
->daddr
,
451 sizeof(struct icmp6hdr
)
453 ipv6h
->nexthdr
, nskb
->csum
);
457 send_frag_needed(struct vport
*vport
, const struct mutable_config
*mutable,
458 struct sk_buff
*skb
, unsigned int mtu
, __be32 flow_key
)
460 unsigned int eth_hdr_len
= ETH_HLEN
;
461 unsigned int total_length
, header_length
, payload_length
;
462 struct ethhdr
*eh
, *old_eh
= eth_hdr(skb
);
463 struct sk_buff
*nskb
;
466 if (skb
->protocol
== htons(ETH_P_IP
)) {
467 if (mtu
< IP_MIN_MTU
)
470 if (!ipv4_should_icmp(skb
))
473 if (mtu
< IPV6_MIN_MTU
)
476 /* In theory we should do PMTUD on IPv6 multicast messages but
477 * we don't have an address to send from so just fragment. */
478 if (ipv6_addr_type(&ipv6_hdr(skb
)->daddr
) & IPV6_ADDR_MULTICAST
)
481 if (!ipv6_should_icmp(skb
))
486 if (old_eh
->h_proto
== htons(ETH_P_8021Q
))
487 eth_hdr_len
= VLAN_ETH_HLEN
;
489 payload_length
= skb
->len
- eth_hdr_len
;
490 if (skb
->protocol
== htons(ETH_P_IP
)) {
491 header_length
= sizeof(struct iphdr
) + sizeof(struct icmphdr
);
492 total_length
= min_t(unsigned int, header_length
+
493 payload_length
, 576);
495 header_length
= sizeof(struct ipv6hdr
) +
496 sizeof(struct icmp6hdr
);
497 total_length
= min_t(unsigned int, header_length
+
498 payload_length
, IPV6_MIN_MTU
);
500 total_length
= min(total_length
, mutable->mtu
);
501 payload_length
= total_length
- header_length
;
503 nskb
= dev_alloc_skb(NET_IP_ALIGN
+ eth_hdr_len
+ header_length
+
508 skb_reserve(nskb
, NET_IP_ALIGN
);
510 /* Ethernet / VLAN */
511 eh
= (struct ethhdr
*)skb_put(nskb
, eth_hdr_len
);
512 memcpy(eh
->h_dest
, old_eh
->h_source
, ETH_ALEN
);
513 memcpy(eh
->h_source
, mutable->eth_addr
, ETH_ALEN
);
514 nskb
->protocol
= eh
->h_proto
= old_eh
->h_proto
;
515 if (old_eh
->h_proto
== htons(ETH_P_8021Q
)) {
516 struct vlan_ethhdr
*vh
= (struct vlan_ethhdr
*)eh
;
518 vh
->h_vlan_TCI
= vlan_eth_hdr(skb
)->h_vlan_TCI
;
519 vh
->h_vlan_encapsulated_proto
= skb
->protocol
;
521 skb_reset_mac_header(nskb
);
524 if (skb
->protocol
== htons(ETH_P_IP
))
525 ipv4_build_icmp(skb
, nskb
, mtu
, payload_length
);
527 ipv6_build_icmp(skb
, nskb
, mtu
, payload_length
);
529 /* Assume that flow based keys are symmetric with respect to input
530 * and output and use the key that we were going to put on the
531 * outgoing packet for the fake received packet. If the keys are
532 * not symmetric then PMTUD needs to be disabled since we won't have
533 * any way of synthesizing packets. */
534 if (mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
&&
535 mutable->port_config
.flags
& GRE_F_OUT_KEY_ACTION
)
536 OVS_CB(nskb
)->tun_id
= flow_key
;
538 compute_ip_summed(nskb
, false);
539 vport_receive(vport
, nskb
);
/* Ensures 'skb' has at least 'headroom' bytes of headroom and is writable,
 * reallocating into a fresh skb when necessary.  Returns the usable skb or
 * ERR_PTR(-ENOMEM).  NOTE(review): the success/early-return paths and the
 * free of the original skb are missing from this extract -- confirm against
 * the full source. */
544 static struct sk_buff
*
545 check_headroom(struct sk_buff
*skb
, int headroom
)
/* Reallocate when headroom is short, or when the skb is a cloned buffer
 * we may not write to in place. */
547 if (skb_headroom(skb
) < headroom
||
548 (skb_cloned(skb
) && !skb_clone_writable(skb
, 0))) {
549 struct sk_buff
*nskb
= skb_realloc_headroom(skb
, headroom
);
552 return ERR_PTR(-ENOMEM
);
/* Carry checksum state and socket ownership over to the copy. */
555 set_skb_csum_bits(skb
, nskb
);
558 skb_set_owner_w(nskb
, skb
->sk
);
568 create_gre_header(struct sk_buff
*skb
, const struct mutable_config
*mutable)
570 struct iphdr
*iph
= ip_hdr(skb
);
571 __be16
*flags
= (__be16
*)(iph
+ 1);
572 __be16
*protocol
= flags
+ 1;
573 __be32
*options
= (__be32
*)((u8
*)iph
+ mutable->tunnel_hlen
574 - GRE_HEADER_SECTION
);
576 *protocol
= htons(ETH_P_TEB
);
579 /* Work backwards over the options so the checksum is last. */
580 if (mutable->port_config
.out_key
||
581 mutable->port_config
.flags
& GRE_F_OUT_KEY_ACTION
) {
584 if (mutable->port_config
.flags
& GRE_F_OUT_KEY_ACTION
)
585 *options
= OVS_CB(skb
)->tun_id
;
587 *options
= mutable->port_config
.out_key
;
592 if (mutable->port_config
.flags
& GRE_F_OUT_CSUM
) {
596 *(__sum16
*)options
= csum_fold(skb_checksum(skb
,
597 sizeof(struct iphdr
),
598 skb
->len
- sizeof(struct iphdr
),
604 check_checksum(struct sk_buff
*skb
)
606 struct iphdr
*iph
= ip_hdr(skb
);
607 __be16 flags
= *(__be16
*)(iph
+ 1);
610 if (flags
& GRE_CSUM
) {
611 switch (skb
->ip_summed
) {
612 case CHECKSUM_COMPLETE
:
613 csum
= csum_fold(skb
->csum
);
621 csum
= __skb_checksum_complete(skb
);
622 skb
->ip_summed
= CHECKSUM_COMPLETE
;
631 parse_gre_header(struct iphdr
*iph
, __be16
*flags
, __be32
*key
)
633 /* IP and ICMP protocol handlers check that the IHL is valid. */
634 __be16
*flagsp
= (__be16
*)((u8
*)iph
+ (iph
->ihl
<< 2));
635 __be16
*protocol
= flagsp
+ 1;
636 __be32
*options
= (__be32
*)(protocol
+ 1);
641 if (*flags
& (GRE_VERSION
| GRE_ROUTING
))
644 if (*protocol
!= htons(ETH_P_TEB
))
647 hdr_len
= GRE_HEADER_SECTION
;
649 if (*flags
& GRE_CSUM
) {
650 hdr_len
+= GRE_HEADER_SECTION
;
654 if (*flags
& GRE_KEY
) {
655 hdr_len
+= GRE_HEADER_SECTION
;
662 if (*flags
& GRE_SEQ
)
663 hdr_len
+= GRE_HEADER_SECTION
;
/* Computes the outer-header TOS for encapsulation: extracts the inner
 * packet's ECN/TOS bits (IPv4 or IPv6) and folds them into 'tos' via
 * INET_ECN_encapsulate().  NOTE(review): the declaration/initialization of
 * 'inner' for non-IP payloads is missing from this extract -- confirm
 * against the full source. */
669 ecn_encapsulate(u8 tos
, struct sk_buff
*skb
)
673 if (skb
->protocol
== htons(ETH_P_IP
))
674 inner
= ((struct iphdr
*)skb_network_header(skb
))->tos
;
675 else if (skb
->protocol
== htons(ETH_P_IPV6
))
676 inner
= ipv6_get_dsfield((struct ipv6hdr
*)skb_network_header(skb
));
680 return INET_ECN_encapsulate(tos
, inner
);
/* On decapsulation, propagates Congestion Experienced from the outer
 * header's 'tos' into the inner IPv4/IPv6 header, skipping over an
 * optional 802.1Q tag.  Pulls enough of the inner header into the linear
 * area before writing.  NOTE(review): early-return lines on failed
 * pskb_may_pull() and the tail of the IPv6 branch are missing from this
 * extract -- confirm against the full source. */
684 ecn_decapsulate(u8 tos
, struct sk_buff
*skb
)
686 if (INET_ECN_is_ce(tos
)) {
687 __be16 protocol
= skb
->protocol
;
688 unsigned int nw_header
= skb_network_header(skb
) - skb
->data
;
/* Step past a VLAN tag to find the real network protocol/header. */
690 if (skb
->protocol
== htons(ETH_P_8021Q
)) {
691 if (unlikely(!pskb_may_pull(skb
, VLAN_ETH_HLEN
)))
694 protocol
= vlan_eth_hdr(skb
)->h_vlan_encapsulated_proto
;
695 nw_header
+= VLAN_HLEN
;
698 if (protocol
== htons(ETH_P_IP
)) {
699 if (unlikely(!pskb_may_pull(skb
, nw_header
700 + sizeof(struct iphdr
))))
703 IP_ECN_set_ce((struct iphdr
*)(nw_header
+ skb
->data
));
704 } else if (protocol
== htons(ETH_P_IPV6
)) {
705 if (unlikely(!pskb_may_pull(skb
, nw_header
706 + sizeof(struct ipv6hdr
))))
709 IP6_ECN_set_ce((struct ipv6hdr
*)(nw_header
*
716 handle_gso(struct sk_buff
*skb
)
718 if (skb_is_gso(skb
)) {
719 struct sk_buff
*nskb
= skb_gso_segment(skb
, NETIF_F_SG
);
/* Resolves checksum offload before encapsulation: a CHECKSUM_PARTIAL skb
 * must have its checksum computed in software (the GRE header would
 * invalidate the offload offsets); otherwise the state is forced to
 * CHECKSUM_NONE.  NOTE(review): the final return is missing from this
 * extract -- confirm against the full source. */
729 handle_csum_offload(struct sk_buff
*skb
)
731 if (skb
->ip_summed
== CHECKSUM_PARTIAL
)
732 return skb_checksum_help(skb
);
734 skb
->ip_summed
= CHECKSUM_NONE
;
739 /* Called with rcu_read_lock. */
741 gre_err(struct sk_buff
*skb
, u32 info
)
744 const struct mutable_config
*mutable;
745 const int type
= icmp_hdr(skb
)->type
;
746 const int code
= icmp_hdr(skb
)->code
;
747 int mtu
= ntohs(icmp_hdr(skb
)->un
.frag
.mtu
);
752 int tunnel_hdr_len
, tot_hdr_len
;
753 unsigned int orig_mac_header
;
754 unsigned int orig_nw_header
;
756 if (type
!= ICMP_DEST_UNREACH
|| code
!= ICMP_FRAG_NEEDED
)
759 /* The mimimum size packet that we would actually be able to process:
760 * encapsulating IP header, minimum GRE header, Ethernet header,
761 * inner IPv4 header. */
762 if (!pskb_may_pull(skb
, sizeof(struct iphdr
) + GRE_HEADER_SECTION
+
763 ETH_HLEN
+ sizeof(struct iphdr
)))
766 iph
= (struct iphdr
*)skb
->data
;
768 tunnel_hdr_len
= parse_gre_header(iph
, &flags
, &key
);
769 if (tunnel_hdr_len
< 0)
772 vport
= find_port(iph
->saddr
, iph
->daddr
, key
, FIND_PORT_ANY
, &mutable);
776 /* Packets received by this function were previously sent by us, so
777 * any comparisons should be to the output values, not the input.
778 * However, it's not really worth it to have a hash table based on
779 * output keys (especially since ICMP error handling of tunneled packets
780 * isn't that reliable anyways). Therefore, we do a lookup based on the
781 * out key as if it were the in key and then check to see if the input
782 * and output keys are the same. */
783 if (mutable->port_config
.in_key
!= mutable->port_config
.out_key
)
786 if (!!(mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
) !=
787 !!(mutable->port_config
.flags
& GRE_F_OUT_KEY_ACTION
))
790 if ((mutable->port_config
.flags
& GRE_F_OUT_CSUM
) && !(flags
& GRE_CSUM
))
793 tunnel_hdr_len
+= iph
->ihl
<< 2;
795 orig_mac_header
= skb_mac_header(skb
) - skb
->data
;
796 orig_nw_header
= skb_network_header(skb
) - skb
->data
;
797 skb_set_mac_header(skb
, tunnel_hdr_len
);
799 tot_hdr_len
= tunnel_hdr_len
+ ETH_HLEN
;
801 skb
->protocol
= eth_hdr(skb
)->h_proto
;
802 if (skb
->protocol
== htons(ETH_P_8021Q
)) {
803 tot_hdr_len
+= VLAN_HLEN
;
804 skb
->protocol
= vlan_eth_hdr(skb
)->h_vlan_encapsulated_proto
;
807 skb_set_network_header(skb
, tot_hdr_len
);
810 if (skb
->protocol
== htons(ETH_P_IP
))
811 tot_hdr_len
+= sizeof(struct iphdr
);
812 else if (skb
->protocol
== htons(ETH_P_IPV6
))
813 tot_hdr_len
+= sizeof(struct ipv6hdr
);
817 if (!pskb_may_pull(skb
, tot_hdr_len
))
820 if (skb
->protocol
== htons(ETH_P_IP
)) {
821 if (mtu
< IP_MIN_MTU
) {
822 if (ntohs(ip_hdr(skb
)->tot_len
) >= IP_MIN_MTU
)
828 } else if (skb
->protocol
== htons(ETH_P_IPV6
)) {
829 if (mtu
< IPV6_MIN_MTU
) {
830 unsigned int packet_length
= sizeof(struct ipv6hdr
) +
831 ntohs(ipv6_hdr(skb
)->payload_len
);
833 if (packet_length
>= IPV6_MIN_MTU
834 || ntohs(ipv6_hdr(skb
)->payload_len
) == 0)
841 __pskb_pull(skb
, tunnel_hdr_len
);
842 send_frag_needed(vport
, mutable, skb
, mtu
, key
);
843 skb_push(skb
, tunnel_hdr_len
);
846 skb_set_mac_header(skb
, orig_mac_header
);
847 skb_set_network_header(skb
, orig_nw_header
);
848 skb
->protocol
= htons(ETH_P_IP
);
851 /* Called with rcu_read_lock. */
853 gre_rcv(struct sk_buff
*skb
)
856 const struct mutable_config
*mutable;
862 if (!pskb_may_pull(skb
, GRE_HEADER_SECTION
+ ETH_HLEN
))
865 if (!check_checksum(skb
))
870 hdr_len
= parse_gre_header(iph
, &flags
, &key
);
874 vport
= find_port(iph
->daddr
, iph
->saddr
, key
, FIND_PORT_ANY
, &mutable);
876 icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_PORT_UNREACH
, 0);
880 if ((mutable->port_config
.flags
& GRE_F_IN_CSUM
) && !(flags
& GRE_CSUM
)) {
881 vport_record_error(vport
, VPORT_E_RX_CRC
);
885 if (!pskb_pull(skb
, hdr_len
) || !pskb_may_pull(skb
, ETH_HLEN
)) {
886 vport_record_error(vport
, VPORT_E_RX_ERROR
);
890 skb
->pkt_type
= PACKET_HOST
;
891 skb
->protocol
= eth_type_trans(skb
, skb
->dev
);
892 skb_postpull_rcsum(skb
, skb_transport_header(skb
), hdr_len
+ ETH_HLEN
);
897 skb_reset_network_header(skb
);
899 ecn_decapsulate(iph
->tos
, skb
);
901 if (mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
)
902 OVS_CB(skb
)->tun_id
= key
;
904 OVS_CB(skb
)->tun_id
= 0;
906 skb_push(skb
, ETH_HLEN
);
907 compute_ip_summed(skb
, false);
909 vport_receive(vport
, skb
);
919 build_packet(struct vport
*vport
, const struct mutable_config
*mutable,
920 struct iphdr
*iph
, struct rtable
*rt
, int max_headroom
, int mtu
,
924 struct iphdr
*new_iph
;
925 int orig_len
= skb
->len
;
926 __be16 frag_off
= iph
->frag_off
;
928 skb
= check_headroom(skb
, max_headroom
);
929 if (unlikely(IS_ERR(skb
)))
932 err
= handle_csum_offload(skb
);
936 if (skb
->protocol
== htons(ETH_P_IP
)) {
937 struct iphdr
*old_iph
= ip_hdr(skb
);
939 if ((old_iph
->frag_off
& htons(IP_DF
)) &&
940 mtu
< ntohs(old_iph
->tot_len
)) {
941 if (send_frag_needed(vport
, mutable, skb
, mtu
, OVS_CB(skb
)->tun_id
))
945 } else if (skb
->protocol
== htons(ETH_P_IPV6
)) {
946 unsigned int packet_length
= skb
->len
- ETH_HLEN
947 - (eth_hdr(skb
)->h_proto
== htons(ETH_P_8021Q
) ? VLAN_HLEN
: 0);
949 /* IPv6 requires PMTUD if the packet is above the minimum MTU. */
950 if (packet_length
> IPV6_MIN_MTU
)
951 frag_off
= htons(IP_DF
);
953 if (mtu
< packet_length
) {
954 if (send_frag_needed(vport
, mutable, skb
, mtu
, OVS_CB(skb
)->tun_id
))
959 skb_reset_transport_header(skb
);
960 new_iph
= (struct iphdr
*)skb_push(skb
, mutable->tunnel_hlen
);
961 skb_reset_network_header(skb
);
963 memcpy(new_iph
, iph
, sizeof(struct iphdr
));
964 new_iph
->frag_off
= frag_off
;
965 ip_select_ident(new_iph
, &rt
->u
.dst
, NULL
);
967 create_gre_header(skb
, mutable);
969 /* Allow our local IP stack to fragment the outer packet even if the
970 * DF bit is set as a last resort. */
973 memset(&(IPCB(skb
)->opt
), 0, sizeof(IPCB(skb
)->opt
));
974 IPCB(skb
)->flags
= 0;
976 err
= ip_local_out(skb
);
977 if (likely(net_xmit_eval(err
) == 0))
980 vport_record_error(vport
, VPORT_E_TX_ERROR
);
987 vport_record_error(vport
, VPORT_E_TX_DROPPED
);
993 gre_send(struct vport
*vport
, struct sk_buff
*skb
)
995 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
996 const struct mutable_config
*mutable = rcu_dereference(gre_vport
->mutable);
998 struct iphdr
*old_iph
;
999 struct ipv6hdr
*old_ipv6h
;
1006 /* Validate the protocol headers before we try to use them. */
1007 if (skb
->protocol
== htons(ETH_P_8021Q
)) {
1008 if (unlikely(!pskb_may_pull(skb
, VLAN_ETH_HLEN
)))
1011 skb
->protocol
= vlan_eth_hdr(skb
)->h_vlan_encapsulated_proto
;
1012 skb_set_network_header(skb
, VLAN_ETH_HLEN
);
1015 if (skb
->protocol
== htons(ETH_P_IP
)) {
1016 if (unlikely(!pskb_may_pull(skb
, skb_network_header(skb
)
1017 + sizeof(struct iphdr
) - skb
->data
)))
1019 } else if (skb
->protocol
== htons(ETH_P_IPV6
)) {
1020 if (unlikely(!pskb_may_pull(skb
, skb_network_header(skb
)
1021 + sizeof(struct ipv6hdr
) - skb
->data
)))
1025 old_iph
= ip_hdr(skb
);
1026 old_ipv6h
= ipv6_hdr(skb
);
1028 iph
.tos
= mutable->port_config
.tos
;
1029 if (mutable->port_config
.flags
& GRE_F_TOS_INHERIT
) {
1030 if (skb
->protocol
== htons(ETH_P_IP
))
1031 iph
.tos
= old_iph
->tos
;
1032 else if (skb
->protocol
== htons(ETH_P_IPV6
))
1033 iph
.tos
= ipv6_get_dsfield(ipv6_hdr(skb
));
1035 iph
.tos
= ecn_encapsulate(iph
.tos
, skb
);
1038 struct flowi fl
= { .nl_u
= { .ip4_u
=
1039 { .daddr
= mutable->port_config
.daddr
,
1040 .saddr
= mutable->port_config
.saddr
,
1041 .tos
= RT_TOS(iph
.tos
) } },
1042 .proto
= IPPROTO_GRE
};
1044 if (ip_route_output_key(&init_net
, &rt
, &fl
))
1048 iph
.ttl
= mutable->port_config
.ttl
;
1049 if (mutable->port_config
.flags
& GRE_F_TTL_INHERIT
) {
1050 if (skb
->protocol
== htons(ETH_P_IP
))
1051 iph
.ttl
= old_iph
->ttl
;
1052 else if (skb
->protocol
== htons(ETH_P_IPV6
))
1053 iph
.ttl
= old_ipv6h
->hop_limit
;
1056 iph
.ttl
= dst_metric(&rt
->u
.dst
, RTAX_HOPLIMIT
);
1058 iph
.frag_off
= (mutable->port_config
.flags
& GRE_F_PMTUD
) ? htons(IP_DF
) : 0;
1060 mtu
= dst_mtu(&rt
->u
.dst
)
1062 - mutable->tunnel_hlen
1063 - (eth_hdr(skb
)->h_proto
== htons(ETH_P_8021Q
) ? VLAN_HLEN
: 0);
1067 if (skb
->protocol
== htons(ETH_P_IP
)) {
1068 iph
.frag_off
|= old_iph
->frag_off
& htons(IP_DF
);
1069 mtu
= max(mtu
, IP_MIN_MTU
);
1071 } else if (skb
->protocol
== htons(ETH_P_IPV6
))
1072 mtu
= max(mtu
, IPV6_MIN_MTU
);
1075 iph
.ihl
= sizeof(struct iphdr
) >> 2;
1076 iph
.protocol
= IPPROTO_GRE
;
1077 iph
.daddr
= rt
->rt_dst
;
1078 iph
.saddr
= rt
->rt_src
;
1083 skb_dst_set(skb
, &rt
->u
.dst
);
1085 /* If we are doing GSO on a pskb it is better to make sure that the
1086 * headroom is correct now. We will only have to copy the portion in
1087 * the linear data area and GSO will preserve headroom when it creates
1088 * the segments. This is particularly beneficial on Xen where we get
1089 * lots of GSO pskbs. Conversely, we delay copying if it is just to
1090 * get our own writable clone because GSO may do the copy for us. */
1091 max_headroom
= LL_RESERVED_SPACE(rt
->u
.dst
.dev
) + rt
->u
.dst
.header_len
1092 + mutable->tunnel_hlen
;
1094 if (skb_headroom(skb
) < max_headroom
) {
1095 skb
= check_headroom(skb
, max_headroom
);
1096 if (unlikely(IS_ERR(skb
))) {
1097 vport_record_error(vport
, VPORT_E_TX_DROPPED
);
1102 forward_ip_summed(skb
);
1103 vswitch_skb_checksum_setup(skb
);
1105 skb
= handle_gso(skb
);
1106 if (unlikely(IS_ERR(skb
))) {
1107 vport_record_error(vport
, VPORT_E_TX_DROPPED
);
1111 /* Process GSO segments. Try to do any work for the entire packet that
1112 * doesn't involve actually writing to it before this point. */
1115 struct sk_buff
*next_skb
= skb
->next
;
1118 orig_len
+= build_packet(vport
, mutable, &iph
, rt
, max_headroom
, mtu
, skb
);
1127 vport_record_error(vport
, VPORT_E_TX_ERROR
);
1132 static struct net_protocol gre_protocol_handlers
= {
1134 .err_handler
= gre_err
,
1142 err
= inet_add_protocol(&gre_protocol_handlers
, IPPROTO_GRE
);
1144 printk(KERN_WARNING
"openvswitch: cannot register gre protocol handler\n");
1152 tbl_destroy(port_table
, NULL
);
1153 inet_del_protocol(&gre_protocol_handlers
, IPPROTO_GRE
);
1157 set_config(const struct vport
*cur_vport
, struct mutable_config
*mutable,
1158 const void __user
*uconfig
)
1160 const struct vport
*old_vport
;
1161 const struct mutable_config
*old_mutable
;
1164 if (copy_from_user(&mutable->port_config
, uconfig
, sizeof(struct gre_port_config
)))
1167 if (mutable->port_config
.daddr
== 0)
1170 if (mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
) {
1171 port_type
= FIND_PORT_MATCH
;
1172 mutable->port_config
.in_key
= 0;
1174 port_type
= FIND_PORT_KEY
;
1176 old_vport
= find_port(mutable->port_config
.saddr
,
1177 mutable->port_config
.daddr
,
1178 mutable->port_config
.in_key
, port_type
,
1181 if (old_vport
&& old_vport
!= cur_vport
)
1184 if (mutable->port_config
.flags
& GRE_F_OUT_KEY_ACTION
)
1185 mutable->port_config
.out_key
= 0;
1187 mutable->tunnel_hlen
= sizeof(struct iphdr
) + GRE_HEADER_SECTION
;
1189 if (mutable->port_config
.flags
& GRE_F_OUT_CSUM
)
1190 mutable->tunnel_hlen
+= GRE_HEADER_SECTION
;
1192 if (mutable->port_config
.out_key
||
1193 mutable->port_config
.flags
& GRE_F_OUT_KEY_ACTION
)
1194 mutable->tunnel_hlen
+= GRE_HEADER_SECTION
;
1199 static struct vport
*
1200 gre_create(const char *name
, const void __user
*config
)
1202 struct vport
*vport
;
1203 struct gre_vport
*gre_vport
;
1206 vport
= vport_alloc(sizeof(struct gre_vport
), &gre_vport_ops
);
1207 if (IS_ERR(vport
)) {
1208 err
= PTR_ERR(vport
);
1212 gre_vport
= gre_vport_priv(vport
);
1214 strcpy(gre_vport
->name
, name
);
1216 gre_vport
->mutable = kmalloc(sizeof(struct mutable_config
), GFP_KERNEL
);
1217 if (!gre_vport
->mutable) {
1219 goto error_free_vport
;
1222 vport_gen_rand_ether_addr(gre_vport
->mutable->eth_addr
);
1223 gre_vport
->mutable->mtu
= ETH_DATA_LEN
;
1225 err
= set_config(NULL
, gre_vport
->mutable, config
);
1227 goto error_free_mutable
;
1229 err
= add_port(vport
);
1231 goto error_free_mutable
;
1236 kfree(gre_vport
->mutable);
1240 return ERR_PTR(err
);
1244 gre_modify(struct vport
*vport
, const void __user
*config
)
1246 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1247 struct mutable_config
*mutable;
1249 int update_hash
= 0;
1251 mutable = kmemdup(gre_vport
->mutable, sizeof(struct mutable_config
), GFP_KERNEL
);
1257 err
= set_config(vport
, mutable, config
);
1261 /* Only remove the port from the hash table if something that would
1262 * affect the lookup has changed. */
1263 if (gre_vport
->mutable->port_config
.saddr
!= mutable->port_config
.saddr
||
1264 gre_vport
->mutable->port_config
.daddr
!= mutable->port_config
.daddr
||
1265 gre_vport
->mutable->port_config
.in_key
!= mutable->port_config
.in_key
||
1266 (gre_vport
->mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
) !=
1267 (mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
))
1271 /* This update is not atomic but the lookup uses the config, which
1272 * serves as an inherent double check. */
1274 err
= del_port(vport
);
1279 assign_config_rcu(vport
, mutable);
1282 err
= add_port(vport
);
1296 gre_destroy(struct vport
*vport
)
1298 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1300 const struct mutable_config
*old_mutable
;
1302 /* Do a hash table lookup to make sure that the port exists. It should
1303 * exist but might not if a modify failed earlier. */
1304 if (gre_vport
->mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
)
1305 port_type
= FIND_PORT_MATCH
;
1307 port_type
= FIND_PORT_KEY
;
1309 if (vport
== find_port(gre_vport
->mutable->port_config
.saddr
,
1310 gre_vport
->mutable->port_config
.daddr
,
1311 gre_vport
->mutable->port_config
.in_key
, port_type
, &old_mutable
))
1314 kfree(gre_vport
->mutable);
/* Updates the port MTU by duplicating the current configuration and
 * publishing the copy via RCU (readers never see a half-updated config).
 * NOTE(review): the kmemdup() NULL check, the 'mutable->mtu = mtu'
 * assignment and the return are missing from this extract -- confirm
 * against the full source. */
1321 gre_set_mtu(struct vport
*vport
, int mtu
)
1323 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1324 struct mutable_config
*mutable;
1326 mutable = kmemdup(gre_vport
->mutable, sizeof(struct mutable_config
), GFP_KERNEL
);
1331 assign_config_rcu(vport
, mutable);
/* Updates the port's Ethernet address using the copy-then-publish RCU
 * pattern shared with gre_set_mtu().  NOTE(review): the kmemdup() NULL
 * check and the return are missing from this extract -- confirm against
 * the full source. */
1337 gre_set_addr(struct vport
*vport
, const unsigned char *addr
)
1339 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1340 struct mutable_config
*mutable;
1342 mutable = kmemdup(gre_vport
->mutable, sizeof(struct mutable_config
), GFP_KERNEL
);
1346 memcpy(mutable->eth_addr
, addr
, ETH_ALEN
);
1347 assign_config_rcu(vport
, mutable);
1354 gre_get_name(const struct vport
*vport
)
1356 const struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1357 return gre_vport
->name
;
1360 static const unsigned char *
1361 gre_get_addr(const struct vport
*vport
)
1363 const struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1364 return rcu_dereference(gre_vport
->mutable)->eth_addr
;
1368 gre_get_mtu(const struct vport
*vport
)
1370 const struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1371 return rcu_dereference(gre_vport
->mutable)->mtu
;
/* Operations vector registering this module's callbacks with the generic
 * vport layer.  VPORT_F_TUN_ID marks the port as carrying a tunnel key in
 * OVS_CB; VPORT_F_GEN_STATS requests generic stat accounting.
 * NOTE(review): some members (e.g. .type and the .send handler, plus the
 * closing brace) are missing from this extract -- confirm against the full
 * source. */
1374 struct vport_ops gre_vport_ops
= {
1376 .flags
= VPORT_F_GEN_STATS
| VPORT_F_TUN_ID
,
1379 .create
= gre_create
,
1380 .modify
= gre_modify
,
1381 .destroy
= gre_destroy
,
1382 .set_mtu
= gre_set_mtu
,
1383 .set_addr
= gre_set_addr
,
1384 .get_name
= gre_get_name
,
1385 .get_addr
= gre_get_addr
,
1386 .get_dev_flags
= vport_gen_get_dev_flags
,
1387 .is_running
= vport_gen_is_running
,
1388 .get_operstate
= vport_gen_get_operstate
,
1389 .get_mtu
= gre_get_mtu
,