/*
 * Copyright (c) 2010 Nicira Networks.
 * Distributed under the terms of the GNU GPL version 2.
 *
 * Significant portions of this file may be copied from parts of the Linux
 * kernel, by Linus Torvalds and others.
 */
#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/if_tunnel.h>
#include <linux/if_vlan.h>
#include <linux/in_route.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
#include <linux/version.h>

#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/protocol.h>
#include <net/route.h>

#include "openvswitch/gre.h"
/* The absolute minimum fragment size.  Note that there are many other
 * definitions of the minimum MTU. */
#define IP_MIN_MTU 68

/* The GRE header is composed of a series of sections: a base and then a
 * variable number of options. */
#define GRE_HEADER_SECTION 4
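/* Layout of the GRE header as built and parsed below (RFC 2784/2890): a
 * 4-byte base (2 bytes of flags, 2 bytes of protocol type) followed by up to
 * three optional 4-byte sections, each enabled by a flag bit: checksum
 * (GRE_CSUM), key (GRE_KEY), and sequence number (GRE_SEQ).
 * GRE_HEADER_SECTION is the size of each of these units. */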
struct mutable_config {
	struct rcu_head rcu;

	unsigned char eth_addr[ETH_ALEN];
	unsigned int mtu;
	struct gre_port_config port_config;

	int tunnel_hlen; /* Tunnel header length. */
};

struct gre_vport {
	struct tbl_node tbl_node;

	char name[IFNAMSIZ];

	/* Protected by RCU. */
	struct mutable_config *mutable;
};
struct vport_ops gre_vport_ops;
/* Protected by RCU. */
static struct tbl *port_table;

/* These are just used as an optimization: they don't require any kind of
 * synchronization because we could have just as easily read the value before
 * the port change happened. */
static unsigned int key_local_remote_ports;
static unsigned int key_remote_ports;
static unsigned int local_remote_ports;
static unsigned int remote_ports;
static inline struct gre_vport *gre_vport_priv(const struct vport *vport)
{
	return vport_priv(vport);
}

static inline struct vport *gre_vport_to_vport(const struct gre_vport *gre_vport)
{
	return vport_from_priv(gre_vport);
}

static inline struct gre_vport *gre_vport_table_cast(const struct tbl_node *node)
{
	return container_of(node, struct gre_vport, tbl_node);
}
static void free_config(struct rcu_head *rcu)
{
	struct mutable_config *c = container_of(rcu, struct mutable_config, rcu);

	kfree(c);
}
static void assign_config_rcu(struct vport *vport,
			      struct mutable_config *new_config)
{
	struct gre_vport *gre_vport = gre_vport_priv(vport);
	struct mutable_config *old_config;

	old_config = rcu_dereference(gre_vport->mutable);
	rcu_assign_pointer(gre_vport->mutable, new_config);
	call_rcu(&old_config->rcu, free_config);
}
static unsigned int *find_port_pool(const struct mutable_config *mutable)
{
	if (mutable->port_config.flags & GRE_F_IN_KEY_MATCH) {
		if (mutable->port_config.saddr)
			return &local_remote_ports;
		else
			return &remote_ports;
	} else {
		if (mutable->port_config.saddr)
			return &key_local_remote_ports;
		else
			return &key_remote_ports;
	}
}
enum lookup_key {
	LOOKUP_SADDR		= 0,
	LOOKUP_DADDR		= 1,
	LOOKUP_KEY		= 2,
	LOOKUP_KEY_MATCH	= 3,
};

struct port_lookup_key {
	u32 vals[4];	/* Contains enum lookup_key keys. */
	const struct mutable_config *mutable;
};
/* Modifies 'target' to store the rcu_dereferenced pointer that was used to do
 * the comparison. */
static int port_cmp(const struct tbl_node *node, void *target)
{
	const struct gre_vport *gre_vport = gre_vport_table_cast(node);
	struct port_lookup_key *lookup = target;

	lookup->mutable = rcu_dereference(gre_vport->mutable);

	return ((lookup->mutable->port_config.flags & GRE_F_IN_KEY_MATCH) ==
			lookup->vals[LOOKUP_KEY_MATCH]) &&
	       lookup->mutable->port_config.daddr == lookup->vals[LOOKUP_DADDR] &&
	       lookup->mutable->port_config.in_key == lookup->vals[LOOKUP_KEY] &&
	       lookup->mutable->port_config.saddr == lookup->vals[LOOKUP_SADDR];
}
static u32 port_hash(struct port_lookup_key *lookup)
{
	return jhash2(lookup->vals, ARRAY_SIZE(lookup->vals), 0);
}
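/* Because the hash covers all four slots in 'vals', lookups that wildcard the
 * source address must zero LOOKUP_SADDR before hashing, exactly as
 * find_port() does below; otherwise a wildcarded entry could never match. */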
static int add_port(struct vport *vport)
{
	struct gre_vport *gre_vport = gre_vport_priv(vport);
	struct port_lookup_key lookup;
	int err;

	if (!port_table) {
		struct tbl *new_table;

		new_table = tbl_create(0);
		if (!new_table)
			return -ENOMEM;

		rcu_assign_pointer(port_table, new_table);

	} else if (tbl_count(port_table) > tbl_n_buckets(port_table)) {
		struct tbl *old_table = port_table;
		struct tbl *new_table;

		new_table = tbl_expand(old_table);
		if (IS_ERR(new_table))
			return PTR_ERR(new_table);

		rcu_assign_pointer(port_table, new_table);
		tbl_deferred_destroy(old_table, NULL);
	}

	lookup.vals[LOOKUP_SADDR] = gre_vport->mutable->port_config.saddr;
	lookup.vals[LOOKUP_DADDR] = gre_vport->mutable->port_config.daddr;
	lookup.vals[LOOKUP_KEY] = gre_vport->mutable->port_config.in_key;
	lookup.vals[LOOKUP_KEY_MATCH] = gre_vport->mutable->port_config.flags
					& GRE_F_IN_KEY_MATCH;

	err = tbl_insert(port_table, &gre_vport->tbl_node, port_hash(&lookup));
	if (err)
		return err;

	(*find_port_pool(gre_vport->mutable))++;

	return 0;
}
static int del_port(struct vport *vport)
{
	struct gre_vport *gre_vport = gre_vport_priv(vport);
	int err;

	err = tbl_remove(port_table, &gre_vport->tbl_node);
	if (err)
		return err;

	(*find_port_pool(gre_vport->mutable))--;

	return 0;
}
#define FIND_PORT_KEY		(1 << 0)
#define FIND_PORT_MATCH		(1 << 1)
#define FIND_PORT_ANY		(FIND_PORT_KEY | FIND_PORT_MATCH)
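/* Lookup order in find_port(): an exact (saddr, daddr, key) entry is
 * preferred, then a keyed entry with a wildcarded source address, then a
 * flow-based (GRE_F_IN_KEY_MATCH) entry with an exact source address, and
 * finally a flow-based entry with a wildcarded source address. */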
static struct vport *find_port(__be32 saddr, __be32 daddr, __be32 key,
			       int port_type,
			       const struct mutable_config **mutable)
{
	struct port_lookup_key lookup;
	struct tbl *table = rcu_dereference(port_table);
	struct tbl_node *tbl_node;

	if (!table)
		return NULL;

	lookup.vals[LOOKUP_SADDR] = saddr;
	lookup.vals[LOOKUP_DADDR] = daddr;

	if (port_type & FIND_PORT_KEY) {
		lookup.vals[LOOKUP_KEY] = key;
		lookup.vals[LOOKUP_KEY_MATCH] = 0;

		if (key_local_remote_ports) {
			tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
			if (tbl_node)
				goto found;
		}

		if (key_remote_ports) {
			lookup.vals[LOOKUP_SADDR] = 0;

			tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
			if (tbl_node)
				goto found;

			lookup.vals[LOOKUP_SADDR] = saddr;
		}
	}

	if (port_type & FIND_PORT_MATCH) {
		lookup.vals[LOOKUP_KEY] = 0;
		lookup.vals[LOOKUP_KEY_MATCH] = GRE_F_IN_KEY_MATCH;

		if (local_remote_ports) {
			tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
			if (tbl_node)
				goto found;
		}

		if (remote_ports) {
			lookup.vals[LOOKUP_SADDR] = 0;

			tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
			if (tbl_node)
				goto found;
		}
	}

	return NULL;

found:
	*mutable = lookup.mutable;
	return gre_vport_to_vport(gre_vport_table_cast(tbl_node));
}
static bool check_ipv4_address(__be32 addr)
{
	if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr)
	    || ipv4_is_loopback(addr) || ipv4_is_zeronet(addr))
		return false;

	return true;
}
static bool ipv4_should_icmp(struct sk_buff *skb)
{
	struct iphdr *old_iph = ip_hdr(skb);

	/* Don't respond to L2 broadcast. */
	if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
		return false;

	/* Don't respond to L3 broadcast or invalid addresses. */
	if (!check_ipv4_address(old_iph->daddr) ||
	    !check_ipv4_address(old_iph->saddr))
		return false;

	/* Only respond to the first fragment. */
	if (old_iph->frag_off & htons(IP_OFFSET))
		return false;

	/* Don't respond to ICMP error messages. */
	if (old_iph->protocol == IPPROTO_ICMP) {
		u8 icmp_type, *icmp_typep;

		icmp_typep = skb_header_pointer(skb, (u8 *)old_iph +
						(old_iph->ihl << 2) +
						offsetof(struct icmphdr, type) -
						skb->data, sizeof(icmp_type),
						&icmp_type);
		if (!icmp_typep)
			return false;

		if (*icmp_typep > NR_ICMP_TYPES
			|| (*icmp_typep <= ICMP_PARAMETERPROB
			&& *icmp_typep != ICMP_ECHOREPLY
			&& *icmp_typep != ICMP_ECHO))
			return false;
	}

	return true;
}
static void ipv4_build_icmp(struct sk_buff *skb, struct sk_buff *nskb,
			    unsigned int mtu, unsigned int payload_length)
{
	struct iphdr *iph, *old_iph = ip_hdr(skb);
	struct icmphdr *icmph;
	u8 *payload;

	iph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
	icmph = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr));
	payload = skb_put(nskb, payload_length);

	/* IP */
	iph->version = 4;
	iph->ihl = sizeof(struct iphdr) >> 2;
	iph->tos = (old_iph->tos & IPTOS_TOS_MASK) |
		   IPTOS_PREC_INTERNETCONTROL;
	iph->tot_len = htons(sizeof(struct iphdr)
			     + sizeof(struct icmphdr)
			     + payload_length);
	get_random_bytes(&iph->id, sizeof(iph->id));
	iph->frag_off = 0;
	iph->ttl = IPDEFTTL;
	iph->protocol = IPPROTO_ICMP;
	iph->daddr = old_iph->saddr;
	iph->saddr = old_iph->daddr;

	ip_send_check(iph);

	/* ICMP */
	icmph->type = ICMP_DEST_UNREACH;
	icmph->code = ICMP_FRAG_NEEDED;
	icmph->checksum = 0;
	icmph->un.gateway = htonl(mtu);

	nskb->csum = csum_partial((u8 *)icmph, sizeof(struct icmphdr), 0);
	nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_iph - skb->data,
					    payload, payload_length,
					    nskb->csum);
	icmph->checksum = csum_fold(nskb->csum);
}
static bool ipv6_should_icmp(struct sk_buff *skb)
{
	struct ipv6hdr *old_ipv6h = ipv6_hdr(skb);
	int addr_type;
	int payload_off = (u8 *)(old_ipv6h + 1) - skb->data;
	u8 nexthdr = ipv6_hdr(skb)->nexthdr;

	/* Check source address is valid. */
	addr_type = ipv6_addr_type(&old_ipv6h->saddr);
	if (addr_type & IPV6_ADDR_MULTICAST || addr_type == IPV6_ADDR_ANY)
		return false;

	/* Don't reply to unspecified addresses. */
	if (ipv6_addr_type(&old_ipv6h->daddr) == IPV6_ADDR_ANY)
		return false;

	/* Don't respond to ICMP error messages. */
	payload_off = ipv6_skip_exthdr(skb, payload_off, &nexthdr);
	if (payload_off < 0)
		return false;

	if (nexthdr == NEXTHDR_ICMP) {
		u8 icmp_type, *icmp_typep;

		icmp_typep = skb_header_pointer(skb, payload_off +
						offsetof(struct icmp6hdr,
							icmp6_type),
						sizeof(icmp_type), &icmp_type);

		if (!icmp_typep || !(*icmp_typep & ICMPV6_INFOMSG_MASK))
			return false;
	}

	return true;
}
static void ipv6_build_icmp(struct sk_buff *skb, struct sk_buff *nskb,
			    unsigned int mtu, unsigned int payload_length)
{
	struct ipv6hdr *ipv6h, *old_ipv6h = ipv6_hdr(skb);
	struct icmp6hdr *icmp6h;
	u8 *payload;

	ipv6h = (struct ipv6hdr *)skb_put(nskb, sizeof(struct ipv6hdr));
	icmp6h = (struct icmp6hdr *)skb_put(nskb, sizeof(struct icmp6hdr));
	payload = skb_put(nskb, payload_length);

	/* IPv6 */
	ipv6h->version = 6;
	ipv6h->priority = 0;
	memset(&ipv6h->flow_lbl, 0, sizeof(ipv6h->flow_lbl));
	ipv6h->payload_len = htons(sizeof(struct icmp6hdr)
				   + payload_length);
	ipv6h->nexthdr = NEXTHDR_ICMP;
	ipv6h->hop_limit = IPV6_DEFAULT_HOPLIMIT;
	ipv6_addr_copy(&ipv6h->daddr, &old_ipv6h->saddr);
	ipv6_addr_copy(&ipv6h->saddr, &old_ipv6h->daddr);

	/* ICMPv6 */
	icmp6h->icmp6_type = ICMPV6_PKT_TOOBIG;
	icmp6h->icmp6_code = 0;
	icmp6h->icmp6_cksum = 0;
	icmp6h->icmp6_mtu = htonl(mtu);

	nskb->csum = csum_partial((u8 *)icmp6h, sizeof(struct icmp6hdr), 0);
	nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_ipv6h - skb->data,
					    payload, payload_length,
					    nskb->csum);
	icmp6h->icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
					      sizeof(struct icmp6hdr)
					      + payload_length,
					      ipv6h->nexthdr, nskb->csum);
}
static bool send_frag_needed(struct vport *vport,
			     const struct mutable_config *mutable,
			     struct sk_buff *skb, unsigned int mtu,
			     __be32 flow_key)
{
	unsigned int eth_hdr_len = ETH_HLEN;
	unsigned int total_length, header_length, payload_length;
	struct ethhdr *eh, *old_eh = eth_hdr(skb);
	struct sk_buff *nskb;

	/* Sanity check */
	if (skb->protocol == htons(ETH_P_IP)) {
		if (mtu < IP_MIN_MTU)
			return false;

		if (!ipv4_should_icmp(skb))
			return true;
	} else {
		if (mtu < IPV6_MIN_MTU)
			return false;

		/* In theory we should do PMTUD on IPv6 multicast messages but
		 * we don't have an address to send from so just fragment. */
		if (ipv6_addr_type(&ipv6_hdr(skb)->daddr) & IPV6_ADDR_MULTICAST)
			return false;

		if (!ipv6_should_icmp(skb))
			return true;
	}

	/* Allocate */
	if (old_eh->h_proto == htons(ETH_P_8021Q))
		eth_hdr_len = VLAN_ETH_HLEN;

	payload_length = skb->len - eth_hdr_len;
	if (skb->protocol == htons(ETH_P_IP)) {
		header_length = sizeof(struct iphdr) + sizeof(struct icmphdr);
		total_length = min_t(unsigned int, header_length +
						   payload_length, 576);
	} else {
		header_length = sizeof(struct ipv6hdr) +
				sizeof(struct icmp6hdr);
		total_length = min_t(unsigned int, header_length +
						   payload_length, IPV6_MIN_MTU);
	}
	total_length = min(total_length, mutable->mtu);
	payload_length = total_length - header_length;

	nskb = dev_alloc_skb(NET_IP_ALIGN + eth_hdr_len + header_length +
			     payload_length);
	if (!nskb)
		return false;

	skb_reserve(nskb, NET_IP_ALIGN);

	/* Ethernet / VLAN */
	eh = (struct ethhdr *)skb_put(nskb, eth_hdr_len);
	memcpy(eh->h_dest, old_eh->h_source, ETH_ALEN);
	memcpy(eh->h_source, mutable->eth_addr, ETH_ALEN);
	nskb->protocol = eh->h_proto = old_eh->h_proto;
	if (old_eh->h_proto == htons(ETH_P_8021Q)) {
		struct vlan_ethhdr *vh = (struct vlan_ethhdr *)eh;

		vh->h_vlan_TCI = vlan_eth_hdr(skb)->h_vlan_TCI;
		vh->h_vlan_encapsulated_proto = skb->protocol;
	}
	skb_reset_mac_header(nskb);

	/* Protocol */
	if (skb->protocol == htons(ETH_P_IP))
		ipv4_build_icmp(skb, nskb, mtu, payload_length);
	else
		ipv6_build_icmp(skb, nskb, mtu, payload_length);

	/* Assume that flow based keys are symmetric with respect to input
	 * and output and use the key that we were going to put on the
	 * outgoing packet for the fake received packet.  If the keys are
	 * not symmetric then PMTUD needs to be disabled since we won't have
	 * any way of synthesizing packets. */
	if (mutable->port_config.flags & GRE_F_IN_KEY_MATCH &&
	    mutable->port_config.flags & GRE_F_OUT_KEY_ACTION)
		OVS_CB(nskb)->tun_id = flow_key;

	compute_ip_summed(nskb, false);
	vport_receive(vport, nskb);

	return true;
}
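/* Note that the synthesized "fragmentation needed" packet is handed to
 * vport_receive(), i.e. it is injected back into the datapath as if it had
 * arrived on the tunnel, so that the original sender behind the switch sees
 * the ICMP error and can lower its path MTU. */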
static struct sk_buff *check_headroom(struct sk_buff *skb, int headroom)
{
	if (skb_headroom(skb) < headroom ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *nskb = skb_realloc_headroom(skb, headroom);
		if (!nskb) {
			kfree_skb(skb);
			return ERR_PTR(-ENOMEM);
		}

		set_skb_csum_bits(skb, nskb);

		if (skb->sk)
			skb_set_owner_w(nskb, skb->sk);

		kfree_skb(skb);
		return nskb;
	}

	return skb;
}
static void create_gre_header(struct sk_buff *skb,
			      const struct mutable_config *mutable)
{
	struct iphdr *iph = ip_hdr(skb);
	__be16 *flags = (__be16 *)(iph + 1);
	__be16 *protocol = flags + 1;
	__be32 *options = (__be32 *)((u8 *)iph + mutable->tunnel_hlen
					       - GRE_HEADER_SECTION);

	*protocol = htons(ETH_P_TEB);
	*flags = 0;

	/* Work backwards over the options so the checksum is last. */
	if (mutable->port_config.out_key ||
	    mutable->port_config.flags & GRE_F_OUT_KEY_ACTION) {
		*flags |= GRE_KEY;

		if (mutable->port_config.flags & GRE_F_OUT_KEY_ACTION)
			*options = OVS_CB(skb)->tun_id;
		else
			*options = mutable->port_config.out_key;

		options--;
	}

	if (mutable->port_config.flags & GRE_F_OUT_CSUM) {
		*flags |= GRE_CSUM;

		*options = 0;
		*(__sum16 *)options = csum_fold(skb_checksum(skb,
						sizeof(struct iphdr),
						skb->len - sizeof(struct iphdr),
						0));
	}
}
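/* The resulting wire format when both an out_key and output checksumming are
 * enabled is: outer IP header, 2 bytes of GRE flags (GRE_CSUM | GRE_KEY),
 * 2 bytes of protocol (ETH_P_TEB), 4 bytes of checksum, 4 bytes of key, then
 * the encapsulated Ethernet frame.  The options are filled in back to front
 * so the checksum, which covers everything after the outer IP header, can be
 * computed last. */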
static int check_checksum(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);
	__be16 flags = *(__be16 *)(iph + 1);
	__sum16 csum = 0;

	if (flags & GRE_CSUM) {
		switch (skb->ip_summed) {
		case CHECKSUM_COMPLETE:
			csum = csum_fold(skb->csum);
			if (!csum)
				break;
			/* Fall through. */

		case CHECKSUM_NONE:
			skb->csum = 0;
			csum = __skb_checksum_complete(skb);
			skb->ip_summed = CHECKSUM_COMPLETE;
			break;
		}
	}

	return (csum == 0);
}
static int parse_gre_header(struct iphdr *iph, __be16 *flags, __be32 *key)
{
	/* IP and ICMP protocol handlers check that the IHL is valid. */
	__be16 *flagsp = (__be16 *)((u8 *)iph + (iph->ihl << 2));
	__be16 *protocol = flagsp + 1;
	__be32 *options = (__be32 *)(protocol + 1);
	int hdr_len;

	*flags = *flagsp;

	if (*flags & (GRE_VERSION | GRE_ROUTING))
		return -EINVAL;

	if (*protocol != htons(ETH_P_TEB))
		return -EINVAL;

	hdr_len = GRE_HEADER_SECTION;

	if (*flags & GRE_CSUM) {
		hdr_len += GRE_HEADER_SECTION;
		options++;
	}

	if (*flags & GRE_KEY) {
		hdr_len += GRE_HEADER_SECTION;

		*key = *options;
		options++;
	} else
		*key = 0;

	if (*flags & GRE_SEQ)
		hdr_len += GRE_HEADER_SECTION;

	return hdr_len;
}
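/* parse_gre_header() returns the length of the GRE header in bytes on
 * success, or a negative value if the header is unsupported (non-zero
 * version or routing present, or a non-Ethernet payload).  A sequence number
 * option is accepted and skipped but otherwise ignored. */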
static u8 ecn_encapsulate(u8 tos, struct sk_buff *skb)
{
	u8 inner;

	if (skb->protocol == htons(ETH_P_IP))
		inner = ((struct iphdr *)skb_network_header(skb))->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((struct ipv6hdr *)skb_network_header(skb));
	else
		inner = 0;

	return INET_ECN_encapsulate(tos, inner);
}
static void ecn_decapsulate(u8 tos, struct sk_buff *skb)
{
	if (INET_ECN_is_ce(tos)) {
		__be16 protocol = skb->protocol;
		unsigned int nw_header = skb_network_header(skb) - skb->data;

		if (skb->protocol == htons(ETH_P_8021Q)) {
			if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
				return;

			protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
			nw_header += VLAN_HLEN;
		}

		if (protocol == htons(ETH_P_IP)) {
			if (unlikely(!pskb_may_pull(skb, nw_header
			    + sizeof(struct iphdr))))
				return;

			IP_ECN_set_ce((struct iphdr *)(nw_header + skb->data));
		} else if (protocol == htons(ETH_P_IPV6)) {
			if (unlikely(!pskb_may_pull(skb, nw_header
			    + sizeof(struct ipv6hdr))))
				return;

			IP6_ECN_set_ce((struct ipv6hdr *)(nw_header
							  + skb->data));
		}
	}
}
static struct sk_buff *handle_gso(struct sk_buff *skb)
{
	if (skb_is_gso(skb)) {
		struct sk_buff *nskb = skb_gso_segment(skb, NETIF_F_SG);

		dev_kfree_skb(skb);
		return nskb;
	}

	return skb;
}
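/* After handle_gso() the caller may hold a list of segments chained through
 * skb->next rather than a single skb, so gre_send() below walks that list
 * and encapsulates each segment individually. */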
static int handle_csum_offload(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		return skb_checksum_help(skb);
	else {
		skb->ip_summed = CHECKSUM_NONE;
		return 0;
	}
}
/* Called with rcu_read_lock. */
static void gre_err(struct sk_buff *skb, u32 info)
{
	struct vport *vport;
	const struct mutable_config *mutable;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	int mtu = ntohs(icmp_hdr(skb)->un.frag.mtu);

	struct iphdr *iph;
	__be16 flags;
	__be32 key;
	int tunnel_hdr_len, tot_hdr_len;
	unsigned int orig_mac_header;
	unsigned int orig_nw_header;

	if (type != ICMP_DEST_UNREACH || code != ICMP_FRAG_NEEDED)
		return;

	/* The minimum size packet that we would actually be able to process:
	 * encapsulating IP header, minimum GRE header, Ethernet header,
	 * inner IPv4 header. */
	if (!pskb_may_pull(skb, sizeof(struct iphdr) + GRE_HEADER_SECTION +
				ETH_HLEN + sizeof(struct iphdr)))
		return;

	iph = (struct iphdr *)skb->data;

	tunnel_hdr_len = parse_gre_header(iph, &flags, &key);
	if (tunnel_hdr_len < 0)
		return;

	vport = find_port(iph->saddr, iph->daddr, key, FIND_PORT_ANY, &mutable);
	if (!vport)
		return;

	/* Packets received by this function were previously sent by us, so
	 * any comparisons should be to the output values, not the input.
	 * However, it's not really worth it to have a hash table based on
	 * output keys (especially since ICMP error handling of tunneled
	 * packets isn't that reliable anyways).  Therefore, we do a lookup
	 * based on the out key as if it were the in key and then check to
	 * see if the input and output keys are the same. */
	if (mutable->port_config.in_key != mutable->port_config.out_key)
		return;

	if (!!(mutable->port_config.flags & GRE_F_IN_KEY_MATCH) !=
	    !!(mutable->port_config.flags & GRE_F_OUT_KEY_ACTION))
		return;

	if ((mutable->port_config.flags & GRE_F_OUT_CSUM) && !(flags & GRE_CSUM))
		return;

	tunnel_hdr_len += iph->ihl << 2;

	orig_mac_header = skb_mac_header(skb) - skb->data;
	orig_nw_header = skb_network_header(skb) - skb->data;
	skb_set_mac_header(skb, tunnel_hdr_len);

	tot_hdr_len = tunnel_hdr_len + ETH_HLEN;

	skb->protocol = eth_hdr(skb)->h_proto;
	if (skb->protocol == htons(ETH_P_8021Q)) {
		tot_hdr_len += VLAN_HLEN;
		skb->protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
	}

	skb_set_network_header(skb, tot_hdr_len);
	mtu -= tot_hdr_len;

	if (skb->protocol == htons(ETH_P_IP))
		tot_hdr_len += sizeof(struct iphdr);
	else if (skb->protocol == htons(ETH_P_IPV6))
		tot_hdr_len += sizeof(struct ipv6hdr);
	else
		goto out;

	if (!pskb_may_pull(skb, tot_hdr_len))
		goto out;

	if (skb->protocol == htons(ETH_P_IP)) {
		if (mtu < IP_MIN_MTU) {
			if (ntohs(ip_hdr(skb)->tot_len) >= IP_MIN_MTU)
				mtu = IP_MIN_MTU;
			else
				goto out;
		}

	} else if (skb->protocol == htons(ETH_P_IPV6)) {
		if (mtu < IPV6_MIN_MTU) {
			unsigned int packet_length = sizeof(struct ipv6hdr) +
					ntohs(ipv6_hdr(skb)->payload_len);

			if (packet_length >= IPV6_MIN_MTU
			    || ntohs(ipv6_hdr(skb)->payload_len) == 0)
				mtu = IPV6_MIN_MTU;
			else
				goto out;
		}
	}

	__pskb_pull(skb, tunnel_hdr_len);
	send_frag_needed(vport, mutable, skb, mtu, key);
	skb_push(skb, tunnel_hdr_len);

out:
	skb_set_mac_header(skb, orig_mac_header);
	skb_set_network_header(skb, orig_nw_header);
	skb->protocol = htons(ETH_P_IP);
}
/* Called with rcu_read_lock. */
static int gre_rcv(struct sk_buff *skb)
{
	struct vport *vport;
	const struct mutable_config *mutable;
	int hdr_len;
	struct iphdr *iph;
	__be16 flags;
	__be32 key;

	if (!pskb_may_pull(skb, GRE_HEADER_SECTION + ETH_HLEN))
		goto error;

	if (!check_checksum(skb))
		goto error;

	iph = ip_hdr(skb);

	hdr_len = parse_gre_header(iph, &flags, &key);
	if (hdr_len < 0)
		goto error;

	vport = find_port(iph->daddr, iph->saddr, key, FIND_PORT_ANY, &mutable);
	if (!vport) {
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
		goto error;
	}

	if ((mutable->port_config.flags & GRE_F_IN_CSUM) && !(flags & GRE_CSUM)) {
		vport_record_error(vport, VPORT_E_RX_CRC);
		goto error;
	}

	if (!pskb_pull(skb, hdr_len) || !pskb_may_pull(skb, ETH_HLEN)) {
		vport_record_error(vport, VPORT_E_RX_ERROR);
		goto error;
	}

	skb->pkt_type = PACKET_HOST;
	skb->protocol = eth_type_trans(skb, skb->dev);
	skb_postpull_rcsum(skb, skb_transport_header(skb), hdr_len + ETH_HLEN);

	skb_reset_network_header(skb);

	ecn_decapsulate(iph->tos, skb);

	if (mutable->port_config.flags & GRE_F_IN_KEY_MATCH)
		OVS_CB(skb)->tun_id = key;
	else
		OVS_CB(skb)->tun_id = 0;

	skb_push(skb, ETH_HLEN);
	compute_ip_summed(skb, false);

	vport_receive(vport, skb);

	return 0;

error:
	kfree_skb(skb);
	return 0;
}
static int build_packet(struct vport *vport, const struct mutable_config *mutable,
			struct iphdr *iph, struct rtable *rt, int max_headroom,
			int mtu, struct sk_buff *skb)
{
	int err;
	struct iphdr *new_iph;
	int orig_len = skb->len;
	__be16 frag_off = iph->frag_off;

	skb = check_headroom(skb, max_headroom);
	if (unlikely(IS_ERR(skb)))
		goto error;

	err = handle_csum_offload(skb);
	if (err)
		goto error_free;

	if (skb->protocol == htons(ETH_P_IP)) {
		struct iphdr *old_iph = ip_hdr(skb);

		if ((old_iph->frag_off & htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			if (send_frag_needed(vport, mutable, skb, mtu,
					     OVS_CB(skb)->tun_id))
				goto error_free;
		}

	} else if (skb->protocol == htons(ETH_P_IPV6)) {
		unsigned int packet_length = skb->len - ETH_HLEN
			- (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);

		/* IPv6 requires PMTUD if the packet is above the minimum MTU. */
		if (packet_length > IPV6_MIN_MTU)
			frag_off = htons(IP_DF);

		if (mtu < packet_length) {
			if (send_frag_needed(vport, mutable, skb, mtu,
					     OVS_CB(skb)->tun_id))
				goto error_free;
		}
	}

	skb_reset_transport_header(skb);
	new_iph = (struct iphdr *)skb_push(skb, mutable->tunnel_hlen);
	skb_reset_network_header(skb);

	memcpy(new_iph, iph, sizeof(struct iphdr));
	new_iph->frag_off = frag_off;
	ip_select_ident(new_iph, &rt->u.dst, NULL);

	create_gre_header(skb, mutable);

	/* Allow our local IP stack to fragment the outer packet even if the
	 * DF bit is set as a last resort. */
	skb->local_df = 1;

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags = 0;

	err = ip_local_out(skb);
	if (likely(net_xmit_eval(err) == 0))
		return orig_len;
	else {
		vport_record_error(vport, VPORT_E_TX_ERROR);
		return 0;
	}

error_free:
	kfree_skb(skb);
error:
	vport_record_error(vport, VPORT_E_TX_DROPPED);

	return 0;
}
static int gre_send(struct vport *vport, struct sk_buff *skb)
{
	struct gre_vport *gre_vport = gre_vport_priv(vport);
	const struct mutable_config *mutable = rcu_dereference(gre_vport->mutable);

	struct iphdr *old_iph;
	struct ipv6hdr *old_ipv6h;
	struct iphdr iph;
	struct rtable *rt;
	int max_headroom;
	int mtu;
	int orig_len;

	/* Validate the protocol headers before we try to use them. */
	if (skb->protocol == htons(ETH_P_8021Q)) {
		if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
			goto error_free;

		skb->protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
		skb_set_network_header(skb, VLAN_ETH_HLEN);
	}

	if (skb->protocol == htons(ETH_P_IP)) {
		if (unlikely(!pskb_may_pull(skb, skb_network_header(skb)
		    + sizeof(struct iphdr) - skb->data)))
			skb->protocol = 0;
	} else if (skb->protocol == htons(ETH_P_IPV6)) {
		if (unlikely(!pskb_may_pull(skb, skb_network_header(skb)
		    + sizeof(struct ipv6hdr) - skb->data)))
			skb->protocol = 0;
	}

	old_iph = ip_hdr(skb);
	old_ipv6h = ipv6_hdr(skb);

	iph.tos = mutable->port_config.tos;
	if (mutable->port_config.flags & GRE_F_TOS_INHERIT) {
		if (skb->protocol == htons(ETH_P_IP))
			iph.tos = old_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph.tos = ipv6_get_dsfield(ipv6_hdr(skb));
	}
	iph.tos = ecn_encapsulate(iph.tos, skb);

	{
		struct flowi fl = { .nl_u = { .ip4_u =
					      { .daddr = mutable->port_config.daddr,
						.saddr = mutable->port_config.saddr,
						.tos = RT_TOS(iph.tos) } },
				    .proto = IPPROTO_GRE };

		if (ip_route_output_key(&init_net, &rt, &fl))
			goto error_free;
	}

	iph.ttl = mutable->port_config.ttl;
	if (mutable->port_config.flags & GRE_F_TTL_INHERIT) {
		if (skb->protocol == htons(ETH_P_IP))
			iph.ttl = old_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph.ttl = old_ipv6h->hop_limit;
	}
	if (!iph.ttl)
		iph.ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);

	iph.frag_off = (mutable->port_config.flags & GRE_F_PMTUD) ? htons(IP_DF) : 0;
	if (iph.frag_off)
		mtu = dst_mtu(&rt->u.dst)
			- ETH_HLEN
			- mutable->tunnel_hlen
			- (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);
	else
		mtu = mutable->mtu;

	if (skb->protocol == htons(ETH_P_IP)) {
		iph.frag_off |= old_iph->frag_off & htons(IP_DF);
		mtu = max(mtu, IP_MIN_MTU);

	} else if (skb->protocol == htons(ETH_P_IPV6))
		mtu = max(mtu, IPV6_MIN_MTU);

	iph.version = 4;
	iph.ihl = sizeof(struct iphdr) >> 2;
	iph.protocol = IPPROTO_GRE;
	iph.daddr = rt->rt_dst;
	iph.saddr = rt->rt_src;

	skb_dst_set(skb, &rt->u.dst);

	/* If we are doing GSO on a pskb it is better to make sure that the
	 * headroom is correct now.  We will only have to copy the portion in
	 * the linear data area and GSO will preserve headroom when it creates
	 * the segments.  This is particularly beneficial on Xen where we get
	 * lots of GSO pskbs.  Conversely, we delay copying if it is just to
	 * get our own writable clone because GSO may do the copy for us. */
	max_headroom = LL_RESERVED_SPACE(rt->u.dst.dev) + rt->u.dst.header_len
			+ mutable->tunnel_hlen;

	if (skb_headroom(skb) < max_headroom) {
		skb = check_headroom(skb, max_headroom);
		if (unlikely(IS_ERR(skb))) {
			vport_record_error(vport, VPORT_E_TX_DROPPED);
			goto error;
		}
	}

	forward_ip_summed(skb);
	vswitch_skb_checksum_setup(skb);

	skb = handle_gso(skb);
	if (unlikely(IS_ERR(skb))) {
		vport_record_error(vport, VPORT_E_TX_DROPPED);
		goto error;
	}

	/* Process GSO segments.  Try to do any work for the entire packet that
	 * doesn't involve actually writing to it before this point. */
	orig_len = 0;
	do {
		struct sk_buff *next_skb = skb->next;
		skb->next = NULL;

		orig_len += build_packet(vport, mutable, &iph, rt, max_headroom, mtu, skb);

		skb = next_skb;
	} while (skb);

	return orig_len;

error_free:
	kfree_skb(skb);
	vport_record_error(vport, VPORT_E_TX_ERROR);
error:
	return 0;
}
static struct net_protocol gre_protocol_handlers = {
	.handler	= gre_rcv,
	.err_handler	= gre_err,
};

static int gre_init(void)
{
	int err;

	err = inet_add_protocol(&gre_protocol_handlers, IPPROTO_GRE);
	if (err)
		printk(KERN_WARNING "openvswitch: cannot register gre protocol handler\n");

	return err;
}
static void gre_exit(void)
{
	tbl_destroy(port_table, NULL);
	inet_del_protocol(&gre_protocol_handlers, IPPROTO_GRE);
}
static int set_config(const struct vport *cur_vport,
		      struct mutable_config *mutable,
		      const void __user *uconfig)
{
	const struct vport *old_vport;
	const struct mutable_config *old_mutable;
	int port_type;

	if (copy_from_user(&mutable->port_config, uconfig,
			   sizeof(struct gre_port_config)))
		return -EFAULT;

	if (mutable->port_config.daddr == 0)
		return -EINVAL;

	if (mutable->port_config.flags & GRE_F_IN_KEY_MATCH) {
		port_type = FIND_PORT_MATCH;
		mutable->port_config.in_key = 0;
	} else
		port_type = FIND_PORT_KEY;

	old_vport = find_port(mutable->port_config.saddr,
			      mutable->port_config.daddr,
			      mutable->port_config.in_key, port_type,
			      &old_mutable);

	if (old_vport && old_vport != cur_vport)
		return -EEXIST;

	if (mutable->port_config.flags & GRE_F_OUT_KEY_ACTION)
		mutable->port_config.out_key = 0;

	mutable->tunnel_hlen = sizeof(struct iphdr) + GRE_HEADER_SECTION;

	if (mutable->port_config.flags & GRE_F_OUT_CSUM)
		mutable->tunnel_hlen += GRE_HEADER_SECTION;

	if (mutable->port_config.out_key ||
	    mutable->port_config.flags & GRE_F_OUT_KEY_ACTION)
		mutable->tunnel_hlen += GRE_HEADER_SECTION;

	return 0;
}
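/* tunnel_hlen is the fixed length of everything this port prepends to a
 * frame.  For example, with both an output key and output checksumming
 * configured it is sizeof(struct iphdr) (20) + 4 (GRE base) + 4 (checksum)
 * + 4 (key) = 32 bytes. */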
static struct vport *gre_create(const char *name, const void __user *config)
{
	struct vport *vport;
	struct gre_vport *gre_vport;
	int err;

	vport = vport_alloc(sizeof(struct gre_vport), &gre_vport_ops);
	if (IS_ERR(vport)) {
		err = PTR_ERR(vport);
		goto error;
	}

	gre_vport = gre_vport_priv(vport);

	strcpy(gre_vport->name, name);

	gre_vport->mutable = kmalloc(sizeof(struct mutable_config), GFP_KERNEL);
	if (!gre_vport->mutable) {
		err = -ENOMEM;
		goto error_free_vport;
	}

	vport_gen_ether_addr(gre_vport->mutable->eth_addr);
	gre_vport->mutable->mtu = ETH_DATA_LEN;

	err = set_config(NULL, gre_vport->mutable, config);
	if (err)
		goto error_free_mutable;

	err = add_port(vport);
	if (err)
		goto error_free_mutable;

	return vport;

error_free_mutable:
	kfree(gre_vport->mutable);
error_free_vport:
	vport_free(vport);
error:
	return ERR_PTR(err);
}
static int gre_modify(struct vport *vport, const void __user *config)
{
	struct gre_vport *gre_vport = gre_vport_priv(vport);
	struct mutable_config *mutable;
	int err;
	int update_hash = 0;

	mutable = kmemdup(gre_vport->mutable, sizeof(struct mutable_config), GFP_KERNEL);
	if (!mutable) {
		err = -ENOMEM;
		goto error;
	}

	err = set_config(vport, mutable, config);
	if (err)
		goto error_free;

	/* Only remove the port from the hash table if something that would
	 * affect the lookup has changed. */
	if (gre_vport->mutable->port_config.saddr != mutable->port_config.saddr ||
	    gre_vport->mutable->port_config.daddr != mutable->port_config.daddr ||
	    gre_vport->mutable->port_config.in_key != mutable->port_config.in_key ||
	    (gre_vport->mutable->port_config.flags & GRE_F_IN_KEY_MATCH) !=
	    (mutable->port_config.flags & GRE_F_IN_KEY_MATCH))
		update_hash = 1;

	/* This update is not atomic but the lookup uses the config, which
	 * serves as an inherent double check. */
	if (update_hash) {
		err = del_port(vport);
		if (err)
			goto error_free;
	}

	assign_config_rcu(vport, mutable);

	if (update_hash) {
		err = add_port(vport);
		if (err)
			goto error_free;
	}

	return 0;

error_free:
	kfree(mutable);
error:
	return err;
}
static int gre_destroy(struct vport *vport)
{
	struct gre_vport *gre_vport = gre_vport_priv(vport);
	int port_type;
	const struct mutable_config *old_mutable;

	/* Do a hash table lookup to make sure that the port exists.  It should
	 * exist but might not if a modify failed earlier. */
	if (gre_vport->mutable->port_config.flags & GRE_F_IN_KEY_MATCH)
		port_type = FIND_PORT_MATCH;
	else
		port_type = FIND_PORT_KEY;

	if (vport == find_port(gre_vport->mutable->port_config.saddr,
	    gre_vport->mutable->port_config.daddr,
	    gre_vport->mutable->port_config.in_key, port_type, &old_mutable))
		del_port(vport);

	kfree(gre_vport->mutable);
	vport_free(vport);

	return 0;
}
static int gre_set_mtu(struct vport *vport, int mtu)
{
	struct gre_vport *gre_vport = gre_vport_priv(vport);
	struct mutable_config *mutable;
	struct dp_port *dp_port;

	mutable = kmemdup(gre_vport->mutable, sizeof(struct mutable_config), GFP_KERNEL);
	if (!mutable)
		return -ENOMEM;

	mutable->mtu = mtu;
	assign_config_rcu(vport, mutable);

	dp_port = vport_get_dp_port(vport);
	if (dp_port)
		set_internal_devs_mtu(dp_port->dp);

	return 0;
}
static int gre_set_addr(struct vport *vport, const unsigned char *addr)
{
	struct gre_vport *gre_vport = gre_vport_priv(vport);
	struct mutable_config *mutable;

	mutable = kmemdup(gre_vport->mutable, sizeof(struct mutable_config), GFP_KERNEL);
	if (!mutable)
		return -ENOMEM;

	memcpy(mutable->eth_addr, addr, ETH_ALEN);
	assign_config_rcu(vport, mutable);

	return 0;
}
static const char *gre_get_name(const struct vport *vport)
{
	const struct gre_vport *gre_vport = gre_vport_priv(vport);
	return gre_vport->name;
}
static const unsigned char *gre_get_addr(const struct vport *vport)
{
	const struct gre_vport *gre_vport = gre_vport_priv(vport);
	return rcu_dereference(gre_vport->mutable)->eth_addr;
}
static unsigned int gre_get_dev_flags(const struct vport *vport)
{
	return IFF_UP | IFF_RUNNING | IFF_LOWER_UP;
}
static int gre_is_running(const struct vport *vport)
{
	return 1;
}
static unsigned char gre_get_operstate(const struct vport *vport)
{
	return IF_OPER_UP;
}
static int gre_get_mtu(const struct vport *vport)
{
	const struct gre_vport *gre_vport = gre_vport_priv(vport);
	return rcu_dereference(gre_vport->mutable)->mtu;
}
struct vport_ops gre_vport_ops = {
	.type		= "gre",
	.flags		= VPORT_F_GEN_STATS | VPORT_F_TUN_ID,
	.init		= gre_init,
	.exit		= gre_exit,
	.create		= gre_create,
	.modify		= gre_modify,
	.destroy	= gre_destroy,
	.set_mtu	= gre_set_mtu,
	.set_addr	= gre_set_addr,
	.get_name	= gre_get_name,
	.get_addr	= gre_get_addr,
	.get_dev_flags	= gre_get_dev_flags,
	.is_running	= gre_is_running,
	.get_operstate	= gre_get_operstate,
	.get_mtu	= gre_get_mtu,
	.send		= gre_send,
};