/*
 * Copyright (c) 2010 Nicira Networks.
 * Distributed under the terms of the GNU GPL version 2.
 *
 * Significant portions of this file may be copied from parts of the Linux
 * kernel, by Linus Torvalds and others.
 */
9 #include <linux/if_arp.h>
10 #include <linux/if_ether.h>
12 #include <linux/if_tunnel.h>
13 #include <linux/if_vlan.h>
15 #include <linux/in_route.h>
16 #include <linux/jhash.h>
17 #include <linux/kernel.h>
18 #include <linux/version.h>
20 #include <net/dsfield.h>
23 #include <net/inet_ecn.h>
25 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
28 #include <net/protocol.h>
29 #include <net/route.h>
34 #include "openvswitch/gre.h"
37 #include "vport-generic.h"
39 /* The absolute minimum fragment size. Note that there are many other
40 * definitions of the minimum MTU. */
43 /* The GRE header is composed of a series of sections: a base and then a variable
44 * number of options. */
45 #define GRE_HEADER_SECTION 4
52 struct mutable_config
{
55 unsigned char eth_addr
[ETH_ALEN
];
57 struct gre_port_config port_config
;
59 int tunnel_hlen
; /* Tunnel header length. */
64 struct tbl_node tbl_node
;
68 /* Protected by RCU. */
69 struct mutable_config
*mutable;
/* Hash table of all GRE ports.  Protected by RCU. */
static struct tbl *port_table;

/* These are just used as an optimization: they don't require any kind of
 * synchronization because we could have just as easily read the value before
 * the port change happened.  Each counter tracks how many ports of the
 * corresponding lookup category currently exist, letting find_port() skip
 * whole lookup passes when a category is empty. */
static unsigned int key_local_remote_ports;
static unsigned int key_remote_ports;
static unsigned int local_remote_ports;
static unsigned int remote_ports;
/* Returns the GRE-specific private data embedded in a generic vport. */
static inline struct gre_vport *gre_vport_priv(const struct vport *vport)
{
	return vport_priv(vport);
}
/* Converts GRE private data back to the generic vport that contains it. */
static inline struct vport *gre_vport_to_vport(const struct gre_vport *gre_vport)
{
	return vport_from_priv(gre_vport);
}
93 static inline struct gre_vport
*gre_vport_table_cast(const struct tbl_node
*node
)
95 return container_of(node
, struct gre_vport
, tbl_node
);
99 static void free_config(struct rcu_head
*rcu
)
101 struct mutable_config
*c
= container_of(rcu
, struct mutable_config
, rcu
);
105 static void assign_config_rcu(struct vport
*vport
,
106 struct mutable_config
*new_config
)
108 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
109 struct mutable_config
*old_config
;
111 old_config
= rcu_dereference(gre_vport
->mutable);
112 rcu_assign_pointer(gre_vport
->mutable, new_config
);
113 call_rcu(&old_config
->rcu
, free_config
);
116 static unsigned int *find_port_pool(const struct mutable_config
*mutable)
118 if (mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
) {
119 if (mutable->port_config
.saddr
)
120 return &local_remote_ports
;
122 return &remote_ports
;
124 if (mutable->port_config
.saddr
)
125 return &key_local_remote_ports
;
127 return &key_remote_ports
;
138 struct port_lookup_key
{
139 u32 vals
[4]; /* Contains enum lookup_key keys. */
140 const struct mutable_config
*mutable;
143 /* Modifies 'target' to store the rcu_dereferenced pointer that was used to do
144 * the comparision. */
145 static int port_cmp(const struct tbl_node
*node
, void *target
)
147 const struct gre_vport
*gre_vport
= gre_vport_table_cast(node
);
148 struct port_lookup_key
*lookup
= target
;
150 lookup
->mutable = rcu_dereference(gre_vport
->mutable);
152 return ((lookup
->mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
) ==
153 lookup
->vals
[LOOKUP_KEY_MATCH
]) &&
154 lookup
->mutable->port_config
.daddr
== lookup
->vals
[LOOKUP_DADDR
] &&
155 lookup
->mutable->port_config
.in_key
== lookup
->vals
[LOOKUP_KEY
] &&
156 lookup
->mutable->port_config
.saddr
== lookup
->vals
[LOOKUP_SADDR
];
159 static u32
port_hash(struct port_lookup_key
*lookup
)
161 return jhash2(lookup
->vals
, ARRAY_SIZE(lookup
->vals
), 0);
164 static int add_port(struct vport
*vport
)
166 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
167 struct port_lookup_key lookup
;
171 struct tbl
*new_table
;
173 new_table
= tbl_create(0);
177 rcu_assign_pointer(port_table
, new_table
);
179 } else if (tbl_count(port_table
) > tbl_n_buckets(port_table
)) {
180 struct tbl
*old_table
= port_table
;
181 struct tbl
*new_table
;
183 new_table
= tbl_expand(old_table
);
184 if (IS_ERR(new_table
))
185 return PTR_ERR(new_table
);
187 rcu_assign_pointer(port_table
, new_table
);
188 tbl_deferred_destroy(old_table
, NULL
);
191 lookup
.vals
[LOOKUP_SADDR
] = gre_vport
->mutable->port_config
.saddr
;
192 lookup
.vals
[LOOKUP_DADDR
] = gre_vport
->mutable->port_config
.daddr
;
193 lookup
.vals
[LOOKUP_KEY
] = gre_vport
->mutable->port_config
.in_key
;
194 lookup
.vals
[LOOKUP_KEY_MATCH
] = gre_vport
->mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
;
196 err
= tbl_insert(port_table
, &gre_vport
->tbl_node
, port_hash(&lookup
));
200 (*find_port_pool(gre_vport
->mutable))++;
205 static int del_port(struct vport
*vport
)
207 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
210 err
= tbl_remove(port_table
, &gre_vport
->tbl_node
);
214 (*find_port_pool(gre_vport
->mutable))--;
219 #define FIND_PORT_KEY (1 << 0)
220 #define FIND_PORT_MATCH (1 << 1)
221 #define FIND_PORT_ANY (FIND_PORT_KEY | FIND_PORT_MATCH)
223 static struct vport
*find_port(__be32 saddr
, __be32 daddr
, __be32 key
,
225 const struct mutable_config
**mutable)
227 struct port_lookup_key lookup
;
228 struct tbl
*table
= rcu_dereference(port_table
);
229 struct tbl_node
*tbl_node
;
234 lookup
.vals
[LOOKUP_SADDR
] = saddr
;
235 lookup
.vals
[LOOKUP_DADDR
] = daddr
;
237 if (port_type
& FIND_PORT_KEY
) {
238 lookup
.vals
[LOOKUP_KEY
] = key
;
239 lookup
.vals
[LOOKUP_KEY_MATCH
] = 0;
241 if (key_local_remote_ports
) {
242 tbl_node
= tbl_lookup(table
, &lookup
, port_hash(&lookup
), port_cmp
);
247 if (key_remote_ports
) {
248 lookup
.vals
[LOOKUP_SADDR
] = 0;
250 tbl_node
= tbl_lookup(table
, &lookup
, port_hash(&lookup
), port_cmp
);
254 lookup
.vals
[LOOKUP_SADDR
] = saddr
;
258 if (port_type
& FIND_PORT_MATCH
) {
259 lookup
.vals
[LOOKUP_KEY
] = 0;
260 lookup
.vals
[LOOKUP_KEY_MATCH
] = GRE_F_IN_KEY_MATCH
;
262 if (local_remote_ports
) {
263 tbl_node
= tbl_lookup(table
, &lookup
, port_hash(&lookup
), port_cmp
);
269 lookup
.vals
[LOOKUP_SADDR
] = 0;
271 tbl_node
= tbl_lookup(table
, &lookup
, port_hash(&lookup
), port_cmp
);
280 *mutable = lookup
.mutable;
281 return gre_vport_to_vport(gre_vport_table_cast(tbl_node
));
284 static bool check_ipv4_address(__be32 addr
)
286 if (ipv4_is_multicast(addr
) || ipv4_is_lbcast(addr
)
287 || ipv4_is_loopback(addr
) || ipv4_is_zeronet(addr
))
293 static bool ipv4_should_icmp(struct sk_buff
*skb
)
295 struct iphdr
*old_iph
= ip_hdr(skb
);
297 /* Don't respond to L2 broadcast. */
298 if (is_multicast_ether_addr(eth_hdr(skb
)->h_dest
))
301 /* Don't respond to L3 broadcast or invalid addresses. */
302 if (!check_ipv4_address(old_iph
->daddr
) ||
303 !check_ipv4_address(old_iph
->saddr
))
306 /* Only respond to the first fragment. */
307 if (old_iph
->frag_off
& htons(IP_OFFSET
))
310 /* Don't respond to ICMP error messages. */
311 if (old_iph
->protocol
== IPPROTO_ICMP
) {
312 u8 icmp_type
, *icmp_typep
;
314 icmp_typep
= skb_header_pointer(skb
, (u8
*)old_iph
+
315 (old_iph
->ihl
<< 2) +
316 offsetof(struct icmphdr
, type
) -
317 skb
->data
, sizeof(icmp_type
),
323 if (*icmp_typep
> NR_ICMP_TYPES
324 || (*icmp_typep
<= ICMP_PARAMETERPROB
325 && *icmp_typep
!= ICMP_ECHOREPLY
326 && *icmp_typep
!= ICMP_ECHO
))
333 static void ipv4_build_icmp(struct sk_buff
*skb
, struct sk_buff
*nskb
,
334 unsigned int mtu
, unsigned int payload_length
)
336 struct iphdr
*iph
, *old_iph
= ip_hdr(skb
);
337 struct icmphdr
*icmph
;
340 iph
= (struct iphdr
*)skb_put(nskb
, sizeof(struct iphdr
));
341 icmph
= (struct icmphdr
*)skb_put(nskb
, sizeof(struct icmphdr
));
342 payload
= skb_put(nskb
, payload_length
);
346 iph
->ihl
= sizeof(struct iphdr
) >> 2;
347 iph
->tos
= (old_iph
->tos
& IPTOS_TOS_MASK
) |
348 IPTOS_PREC_INTERNETCONTROL
;
349 iph
->tot_len
= htons(sizeof(struct iphdr
)
350 + sizeof(struct icmphdr
)
352 get_random_bytes(&iph
->id
, sizeof(iph
->id
));
355 iph
->protocol
= IPPROTO_ICMP
;
356 iph
->daddr
= old_iph
->saddr
;
357 iph
->saddr
= old_iph
->daddr
;
362 icmph
->type
= ICMP_DEST_UNREACH
;
363 icmph
->code
= ICMP_FRAG_NEEDED
;
364 icmph
->un
.gateway
= htonl(mtu
);
367 nskb
->csum
= csum_partial((u8
*)icmph
, sizeof(struct icmphdr
), 0);
368 nskb
->csum
= skb_copy_and_csum_bits(skb
, (u8
*)old_iph
- skb
->data
,
369 payload
, payload_length
,
371 icmph
->checksum
= csum_fold(nskb
->csum
);
374 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
375 static bool ipv6_should_icmp(struct sk_buff
*skb
)
377 struct ipv6hdr
*old_ipv6h
= ipv6_hdr(skb
);
379 int payload_off
= (u8
*)(old_ipv6h
+ 1) - skb
->data
;
380 u8 nexthdr
= ipv6_hdr(skb
)->nexthdr
;
382 /* Check source address is valid. */
383 addr_type
= ipv6_addr_type(&old_ipv6h
->saddr
);
384 if (addr_type
& IPV6_ADDR_MULTICAST
|| addr_type
== IPV6_ADDR_ANY
)
387 /* Don't reply to unspecified addresses. */
388 if (ipv6_addr_type(&old_ipv6h
->daddr
) == IPV6_ADDR_ANY
)
391 /* Don't respond to ICMP error messages. */
392 payload_off
= ipv6_skip_exthdr(skb
, payload_off
, &nexthdr
);
396 if (nexthdr
== NEXTHDR_ICMP
) {
397 u8 icmp_type
, *icmp_typep
;
399 icmp_typep
= skb_header_pointer(skb
, payload_off
+
400 offsetof(struct icmp6hdr
,
402 sizeof(icmp_type
), &icmp_type
);
404 if (!icmp_typep
|| !(*icmp_typep
& ICMPV6_INFOMSG_MASK
))
411 static void ipv6_build_icmp(struct sk_buff
*skb
, struct sk_buff
*nskb
,
412 unsigned int mtu
, unsigned int payload_length
)
414 struct ipv6hdr
*ipv6h
, *old_ipv6h
= ipv6_hdr(skb
);
415 struct icmp6hdr
*icmp6h
;
418 ipv6h
= (struct ipv6hdr
*)skb_put(nskb
, sizeof(struct ipv6hdr
));
419 icmp6h
= (struct icmp6hdr
*)skb_put(nskb
, sizeof(struct icmp6hdr
));
420 payload
= skb_put(nskb
, payload_length
);
425 memset(&ipv6h
->flow_lbl
, 0, sizeof(ipv6h
->flow_lbl
));
426 ipv6h
->payload_len
= htons(sizeof(struct icmp6hdr
)
428 ipv6h
->nexthdr
= NEXTHDR_ICMP
;
429 ipv6h
->hop_limit
= IPV6_DEFAULT_HOPLIMIT
;
430 ipv6_addr_copy(&ipv6h
->daddr
, &old_ipv6h
->saddr
);
431 ipv6_addr_copy(&ipv6h
->saddr
, &old_ipv6h
->daddr
);
434 icmp6h
->icmp6_type
= ICMPV6_PKT_TOOBIG
;
435 icmp6h
->icmp6_code
= 0;
436 icmp6h
->icmp6_cksum
= 0;
437 icmp6h
->icmp6_mtu
= htonl(mtu
);
439 nskb
->csum
= csum_partial((u8
*)icmp6h
, sizeof(struct icmp6hdr
), 0);
440 nskb
->csum
= skb_copy_and_csum_bits(skb
, (u8
*)old_ipv6h
- skb
->data
,
441 payload
, payload_length
,
443 icmp6h
->icmp6_cksum
= csum_ipv6_magic(&ipv6h
->saddr
, &ipv6h
->daddr
,
444 sizeof(struct icmp6hdr
)
446 ipv6h
->nexthdr
, nskb
->csum
);
450 static bool send_frag_needed(struct vport
*vport
,
451 const struct mutable_config
*mutable,
452 struct sk_buff
*skb
, unsigned int mtu
,
455 unsigned int eth_hdr_len
= ETH_HLEN
;
456 unsigned int total_length
= 0, header_length
= 0, payload_length
;
457 struct ethhdr
*eh
, *old_eh
= eth_hdr(skb
);
458 struct sk_buff
*nskb
;
461 if (skb
->protocol
== htons(ETH_P_IP
)) {
462 if (mtu
< IP_MIN_MTU
)
465 if (!ipv4_should_icmp(skb
))
468 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
469 else if (skb
->protocol
== htons(ETH_P_IPV6
)) {
470 if (mtu
< IPV6_MIN_MTU
)
473 /* In theory we should do PMTUD on IPv6 multicast messages but
474 * we don't have an address to send from so just fragment. */
475 if (ipv6_addr_type(&ipv6_hdr(skb
)->daddr
) & IPV6_ADDR_MULTICAST
)
478 if (!ipv6_should_icmp(skb
))
486 if (old_eh
->h_proto
== htons(ETH_P_8021Q
))
487 eth_hdr_len
= VLAN_ETH_HLEN
;
489 payload_length
= skb
->len
- eth_hdr_len
;
490 if (skb
->protocol
== htons(ETH_P_IP
)) {
491 header_length
= sizeof(struct iphdr
) + sizeof(struct icmphdr
);
492 total_length
= min_t(unsigned int, header_length
+
493 payload_length
, 576);
495 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
497 header_length
= sizeof(struct ipv6hdr
) +
498 sizeof(struct icmp6hdr
);
499 total_length
= min_t(unsigned int, header_length
+
500 payload_length
, IPV6_MIN_MTU
);
504 total_length
= min(total_length
, mutable->mtu
);
505 payload_length
= total_length
- header_length
;
507 nskb
= dev_alloc_skb(NET_IP_ALIGN
+ eth_hdr_len
+ header_length
+
512 skb_reserve(nskb
, NET_IP_ALIGN
);
514 /* Ethernet / VLAN */
515 eh
= (struct ethhdr
*)skb_put(nskb
, eth_hdr_len
);
516 memcpy(eh
->h_dest
, old_eh
->h_source
, ETH_ALEN
);
517 memcpy(eh
->h_source
, mutable->eth_addr
, ETH_ALEN
);
518 nskb
->protocol
= eh
->h_proto
= old_eh
->h_proto
;
519 if (old_eh
->h_proto
== htons(ETH_P_8021Q
)) {
520 struct vlan_ethhdr
*vh
= (struct vlan_ethhdr
*)eh
;
522 vh
->h_vlan_TCI
= vlan_eth_hdr(skb
)->h_vlan_TCI
;
523 vh
->h_vlan_encapsulated_proto
= skb
->protocol
;
525 skb_reset_mac_header(nskb
);
528 if (skb
->protocol
== htons(ETH_P_IP
))
529 ipv4_build_icmp(skb
, nskb
, mtu
, payload_length
);
530 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
532 ipv6_build_icmp(skb
, nskb
, mtu
, payload_length
);
535 /* Assume that flow based keys are symmetric with respect to input
536 * and output and use the key that we were going to put on the
537 * outgoing packet for the fake received packet. If the keys are
538 * not symmetric then PMTUD needs to be disabled since we won't have
539 * any way of synthesizing packets. */
540 if (mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
&&
541 mutable->port_config
.flags
& GRE_F_OUT_KEY_ACTION
)
542 OVS_CB(nskb
)->tun_id
= flow_key
;
544 compute_ip_summed(nskb
, false);
545 vport_receive(vport
, nskb
);
550 static struct sk_buff
*check_headroom(struct sk_buff
*skb
, int headroom
)
552 if (skb_headroom(skb
) < headroom
|| skb_header_cloned(skb
)) {
553 struct sk_buff
*nskb
= skb_realloc_headroom(skb
, headroom
+ 16);
556 return ERR_PTR(-ENOMEM
);
559 set_skb_csum_bits(skb
, nskb
);
562 skb_set_owner_w(nskb
, skb
->sk
);
571 static void create_gre_header(struct sk_buff
*skb
,
572 const struct mutable_config
*mutable)
574 struct iphdr
*iph
= ip_hdr(skb
);
575 struct gre_base_hdr
*greh
= (struct gre_base_hdr
*)(iph
+ 1);
576 __be32
*options
= (__be32
*)((u8
*)iph
+ mutable->tunnel_hlen
577 - GRE_HEADER_SECTION
);
579 greh
->protocol
= htons(ETH_P_TEB
);
582 /* Work backwards over the options so the checksum is last. */
583 if (mutable->port_config
.out_key
||
584 mutable->port_config
.flags
& GRE_F_OUT_KEY_ACTION
) {
585 greh
->flags
|= GRE_KEY
;
587 if (mutable->port_config
.flags
& GRE_F_OUT_KEY_ACTION
)
588 *options
= OVS_CB(skb
)->tun_id
;
590 *options
= mutable->port_config
.out_key
;
595 if (mutable->port_config
.flags
& GRE_F_OUT_CSUM
) {
596 greh
->flags
|= GRE_CSUM
;
599 *(__sum16
*)options
= csum_fold(skb_checksum(skb
,
600 sizeof(struct iphdr
),
601 skb
->len
- sizeof(struct iphdr
),
606 static int check_checksum(struct sk_buff
*skb
)
608 struct iphdr
*iph
= ip_hdr(skb
);
609 __be16 flags
= *(__be16
*)(iph
+ 1);
612 if (flags
& GRE_CSUM
) {
613 switch (skb
->ip_summed
) {
614 case CHECKSUM_COMPLETE
:
615 csum
= csum_fold(skb
->csum
);
623 csum
= __skb_checksum_complete(skb
);
624 skb
->ip_summed
= CHECKSUM_COMPLETE
;
632 static int parse_gre_header(struct iphdr
*iph
, __be16
*flags
, __be32
*key
)
634 /* IP and ICMP protocol handlers check that the IHL is valid. */
635 struct gre_base_hdr
*greh
= (struct gre_base_hdr
*)((u8
*)iph
+ (iph
->ihl
<< 2));
636 __be32
*options
= (__be32
*)(greh
+ 1);
639 *flags
= greh
->flags
;
641 if (greh
->flags
& (GRE_VERSION
| GRE_ROUTING
))
644 if (greh
->protocol
!= htons(ETH_P_TEB
))
647 hdr_len
= GRE_HEADER_SECTION
;
649 if (greh
->flags
& GRE_CSUM
) {
650 hdr_len
+= GRE_HEADER_SECTION
;
654 if (greh
->flags
& GRE_KEY
) {
655 hdr_len
+= GRE_HEADER_SECTION
;
662 if (greh
->flags
& GRE_SEQ
)
663 hdr_len
+= GRE_HEADER_SECTION
;
668 static inline u8
ecn_encapsulate(u8 tos
, struct sk_buff
*skb
)
672 if (skb
->protocol
== htons(ETH_P_IP
))
673 inner
= ((struct iphdr
*)skb_network_header(skb
))->tos
;
674 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
675 else if (skb
->protocol
== htons(ETH_P_IPV6
))
676 inner
= ipv6_get_dsfield((struct ipv6hdr
*)skb_network_header(skb
));
681 return INET_ECN_encapsulate(tos
, inner
);
684 static inline void ecn_decapsulate(u8 tos
, struct sk_buff
*skb
)
686 if (INET_ECN_is_ce(tos
)) {
687 __be16 protocol
= skb
->protocol
;
688 unsigned int nw_header
= skb_network_header(skb
) - skb
->data
;
690 if (skb
->protocol
== htons(ETH_P_8021Q
)) {
691 if (unlikely(!pskb_may_pull(skb
, VLAN_ETH_HLEN
)))
694 protocol
= vlan_eth_hdr(skb
)->h_vlan_encapsulated_proto
;
695 nw_header
+= VLAN_HLEN
;
698 if (protocol
== htons(ETH_P_IP
)) {
699 if (unlikely(!pskb_may_pull(skb
, nw_header
700 + sizeof(struct iphdr
))))
703 IP_ECN_set_ce((struct iphdr
*)(nw_header
+ skb
->data
));
705 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
706 else if (protocol
== htons(ETH_P_IPV6
)) {
707 if (unlikely(!pskb_may_pull(skb
, nw_header
708 + sizeof(struct ipv6hdr
))))
711 IP6_ECN_set_ce((struct ipv6hdr
*)(nw_header
718 static struct sk_buff
*handle_gso(struct sk_buff
*skb
)
720 if (skb_is_gso(skb
)) {
721 struct sk_buff
*nskb
= skb_gso_segment(skb
, 0);
730 static int handle_csum_offload(struct sk_buff
*skb
)
732 if (skb
->ip_summed
== CHECKSUM_PARTIAL
)
733 return skb_checksum_help(skb
);
735 skb
->ip_summed
= CHECKSUM_NONE
;
740 /* Called with rcu_read_lock. */
741 static void gre_err(struct sk_buff
*skb
, u32 info
)
744 const struct mutable_config
*mutable;
745 const int type
= icmp_hdr(skb
)->type
;
746 const int code
= icmp_hdr(skb
)->code
;
747 int mtu
= ntohs(icmp_hdr(skb
)->un
.frag
.mtu
);
752 int tunnel_hdr_len
, tot_hdr_len
;
753 unsigned int orig_mac_header
;
754 unsigned int orig_nw_header
;
756 if (type
!= ICMP_DEST_UNREACH
|| code
!= ICMP_FRAG_NEEDED
)
759 /* The mimimum size packet that we would actually be able to process:
760 * encapsulating IP header, minimum GRE header, Ethernet header,
761 * inner IPv4 header. */
762 if (!pskb_may_pull(skb
, sizeof(struct iphdr
) + GRE_HEADER_SECTION
+
763 ETH_HLEN
+ sizeof(struct iphdr
)))
766 iph
= (struct iphdr
*)skb
->data
;
768 tunnel_hdr_len
= parse_gre_header(iph
, &flags
, &key
);
769 if (tunnel_hdr_len
< 0)
772 vport
= find_port(iph
->saddr
, iph
->daddr
, key
, FIND_PORT_ANY
, &mutable);
776 /* Packets received by this function were previously sent by us, so
777 * any comparisons should be to the output values, not the input.
778 * However, it's not really worth it to have a hash table based on
779 * output keys (especially since ICMP error handling of tunneled packets
780 * isn't that reliable anyways). Therefore, we do a lookup based on the
781 * out key as if it were the in key and then check to see if the input
782 * and output keys are the same. */
783 if (mutable->port_config
.in_key
!= mutable->port_config
.out_key
)
786 if (!!(mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
) !=
787 !!(mutable->port_config
.flags
& GRE_F_OUT_KEY_ACTION
))
790 if ((mutable->port_config
.flags
& GRE_F_OUT_CSUM
) && !(flags
& GRE_CSUM
))
793 tunnel_hdr_len
+= iph
->ihl
<< 2;
795 orig_mac_header
= skb_mac_header(skb
) - skb
->data
;
796 orig_nw_header
= skb_network_header(skb
) - skb
->data
;
797 skb_set_mac_header(skb
, tunnel_hdr_len
);
799 tot_hdr_len
= tunnel_hdr_len
+ ETH_HLEN
;
801 skb
->protocol
= eth_hdr(skb
)->h_proto
;
802 if (skb
->protocol
== htons(ETH_P_8021Q
)) {
803 tot_hdr_len
+= VLAN_HLEN
;
804 skb
->protocol
= vlan_eth_hdr(skb
)->h_vlan_encapsulated_proto
;
807 skb_set_network_header(skb
, tot_hdr_len
);
810 if (skb
->protocol
== htons(ETH_P_IP
))
811 tot_hdr_len
+= sizeof(struct iphdr
);
812 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
813 else if (skb
->protocol
== htons(ETH_P_IPV6
))
814 tot_hdr_len
+= sizeof(struct ipv6hdr
);
819 if (!pskb_may_pull(skb
, tot_hdr_len
))
822 if (skb
->protocol
== htons(ETH_P_IP
)) {
823 if (mtu
< IP_MIN_MTU
) {
824 if (ntohs(ip_hdr(skb
)->tot_len
) >= IP_MIN_MTU
)
831 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
832 else if (skb
->protocol
== htons(ETH_P_IPV6
)) {
833 if (mtu
< IPV6_MIN_MTU
) {
834 unsigned int packet_length
= sizeof(struct ipv6hdr
) +
835 ntohs(ipv6_hdr(skb
)->payload_len
);
837 if (packet_length
>= IPV6_MIN_MTU
838 || ntohs(ipv6_hdr(skb
)->payload_len
) == 0)
846 __pskb_pull(skb
, tunnel_hdr_len
);
847 send_frag_needed(vport
, mutable, skb
, mtu
, key
);
848 skb_push(skb
, tunnel_hdr_len
);
851 skb_set_mac_header(skb
, orig_mac_header
);
852 skb_set_network_header(skb
, orig_nw_header
);
853 skb
->protocol
= htons(ETH_P_IP
);
856 /* Called with rcu_read_lock. */
857 static int gre_rcv(struct sk_buff
*skb
)
860 const struct mutable_config
*mutable;
866 if (!pskb_may_pull(skb
, GRE_HEADER_SECTION
+ ETH_HLEN
))
869 if (!check_checksum(skb
))
874 hdr_len
= parse_gre_header(iph
, &flags
, &key
);
878 vport
= find_port(iph
->daddr
, iph
->saddr
, key
, FIND_PORT_ANY
, &mutable);
880 icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_PORT_UNREACH
, 0);
884 if ((mutable->port_config
.flags
& GRE_F_IN_CSUM
) && !(flags
& GRE_CSUM
)) {
885 vport_record_error(vport
, VPORT_E_RX_CRC
);
889 if (!pskb_pull(skb
, hdr_len
) || !pskb_may_pull(skb
, ETH_HLEN
)) {
890 vport_record_error(vport
, VPORT_E_RX_ERROR
);
894 skb
->pkt_type
= PACKET_HOST
;
895 skb
->protocol
= eth_type_trans(skb
, skb
->dev
);
896 skb_postpull_rcsum(skb
, skb_transport_header(skb
), hdr_len
+ ETH_HLEN
);
901 skb_reset_network_header(skb
);
903 ecn_decapsulate(iph
->tos
, skb
);
905 if (mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
)
906 OVS_CB(skb
)->tun_id
= key
;
908 OVS_CB(skb
)->tun_id
= 0;
910 skb_push(skb
, ETH_HLEN
);
911 compute_ip_summed(skb
, false);
913 vport_receive(vport
, skb
);
922 static int build_packet(struct vport
*vport
, const struct mutable_config
*mutable,
923 struct iphdr
*iph
, struct rtable
*rt
, int max_headroom
,
924 int mtu
, struct sk_buff
*skb
)
927 struct iphdr
*new_iph
;
928 int orig_len
= skb
->len
;
929 __be16 frag_off
= iph
->frag_off
;
931 skb
= check_headroom(skb
, max_headroom
);
932 if (unlikely(IS_ERR(skb
)))
935 err
= handle_csum_offload(skb
);
939 if (skb
->protocol
== htons(ETH_P_IP
)) {
940 struct iphdr
*old_iph
= ip_hdr(skb
);
942 if ((old_iph
->frag_off
& htons(IP_DF
)) &&
943 mtu
< ntohs(old_iph
->tot_len
)) {
944 if (send_frag_needed(vport
, mutable, skb
, mtu
, OVS_CB(skb
)->tun_id
))
949 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
950 else if (skb
->protocol
== htons(ETH_P_IPV6
)) {
951 unsigned int packet_length
= skb
->len
- ETH_HLEN
952 - (eth_hdr(skb
)->h_proto
== htons(ETH_P_8021Q
) ? VLAN_HLEN
: 0);
954 /* IPv6 requires PMTUD if the packet is above the minimum MTU. */
955 if (packet_length
> IPV6_MIN_MTU
)
956 frag_off
= htons(IP_DF
);
958 if (mtu
< packet_length
) {
959 if (send_frag_needed(vport
, mutable, skb
, mtu
, OVS_CB(skb
)->tun_id
))
965 skb_reset_transport_header(skb
);
966 new_iph
= (struct iphdr
*)skb_push(skb
, mutable->tunnel_hlen
);
967 skb_reset_network_header(skb
);
969 memcpy(new_iph
, iph
, sizeof(struct iphdr
));
970 new_iph
->frag_off
= frag_off
;
971 ip_select_ident(new_iph
, &rt
->u
.dst
, NULL
);
973 create_gre_header(skb
, mutable);
975 /* Allow our local IP stack to fragment the outer packet even if the
976 * DF bit is set as a last resort. */
979 memset(&(IPCB(skb
)->opt
), 0, sizeof(IPCB(skb
)->opt
));
980 IPCB(skb
)->flags
= 0;
982 err
= ip_local_out(skb
);
983 if (likely(net_xmit_eval(err
) == 0))
986 vport_record_error(vport
, VPORT_E_TX_ERROR
);
993 vport_record_error(vport
, VPORT_E_TX_DROPPED
);
998 static int gre_send(struct vport
*vport
, struct sk_buff
*skb
)
1000 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1001 const struct mutable_config
*mutable = rcu_dereference(gre_vport
->mutable);
1003 struct iphdr
*old_iph
;
1010 /* Validate the protocol headers before we try to use them. */
1011 if (skb
->protocol
== htons(ETH_P_8021Q
)) {
1012 if (unlikely(!pskb_may_pull(skb
, VLAN_ETH_HLEN
)))
1015 skb
->protocol
= vlan_eth_hdr(skb
)->h_vlan_encapsulated_proto
;
1016 skb_set_network_header(skb
, VLAN_ETH_HLEN
);
1019 if (skb
->protocol
== htons(ETH_P_IP
)) {
1020 if (unlikely(!pskb_may_pull(skb
, skb_network_header(skb
)
1021 + sizeof(struct iphdr
) - skb
->data
)))
1024 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1025 else if (skb
->protocol
== htons(ETH_P_IPV6
)) {
1026 if (unlikely(!pskb_may_pull(skb
, skb_network_header(skb
)
1027 + sizeof(struct ipv6hdr
) - skb
->data
)))
1031 old_iph
= ip_hdr(skb
);
1033 iph
.tos
= mutable->port_config
.tos
;
1034 if (mutable->port_config
.flags
& GRE_F_TOS_INHERIT
) {
1035 if (skb
->protocol
== htons(ETH_P_IP
))
1036 iph
.tos
= old_iph
->tos
;
1037 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1038 else if (skb
->protocol
== htons(ETH_P_IPV6
))
1039 iph
.tos
= ipv6_get_dsfield(ipv6_hdr(skb
));
1042 iph
.tos
= ecn_encapsulate(iph
.tos
, skb
);
1045 struct flowi fl
= { .nl_u
= { .ip4_u
=
1046 { .daddr
= mutable->port_config
.daddr
,
1047 .saddr
= mutable->port_config
.saddr
,
1048 .tos
= RT_TOS(iph
.tos
) } },
1049 .proto
= IPPROTO_GRE
};
1051 if (ip_route_output_key(&init_net
, &rt
, &fl
))
1055 iph
.ttl
= mutable->port_config
.ttl
;
1056 if (mutable->port_config
.flags
& GRE_F_TTL_INHERIT
) {
1057 if (skb
->protocol
== htons(ETH_P_IP
))
1058 iph
.ttl
= old_iph
->ttl
;
1059 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1060 else if (skb
->protocol
== htons(ETH_P_IPV6
))
1061 iph
.ttl
= ipv6_hdr(skb
)->hop_limit
;
1065 iph
.ttl
= dst_metric(&rt
->u
.dst
, RTAX_HOPLIMIT
);
1067 iph
.frag_off
= (mutable->port_config
.flags
& GRE_F_PMTUD
) ? htons(IP_DF
) : 0;
1069 mtu
= dst_mtu(&rt
->u
.dst
)
1071 - mutable->tunnel_hlen
1072 - (eth_hdr(skb
)->h_proto
== htons(ETH_P_8021Q
) ? VLAN_HLEN
: 0);
1076 if (skb
->protocol
== htons(ETH_P_IP
)) {
1077 iph
.frag_off
|= old_iph
->frag_off
& htons(IP_DF
);
1078 mtu
= max(mtu
, IP_MIN_MTU
);
1080 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1081 else if (skb
->protocol
== htons(ETH_P_IPV6
))
1082 mtu
= max(mtu
, IPV6_MIN_MTU
);
1086 iph
.ihl
= sizeof(struct iphdr
) >> 2;
1087 iph
.protocol
= IPPROTO_GRE
;
1088 iph
.daddr
= rt
->rt_dst
;
1089 iph
.saddr
= rt
->rt_src
;
1094 skb_dst_set(skb
, &rt
->u
.dst
);
1096 /* If we are doing GSO on a pskb it is better to make sure that the
1097 * headroom is correct now. We will only have to copy the portion in
1098 * the linear data area and GSO will preserve headroom when it creates
1099 * the segments. This is particularly beneficial on Xen where we get
1100 * lots of GSO pskbs. Conversely, we delay copying if it is just to
1101 * get our own writable clone because GSO may do the copy for us. */
1102 max_headroom
= LL_RESERVED_SPACE(rt
->u
.dst
.dev
) + rt
->u
.dst
.header_len
1103 + mutable->tunnel_hlen
;
1105 if (skb_headroom(skb
) < max_headroom
) {
1106 skb
= check_headroom(skb
, max_headroom
);
1107 if (unlikely(IS_ERR(skb
))) {
1108 vport_record_error(vport
, VPORT_E_TX_DROPPED
);
1113 forward_ip_summed(skb
);
1115 if (unlikely(vswitch_skb_checksum_setup(skb
)))
1118 skb
= handle_gso(skb
);
1119 if (unlikely(IS_ERR(skb
))) {
1120 vport_record_error(vport
, VPORT_E_TX_DROPPED
);
1124 /* Process GSO segments. Try to do any work for the entire packet that
1125 * doesn't involve actually writing to it before this point. */
1128 struct sk_buff
*next_skb
= skb
->next
;
1131 orig_len
+= build_packet(vport
, mutable, &iph
, rt
, max_headroom
, mtu
, skb
);
1140 vport_record_error(vport
, VPORT_E_TX_ERROR
);
1145 static struct net_protocol gre_protocol_handlers
= {
1147 .err_handler
= gre_err
,
1150 static int gre_init(void)
1154 err
= inet_add_protocol(&gre_protocol_handlers
, IPPROTO_GRE
);
1156 printk(KERN_WARNING
"openvswitch: cannot register gre protocol handler\n");
1161 static void gre_exit(void)
1163 tbl_destroy(port_table
, NULL
);
1164 inet_del_protocol(&gre_protocol_handlers
, IPPROTO_GRE
);
1167 static int set_config(const struct vport
*cur_vport
,
1168 struct mutable_config
*mutable, const void __user
*uconfig
)
1170 const struct vport
*old_vport
;
1171 const struct mutable_config
*old_mutable
;
1174 if (copy_from_user(&mutable->port_config
, uconfig
, sizeof(struct gre_port_config
)))
1177 if (mutable->port_config
.daddr
== 0)
1180 if (mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
) {
1181 port_type
= FIND_PORT_MATCH
;
1182 mutable->port_config
.in_key
= 0;
1184 port_type
= FIND_PORT_KEY
;
1186 old_vport
= find_port(mutable->port_config
.saddr
,
1187 mutable->port_config
.daddr
,
1188 mutable->port_config
.in_key
, port_type
,
1191 if (old_vport
&& old_vport
!= cur_vport
)
1194 if (mutable->port_config
.flags
& GRE_F_OUT_KEY_ACTION
)
1195 mutable->port_config
.out_key
= 0;
1197 mutable->tunnel_hlen
= sizeof(struct iphdr
) + GRE_HEADER_SECTION
;
1199 if (mutable->port_config
.flags
& GRE_F_OUT_CSUM
)
1200 mutable->tunnel_hlen
+= GRE_HEADER_SECTION
;
1202 if (mutable->port_config
.out_key
||
1203 mutable->port_config
.flags
& GRE_F_OUT_KEY_ACTION
)
1204 mutable->tunnel_hlen
+= GRE_HEADER_SECTION
;
1209 static struct vport
*gre_create(const char *name
, const void __user
*config
)
1211 struct vport
*vport
;
1212 struct gre_vport
*gre_vport
;
1215 vport
= vport_alloc(sizeof(struct gre_vport
), &gre_vport_ops
);
1216 if (IS_ERR(vport
)) {
1217 err
= PTR_ERR(vport
);
1221 gre_vport
= gre_vport_priv(vport
);
1223 strcpy(gre_vport
->name
, name
);
1225 gre_vport
->mutable = kmalloc(sizeof(struct mutable_config
), GFP_KERNEL
);
1226 if (!gre_vport
->mutable) {
1228 goto error_free_vport
;
1231 vport_gen_rand_ether_addr(gre_vport
->mutable->eth_addr
);
1232 gre_vport
->mutable->mtu
= ETH_DATA_LEN
;
1234 err
= set_config(NULL
, gre_vport
->mutable, config
);
1236 goto error_free_mutable
;
1238 err
= add_port(vport
);
1240 goto error_free_mutable
;
1245 kfree(gre_vport
->mutable);
1249 return ERR_PTR(err
);
1252 static int gre_modify(struct vport
*vport
, const void __user
*config
)
1254 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1255 struct mutable_config
*mutable;
1257 int update_hash
= 0;
1259 mutable = kmemdup(gre_vport
->mutable, sizeof(struct mutable_config
), GFP_KERNEL
);
1265 err
= set_config(vport
, mutable, config
);
1269 /* Only remove the port from the hash table if something that would
1270 * affect the lookup has changed. */
1271 if (gre_vport
->mutable->port_config
.saddr
!= mutable->port_config
.saddr
||
1272 gre_vport
->mutable->port_config
.daddr
!= mutable->port_config
.daddr
||
1273 gre_vport
->mutable->port_config
.in_key
!= mutable->port_config
.in_key
||
1274 (gre_vport
->mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
) !=
1275 (mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
))
1279 /* This update is not atomic but the lookup uses the config, which
1280 * serves as an inherent double check. */
1282 err
= del_port(vport
);
1287 assign_config_rcu(vport
, mutable);
1290 err
= add_port(vport
);
1303 static void free_port(struct rcu_head
*rcu
)
1305 struct gre_vport
*gre_vport
= container_of(rcu
, struct gre_vport
, rcu
);
1307 kfree(gre_vport
->mutable);
1308 vport_free(gre_vport_to_vport(gre_vport
));
1311 static int gre_destroy(struct vport
*vport
)
1313 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1315 const struct mutable_config
*old_mutable
;
1317 /* Do a hash table lookup to make sure that the port exists. It should
1318 * exist but might not if a modify failed earlier. */
1319 if (gre_vport
->mutable->port_config
.flags
& GRE_F_IN_KEY_MATCH
)
1320 port_type
= FIND_PORT_MATCH
;
1322 port_type
= FIND_PORT_KEY
;
1324 if (vport
== find_port(gre_vport
->mutable->port_config
.saddr
,
1325 gre_vport
->mutable->port_config
.daddr
,
1326 gre_vport
->mutable->port_config
.in_key
, port_type
, &old_mutable
))
1329 call_rcu(&gre_vport
->rcu
, free_port
);
1334 static int gre_set_mtu(struct vport
*vport
, int mtu
)
1336 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1337 struct mutable_config
*mutable;
1339 mutable = kmemdup(gre_vport
->mutable, sizeof(struct mutable_config
), GFP_KERNEL
);
1344 assign_config_rcu(vport
, mutable);
1349 static int gre_set_addr(struct vport
*vport
, const unsigned char *addr
)
1351 struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1352 struct mutable_config
*mutable;
1354 mutable = kmemdup(gre_vport
->mutable, sizeof(struct mutable_config
), GFP_KERNEL
);
1358 memcpy(mutable->eth_addr
, addr
, ETH_ALEN
);
1359 assign_config_rcu(vport
, mutable);
1365 static const char *gre_get_name(const struct vport
*vport
)
1367 const struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1368 return gre_vport
->name
;
1371 static const unsigned char *gre_get_addr(const struct vport
*vport
)
1373 const struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1374 return rcu_dereference(gre_vport
->mutable)->eth_addr
;
1377 static int gre_get_mtu(const struct vport
*vport
)
1379 const struct gre_vport
*gre_vport
= gre_vport_priv(vport
);
1380 return rcu_dereference(gre_vport
->mutable)->mtu
;
1383 struct vport_ops gre_vport_ops
= {
1385 .flags
= VPORT_F_GEN_STATS
| VPORT_F_TUN_ID
,
1388 .create
= gre_create
,
1389 .modify
= gre_modify
,
1390 .destroy
= gre_destroy
,
1391 .set_mtu
= gre_set_mtu
,
1392 .set_addr
= gre_set_addr
,
1393 .get_name
= gre_get_name
,
1394 .get_addr
= gre_get_addr
,
1395 .get_dev_flags
= vport_gen_get_dev_flags
,
1396 .is_running
= vport_gen_is_running
,
1397 .get_operstate
= vport_gen_get_operstate
,
1398 .get_mtu
= gre_get_mtu
,