/* datapath/tunnel.c (datapath: Abstract tunneling implementation from GRE.) */
/*
 * Copyright (c) 2010 Nicira Networks.
 * Distributed under the terms of the GNU GPL version 2.
 *
 * Significant portions of this file may be copied from parts of the Linux
 * kernel, by Linus Torvalds and others.
 */

#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/in_route.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
#include <linux/version.h>

#include <net/dsfield.h>
#include <net/dst.h>
#include <net/icmp.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
#include <net/ipv6.h>
#endif
#include <net/route.h>
#include <net/xfrm.h>

#include "actions.h"
#include "datapath.h"
#include "table.h"
#include "tunnel.h"
#include "vport.h"
#include "vport-generic.h"

/* Protected by RCU. */
static struct tbl *port_table;

/*
 * These are just used as an optimization: they don't require any kind of
 * synchronization because we could have just as easily read the value before
 * the port change happened.
 */
static unsigned int key_local_remote_ports;
static unsigned int key_remote_ports;
static unsigned int local_remote_ports;
static unsigned int remote_ports;

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
#define rt_dst(rt) (rt->dst)
#else
#define rt_dst(rt) (rt->u.dst)
#endif

static inline struct vport *tnl_vport_to_vport(const struct tnl_vport *tnl_vport)
{
        return vport_from_priv(tnl_vport);
}

static inline struct tnl_vport *tnl_vport_table_cast(const struct tbl_node *node)
{
        return container_of(node, struct tnl_vport, tbl_node);
}

/* RCU callback. */
static void free_config(struct rcu_head *rcu)
{
        struct tnl_mutable_config *c = container_of(rcu, struct tnl_mutable_config, rcu);
        kfree(c);
}

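/*
 * Publishes a new mutable config for 'vport' and frees the old one once all
 * current RCU readers have finished with it.
 */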
static void assign_config_rcu(struct vport *vport,
                              struct tnl_mutable_config *new_config)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        struct tnl_mutable_config *old_config;

        old_config = rcu_dereference(tnl_vport->mutable);
        rcu_assign_pointer(tnl_vport->mutable, new_config);
        call_rcu(&old_config->rcu, free_config);
}

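/*
 * Returns the counter that tracks how many ports of this category exist
 * (exact-key vs. flow-matched, with or without a local address).
 * tnl_find_port() uses these counters to skip lookups that cannot match.
 */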
static unsigned int *find_port_pool(const struct tnl_mutable_config *mutable)
{
        if (mutable->port_config.flags & TNL_F_IN_KEY_MATCH) {
                if (mutable->port_config.saddr)
                        return &local_remote_ports;
                else
                        return &remote_ports;
        } else {
                if (mutable->port_config.saddr)
                        return &key_local_remote_ports;
                else
                        return &key_remote_ports;
        }
}

enum lookup_key {
        LOOKUP_TUNNEL_TYPE = 0,
        LOOKUP_SADDR = 1,
        LOOKUP_DADDR = 2,
        LOOKUP_KEY = 3,
};

struct port_lookup_key {
        u32 vals[4];            /* Contains enum lookup_key keys. */
        const struct tnl_mutable_config *mutable;
};

/*
 * Modifies 'target' to store the rcu_dereferenced pointer that was used to do
 * the comparison.
 */
static int port_cmp(const struct tbl_node *node, void *target)
{
        const struct tnl_vport *tnl_vport = tnl_vport_table_cast(node);
        struct port_lookup_key *lookup = target;

        lookup->mutable = rcu_dereference(tnl_vport->mutable);

        return (lookup->mutable->tunnel_type == lookup->vals[LOOKUP_TUNNEL_TYPE]) &&
               lookup->mutable->port_config.daddr == lookup->vals[LOOKUP_DADDR] &&
               lookup->mutable->port_config.in_key == lookup->vals[LOOKUP_KEY] &&
               lookup->mutable->port_config.saddr == lookup->vals[LOOKUP_SADDR];
}

static u32 port_hash(struct port_lookup_key *lookup)
{
        return jhash2(lookup->vals, ARRAY_SIZE(lookup->vals), 0);
}

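/*
 * Adds 'vport' to the global port table, creating the table on first use and
 * expanding it when the number of ports exceeds the number of buckets.  The
 * entry is hashed on (tunnel type, saddr, daddr, in_key) and the matching
 * port-category counter is incremented.
 */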
static int add_port(struct vport *vport)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        struct port_lookup_key lookup;
        int err;

        if (!port_table) {
                struct tbl *new_table;

                new_table = tbl_create(0);
                if (!new_table)
                        return -ENOMEM;

                rcu_assign_pointer(port_table, new_table);

        } else if (tbl_count(port_table) > tbl_n_buckets(port_table)) {
                struct tbl *old_table = port_table;
                struct tbl *new_table;

                new_table = tbl_expand(old_table);
                if (IS_ERR(new_table))
                        return PTR_ERR(new_table);

                rcu_assign_pointer(port_table, new_table);
                tbl_deferred_destroy(old_table, NULL);
        }

        lookup.vals[LOOKUP_SADDR] = tnl_vport->mutable->port_config.saddr;
        lookup.vals[LOOKUP_DADDR] = tnl_vport->mutable->port_config.daddr;
        lookup.vals[LOOKUP_KEY] = tnl_vport->mutable->port_config.in_key;
        lookup.vals[LOOKUP_TUNNEL_TYPE] = tnl_vport->mutable->tunnel_type;

        err = tbl_insert(port_table, &tnl_vport->tbl_node, port_hash(&lookup));
        if (err)
                return err;

        (*find_port_pool(tnl_vport->mutable))++;

        return 0;
}

static int del_port(struct vport *vport)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        int err;

        err = tbl_remove(port_table, &tnl_vport->tbl_node);
        if (err)
                return err;

        (*find_port_pool(tnl_vport->mutable))--;

        return 0;
}

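/*
 * Looks up the tunnel vport that should receive a packet with the given outer
 * addresses and key.  Ports with an exact key are tried first (with and then
 * without a local address), followed by flow-based ports that accept any key.
 * The per-category counters let us skip categories with no ports.  On a
 * match, '*mutable' is set to the config that was used for the comparison.
 */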
struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be32 key,
                            int tunnel_type,
                            const struct tnl_mutable_config **mutable)
{
        struct port_lookup_key lookup;
        struct tbl *table = rcu_dereference(port_table);
        struct tbl_node *tbl_node;

        if (!table)
                return NULL;

        lookup.vals[LOOKUP_SADDR] = saddr;
        lookup.vals[LOOKUP_DADDR] = daddr;

        if (tunnel_type & TNL_T_KEY_EXACT) {
                lookup.vals[LOOKUP_KEY] = key;
                lookup.vals[LOOKUP_TUNNEL_TYPE] = tunnel_type & ~TNL_T_KEY_MATCH;

                if (key_local_remote_ports) {
                        tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
                        if (tbl_node)
                                goto found;
                }

                if (key_remote_ports) {
                        lookup.vals[LOOKUP_SADDR] = 0;

                        tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
                        if (tbl_node)
                                goto found;

                        lookup.vals[LOOKUP_SADDR] = saddr;
                }
        }

        if (tunnel_type & TNL_T_KEY_MATCH) {
                lookup.vals[LOOKUP_KEY] = 0;
                lookup.vals[LOOKUP_TUNNEL_TYPE] = tunnel_type & ~TNL_T_KEY_EXACT;

                if (local_remote_ports) {
                        tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
                        if (tbl_node)
                                goto found;
                }

                if (remote_ports) {
                        lookup.vals[LOOKUP_SADDR] = 0;

                        tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
                        if (tbl_node)
                                goto found;
                }
        }

        return NULL;

found:
        *mutable = lookup.mutable;
        return tnl_vport_to_vport(tnl_vport_table_cast(tbl_node));
}

static bool check_ipv4_address(__be32 addr)
{
        if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr)
            || ipv4_is_loopback(addr) || ipv4_is_zeronet(addr))
                return false;

        return true;
}

static bool ipv4_should_icmp(struct sk_buff *skb)
{
        struct iphdr *old_iph = ip_hdr(skb);

        /* Don't respond to L2 broadcast. */
        if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
                return false;

        /* Don't respond to L3 broadcast or invalid addresses. */
        if (!check_ipv4_address(old_iph->daddr) ||
            !check_ipv4_address(old_iph->saddr))
                return false;

        /* Only respond to the first fragment. */
        if (old_iph->frag_off & htons(IP_OFFSET))
                return false;

        /* Don't respond to ICMP error messages. */
        if (old_iph->protocol == IPPROTO_ICMP) {
                u8 icmp_type, *icmp_typep;

                icmp_typep = skb_header_pointer(skb, (u8 *)old_iph +
                                                (old_iph->ihl << 2) +
                                                offsetof(struct icmphdr, type) -
                                                skb->data, sizeof(icmp_type),
                                                &icmp_type);

                if (!icmp_typep)
                        return false;

                if (*icmp_typep > NR_ICMP_TYPES
                    || (*icmp_typep <= ICMP_PARAMETERPROB
                        && *icmp_typep != ICMP_ECHOREPLY
                        && *icmp_typep != ICMP_ECHO))
                        return false;
        }

        return true;
}

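/*
 * Fills 'nskb' with an ICMP "fragmentation needed" reply to 'skb': an IPv4
 * header addressed back to the original sender, the ICMP header carrying
 * 'mtu', and the first 'payload_length' bytes of the original packet, with
 * both checksums computed.
 */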
static void ipv4_build_icmp(struct sk_buff *skb, struct sk_buff *nskb,
                            unsigned int mtu, unsigned int payload_length)
{
        struct iphdr *iph, *old_iph = ip_hdr(skb);
        struct icmphdr *icmph;
        u8 *payload;

        iph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
        icmph = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr));
        payload = skb_put(nskb, payload_length);

        /* IP */
        iph->version = 4;
        iph->ihl = sizeof(struct iphdr) >> 2;
        iph->tos = (old_iph->tos & IPTOS_TOS_MASK) |
                   IPTOS_PREC_INTERNETCONTROL;
        iph->tot_len = htons(sizeof(struct iphdr)
                             + sizeof(struct icmphdr)
                             + payload_length);
        get_random_bytes(&iph->id, sizeof(iph->id));
        iph->frag_off = 0;
        iph->ttl = IPDEFTTL;
        iph->protocol = IPPROTO_ICMP;
        iph->daddr = old_iph->saddr;
        iph->saddr = old_iph->daddr;

        ip_send_check(iph);

        /* ICMP */
        icmph->type = ICMP_DEST_UNREACH;
        icmph->code = ICMP_FRAG_NEEDED;
        icmph->un.gateway = htonl(mtu);
        icmph->checksum = 0;

        nskb->csum = csum_partial((u8 *)icmph, sizeof(struct icmphdr), 0);
        nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_iph - skb->data,
                                            payload, payload_length,
                                            nskb->csum);
        icmph->checksum = csum_fold(nskb->csum);
}

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
static bool ipv6_should_icmp(struct sk_buff *skb)
{
        struct ipv6hdr *old_ipv6h = ipv6_hdr(skb);
        int addr_type;
        int payload_off = (u8 *)(old_ipv6h + 1) - skb->data;
        u8 nexthdr = ipv6_hdr(skb)->nexthdr;

        /* Check source address is valid. */
        addr_type = ipv6_addr_type(&old_ipv6h->saddr);
        if (addr_type & IPV6_ADDR_MULTICAST || addr_type == IPV6_ADDR_ANY)
                return false;

        /* Don't reply to unspecified addresses. */
        if (ipv6_addr_type(&old_ipv6h->daddr) == IPV6_ADDR_ANY)
                return false;

        /* Don't respond to ICMP error messages. */
        payload_off = ipv6_skip_exthdr(skb, payload_off, &nexthdr);
        if (payload_off < 0)
                return false;

        if (nexthdr == NEXTHDR_ICMP) {
                u8 icmp_type, *icmp_typep;

                icmp_typep = skb_header_pointer(skb, payload_off +
                                                offsetof(struct icmp6hdr,
                                                         icmp6_type),
                                                sizeof(icmp_type), &icmp_type);

                if (!icmp_typep || !(*icmp_typep & ICMPV6_INFOMSG_MASK))
                        return false;
        }

        return true;
}

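/*
 * IPv6 counterpart of ipv4_build_icmp(): fills 'nskb' with an ICMPv6 "packet
 * too big" message carrying 'mtu' and the start of the original packet.
 */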
static void ipv6_build_icmp(struct sk_buff *skb, struct sk_buff *nskb,
                            unsigned int mtu, unsigned int payload_length)
{
        struct ipv6hdr *ipv6h, *old_ipv6h = ipv6_hdr(skb);
        struct icmp6hdr *icmp6h;
        u8 *payload;

        ipv6h = (struct ipv6hdr *)skb_put(nskb, sizeof(struct ipv6hdr));
        icmp6h = (struct icmp6hdr *)skb_put(nskb, sizeof(struct icmp6hdr));
        payload = skb_put(nskb, payload_length);

        /* IPv6 */
        ipv6h->version = 6;
        ipv6h->priority = 0;
        memset(&ipv6h->flow_lbl, 0, sizeof(ipv6h->flow_lbl));
        ipv6h->payload_len = htons(sizeof(struct icmp6hdr)
                                   + payload_length);
        ipv6h->nexthdr = NEXTHDR_ICMP;
        ipv6h->hop_limit = IPV6_DEFAULT_HOPLIMIT;
        ipv6_addr_copy(&ipv6h->daddr, &old_ipv6h->saddr);
        ipv6_addr_copy(&ipv6h->saddr, &old_ipv6h->daddr);

        /* ICMPv6 */
        icmp6h->icmp6_type = ICMPV6_PKT_TOOBIG;
        icmp6h->icmp6_code = 0;
        icmp6h->icmp6_cksum = 0;
        icmp6h->icmp6_mtu = htonl(mtu);

        nskb->csum = csum_partial((u8 *)icmp6h, sizeof(struct icmp6hdr), 0);
        nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_ipv6h - skb->data,
                                            payload, payload_length,
                                            nskb->csum);
        icmp6h->icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
                                              sizeof(struct icmp6hdr)
                                              + payload_length,
                                              ipv6h->nexthdr, nskb->csum);
}
#endif /* IPv6 */

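/*
 * Emulates Path MTU Discovery across the tunnel: builds an ICMP
 * "fragmentation needed" (or ICMPv6 "packet too big") message for 'skb',
 * addressed back to the original sender and carrying 'mtu', and injects it
 * through the vport as if it had been received from the far end.  Returns
 * true if the caller should drop the original packet (a reply was sent, or
 * none is warranted for this packet); returns false if no reply could be
 * generated and the caller should continue with the packet.
 */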
bool tnl_frag_needed(struct vport *vport, const struct tnl_mutable_config *mutable,
                     struct sk_buff *skb, unsigned int mtu, __be32 flow_key)
{
        unsigned int eth_hdr_len = ETH_HLEN;
        unsigned int total_length = 0, header_length = 0, payload_length;
        struct ethhdr *eh, *old_eh = eth_hdr(skb);
        struct sk_buff *nskb;

        /* Sanity check */
        if (skb->protocol == htons(ETH_P_IP)) {
                if (mtu < IP_MIN_MTU)
                        return false;

                if (!ipv4_should_icmp(skb))
                        return true;
        }
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                if (mtu < IPV6_MIN_MTU)
                        return false;

                /*
                 * In theory we should do PMTUD on IPv6 multicast messages but
                 * we don't have an address to send from so just fragment.
                 */
                if (ipv6_addr_type(&ipv6_hdr(skb)->daddr) & IPV6_ADDR_MULTICAST)
                        return false;

                if (!ipv6_should_icmp(skb))
                        return true;
        }
#endif
        else
                return false;

        /* Allocate */
        if (old_eh->h_proto == htons(ETH_P_8021Q))
                eth_hdr_len = VLAN_ETH_HLEN;

        payload_length = skb->len - eth_hdr_len;
        if (skb->protocol == htons(ETH_P_IP)) {
                header_length = sizeof(struct iphdr) + sizeof(struct icmphdr);
                total_length = min_t(unsigned int, header_length +
                                     payload_length, 576);
        }
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else {
                header_length = sizeof(struct ipv6hdr) +
                                sizeof(struct icmp6hdr);
                total_length = min_t(unsigned int, header_length +
                                     payload_length, IPV6_MIN_MTU);
        }
#endif

        total_length = min(total_length, mutable->mtu);
        payload_length = total_length - header_length;

        nskb = dev_alloc_skb(NET_IP_ALIGN + eth_hdr_len + header_length +
                             payload_length);
        if (!nskb)
                return false;

        skb_reserve(nskb, NET_IP_ALIGN);

        /* Ethernet / VLAN */
        eh = (struct ethhdr *)skb_put(nskb, eth_hdr_len);
        memcpy(eh->h_dest, old_eh->h_source, ETH_ALEN);
        memcpy(eh->h_source, mutable->eth_addr, ETH_ALEN);
        nskb->protocol = eh->h_proto = old_eh->h_proto;
        if (old_eh->h_proto == htons(ETH_P_8021Q)) {
                struct vlan_ethhdr *vh = (struct vlan_ethhdr *)eh;

                vh->h_vlan_TCI = vlan_eth_hdr(skb)->h_vlan_TCI;
                vh->h_vlan_encapsulated_proto = skb->protocol;
        }
        skb_reset_mac_header(nskb);

        /* Protocol */
        if (skb->protocol == htons(ETH_P_IP))
                ipv4_build_icmp(skb, nskb, mtu, payload_length);
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else
                ipv6_build_icmp(skb, nskb, mtu, payload_length);
#endif

        /*
         * Assume that flow based keys are symmetric with respect to input
         * and output and use the key that we were going to put on the
         * outgoing packet for the fake received packet. If the keys are
         * not symmetric then PMTUD needs to be disabled since we won't have
         * any way of synthesizing packets.
         */
        if ((mutable->port_config.flags & (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION)) ==
            (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION))
                OVS_CB(nskb)->tun_id = flow_key;

        compute_ip_summed(nskb, false);
        vport_receive(vport, nskb);

        return true;
}

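/*
 * Makes sure 'skb' has at least 'headroom' bytes of writable headroom,
 * reallocating it if necessary.  On reallocation or failure the original skb
 * is freed; the caller must use the returned pointer (which may be an
 * ERR_PTR) instead.
 */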
static struct sk_buff *check_headroom(struct sk_buff *skb, int headroom)
{
        if (skb_headroom(skb) < headroom || skb_header_cloned(skb)) {
                struct sk_buff *nskb = skb_realloc_headroom(skb, headroom + 16);
                if (unlikely(!nskb)) {
                        kfree_skb(skb);
                        return ERR_PTR(-ENOMEM);
                }

                set_skb_csum_bits(skb, nskb);

                if (skb->sk)
                        skb_set_owner_w(nskb, skb->sk);

                dev_kfree_skb(skb);
                return nskb;
        }

        return skb;
}

static inline u8 ecn_encapsulate(u8 tos, struct sk_buff *skb)
{
        u8 inner;

        if (skb->protocol == htons(ETH_P_IP))
                inner = ((struct iphdr *)skb_network_header(skb))->tos;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else if (skb->protocol == htons(ETH_P_IPV6))
                inner = ipv6_get_dsfield((struct ipv6hdr *)skb_network_header(skb));
#endif
        else
                inner = 0;

        return INET_ECN_encapsulate(tos, inner);
}

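/*
 * On decapsulation, if the outer IP header was marked Congestion Experienced,
 * propagate the CE mark to the inner IPv4 or IPv6 header (skipping over a
 * VLAN tag if present) so that the congestion signal is not lost.
 */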
static inline void ecn_decapsulate(struct sk_buff *skb)
{
        u8 tos = ip_hdr(skb)->tos;

        if (INET_ECN_is_ce(tos)) {
                __be16 protocol = skb->protocol;
                unsigned int nw_header = skb_network_header(skb) - skb->data;

                if (skb->protocol == htons(ETH_P_8021Q)) {
                        if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
                                return;

                        protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
                        nw_header += VLAN_HLEN;
                }

                if (protocol == htons(ETH_P_IP)) {
                        if (unlikely(!pskb_may_pull(skb, nw_header
                                                    + sizeof(struct iphdr))))
                                return;

                        IP_ECN_set_ce((struct iphdr *)(nw_header + skb->data));
                }
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
                else if (protocol == htons(ETH_P_IPV6)) {
                        if (unlikely(!pskb_may_pull(skb, nw_header
                                                    + sizeof(struct ipv6hdr))))
                                return;

                        IP6_ECN_set_ce((struct ipv6hdr *)(nw_header
                                                          + skb->data));
                }
#endif
        }
}

static struct sk_buff *handle_gso(struct sk_buff *skb)
{
        if (skb_is_gso(skb)) {
                struct sk_buff *nskb = skb_gso_segment(skb, 0);

                dev_kfree_skb(skb);
                return nskb;
        }

        return skb;
}

static int handle_csum_offload(struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL)
                return skb_checksum_help(skb);
        else {
                skb->ip_summed = CHECKSUM_NONE;
                return 0;
        }
}

/* Called with rcu_read_lock. */
void tnl_rcv(struct vport *vport, struct sk_buff *skb)
{
        skb->pkt_type = PACKET_HOST;
        skb->protocol = eth_type_trans(skb, skb->dev);

        skb_dst_drop(skb);
        nf_reset(skb);
        secpath_reset(skb);
        skb_reset_network_header(skb);

        ecn_decapsulate(skb);

        skb_push(skb, ETH_HLEN);
        compute_ip_summed(skb, false);

        vport_receive(vport, skb);
}

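/*
 * Transmits one (possibly GSO-segmented) packet: ensures headroom and
 * checksum state, performs the PMTUD check against 'mtu', prepends the outer
 * IP header from the template 'iph', lets the protocol fill in its own header
 * via build_header(), and hands the result to ip_local_out().  Returns the
 * original length of 'skb' on success or 0 on error.
 */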
static int build_packet(struct vport *vport, const struct tnl_mutable_config *mutable,
                        struct iphdr *iph, struct rtable *rt, int max_headroom,
                        int mtu, struct sk_buff *skb)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        int err;
        struct iphdr *new_iph;
        int orig_len = skb->len;
        __be16 frag_off = iph->frag_off;

        skb = check_headroom(skb, max_headroom);
        if (unlikely(IS_ERR(skb)))
                goto error;

        err = handle_csum_offload(skb);
        if (unlikely(err))
                goto error_free;

        if (skb->protocol == htons(ETH_P_IP)) {
                struct iphdr *old_iph = ip_hdr(skb);

                if ((old_iph->frag_off & htons(IP_DF)) &&
                    mtu < ntohs(old_iph->tot_len)) {
                        if (tnl_frag_needed(vport, mutable, skb, mtu, OVS_CB(skb)->tun_id))
                                goto error_free;
                }

        }
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                unsigned int packet_length = skb->len - ETH_HLEN
                        - (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);

                /* IPv6 requires PMTUD if the packet is above the minimum MTU. */
                if (packet_length > IPV6_MIN_MTU)
                        frag_off = htons(IP_DF);

                if (mtu < packet_length) {
                        if (tnl_frag_needed(vport, mutable, skb, mtu, OVS_CB(skb)->tun_id))
                                goto error_free;
                }
        }
#endif

        new_iph = (struct iphdr *)skb_push(skb, mutable->tunnel_hlen);
        skb_reset_network_header(skb);
        skb_set_transport_header(skb, sizeof(struct iphdr));

        memcpy(new_iph, iph, sizeof(struct iphdr));
        new_iph->frag_off = frag_off;
        ip_select_ident(new_iph, &rt_dst(rt), NULL);

        tnl_vport->tnl_ops->build_header(skb, vport, mutable);

        /* Allow our local IP stack to fragment the outer packet even if the
         * DF bit is set as a last resort. */
        skb->local_df = 1;

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        IPCB(skb)->flags = 0;

        err = ip_local_out(skb);
        if (likely(net_xmit_eval(err) == 0))
                return orig_len;
        else {
                vport_record_error(vport, VPORT_E_TX_ERROR);
                return 0;
        }

error_free:
        kfree_skb(skb);
error:
        vport_record_error(vport, VPORT_E_TX_DROPPED);

        return 0;
}

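/*
 * Top-level transmit path for tunnel vports.  Validates the inner protocol
 * headers, builds a template outer IP header (TOS, TTL and DF either taken
 * from the port config or inherited from the inner packet), routes the outer
 * packet, computes the effective MTU for PMTUD, segments GSO packets, and
 * then sends each segment through build_packet().  Returns the number of
 * bytes accepted for transmission, or 0 on error.
 */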
int tnl_send(struct vport *vport, struct sk_buff *skb)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        const struct tnl_mutable_config *mutable = rcu_dereference(tnl_vport->mutable);

        struct iphdr *old_iph;
        int orig_len;
        struct iphdr iph;
        struct rtable *rt;
        int max_headroom;
        int mtu;

        /* Validate the protocol headers before we try to use them. */
        if (skb->protocol == htons(ETH_P_8021Q)) {
                if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
                        goto error_free;

                skb->protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
                skb_set_network_header(skb, VLAN_ETH_HLEN);
        }

        if (skb->protocol == htons(ETH_P_IP)) {
                if (unlikely(!pskb_may_pull(skb, skb_network_header(skb)
                                            + sizeof(struct iphdr) - skb->data)))
                        skb->protocol = 0;
        }
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                if (unlikely(!pskb_may_pull(skb, skb_network_header(skb)
                                            + sizeof(struct ipv6hdr) - skb->data)))
                        skb->protocol = 0;
        }
#endif
        old_iph = ip_hdr(skb);

        iph.tos = mutable->port_config.tos;
        if (mutable->port_config.flags & TNL_F_TOS_INHERIT) {
                if (skb->protocol == htons(ETH_P_IP))
                        iph.tos = old_iph->tos;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        iph.tos = ipv6_get_dsfield(ipv6_hdr(skb));
#endif
        }
        iph.tos = ecn_encapsulate(iph.tos, skb);

        {
                struct flowi fl = { .nl_u = { .ip4_u =
                                    { .daddr = mutable->port_config.daddr,
                                      .saddr = mutable->port_config.saddr,
                                      .tos = RT_TOS(iph.tos) } },
                                    .proto = tnl_vport->tnl_ops->ipproto };

                if (unlikely(ip_route_output_key(&init_net, &rt, &fl)))
                        goto error_free;
        }

        iph.ttl = mutable->port_config.ttl;
        if (mutable->port_config.flags & TNL_F_TTL_INHERIT) {
                if (skb->protocol == htons(ETH_P_IP))
                        iph.ttl = old_iph->ttl;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        iph.ttl = ipv6_hdr(skb)->hop_limit;
#endif
        }
        if (!iph.ttl)
                iph.ttl = dst_metric(&rt_dst(rt), RTAX_HOPLIMIT);

        iph.frag_off = (mutable->port_config.flags & TNL_F_PMTUD) ? htons(IP_DF) : 0;
        if (iph.frag_off)
                mtu = dst_mtu(&rt_dst(rt))
                      - ETH_HLEN
                      - mutable->tunnel_hlen
                      - (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);
        else
                mtu = mutable->mtu;

        if (skb->protocol == htons(ETH_P_IP)) {
                iph.frag_off |= old_iph->frag_off & htons(IP_DF);
                mtu = max(mtu, IP_MIN_MTU);
        }
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else if (skb->protocol == htons(ETH_P_IPV6))
                mtu = max(mtu, IPV6_MIN_MTU);
#endif

        iph.version = 4;
        iph.ihl = sizeof(struct iphdr) >> 2;
        iph.protocol = tnl_vport->tnl_ops->ipproto;
        iph.daddr = rt->rt_dst;
        iph.saddr = rt->rt_src;

        nf_reset(skb);
        secpath_reset(skb);
        skb_dst_drop(skb);
        skb_dst_set(skb, &rt_dst(rt));

        /*
         * If we are doing GSO on a pskb it is better to make sure that the
         * headroom is correct now. We will only have to copy the portion in
         * the linear data area and GSO will preserve headroom when it creates
         * the segments. This is particularly beneficial on Xen where we get
         * lots of GSO pskbs. Conversely, we delay copying if it is just to
         * get our own writable clone because GSO may do the copy for us.
         */
        max_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
                       + mutable->tunnel_hlen;

        if (skb_headroom(skb) < max_headroom) {
                skb = check_headroom(skb, max_headroom);
                if (unlikely(IS_ERR(skb))) {
                        vport_record_error(vport, VPORT_E_TX_DROPPED);
                        goto error;
                }
        }

        forward_ip_summed(skb);

        if (unlikely(vswitch_skb_checksum_setup(skb)))
                goto error_free;

        skb = handle_gso(skb);
        if (unlikely(IS_ERR(skb))) {
                vport_record_error(vport, VPORT_E_TX_DROPPED);
                goto error;
        }

        /*
         * Process GSO segments. Try to do any work for the entire packet that
         * doesn't involve actually writing to it before this point.
         */
        orig_len = 0;
        do {
                struct sk_buff *next_skb = skb->next;
                skb->next = NULL;

                orig_len += build_packet(vport, mutable, &iph, rt, max_headroom, mtu, skb);

                skb = next_skb;
        } while (skb);

        return orig_len;

error_free:
        kfree_skb(skb);
        vport_record_error(vport, VPORT_E_TX_ERROR);
error:
        return 0;
}

int tnl_init(void)
{
        return 0;
}

void tnl_exit(void)
{
        tbl_destroy(port_table, NULL);
        port_table = NULL;
}

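/*
 * Copies the port configuration from userspace into 'mutable', computes the
 * tunnel header length and tunnel type flags, and rejects the config if the
 * destination address is missing or another port already uses the same
 * (saddr, daddr, key, type) tuple.
 */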
static int set_config(const void __user *uconfig, const struct tnl_ops *tnl_ops,
                      const struct vport *cur_vport,
                      struct tnl_mutable_config *mutable)
{
        const struct vport *old_vport;
        const struct tnl_mutable_config *old_mutable;

        if (copy_from_user(&mutable->port_config, uconfig, sizeof(struct tnl_port_config)))
                return -EFAULT;

        mutable->tunnel_hlen = tnl_ops->hdr_len(&mutable->port_config);
        if (mutable->tunnel_hlen < 0)
                return mutable->tunnel_hlen;

        mutable->tunnel_hlen += sizeof(struct iphdr);

        if (mutable->port_config.daddr == 0)
                return -EINVAL;

        mutable->tunnel_type = tnl_ops->tunnel_type;
        if (mutable->port_config.flags & TNL_F_IN_KEY_MATCH) {
                mutable->tunnel_type |= TNL_T_KEY_MATCH;
                mutable->port_config.in_key = 0;
        } else
                mutable->tunnel_type |= TNL_T_KEY_EXACT;

        old_vport = tnl_find_port(mutable->port_config.saddr,
                                  mutable->port_config.daddr,
                                  mutable->port_config.in_key,
                                  mutable->tunnel_type,
                                  &old_mutable);

        if (old_vport && old_vport != cur_vport)
                return -EEXIST;

        if (mutable->port_config.flags & TNL_F_OUT_KEY_ACTION)
                mutable->port_config.out_key = 0;

        return 0;
}

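/*
 * Common constructor for tunnel vports: allocates the vport and its private
 * tnl_vport, fills in the mutable config from userspace, and registers the
 * new port in the lookup table.
 */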
struct vport *tnl_create(const char *name, const void __user *config,
                         const struct vport_ops *vport_ops,
                         const struct tnl_ops *tnl_ops)
{
        struct vport *vport;
        struct tnl_vport *tnl_vport;
        int err;

        vport = vport_alloc(sizeof(struct tnl_vport), vport_ops);
        if (IS_ERR(vport)) {
                err = PTR_ERR(vport);
                goto error;
        }

        tnl_vport = tnl_vport_priv(vport);

        strcpy(tnl_vport->name, name);
        tnl_vport->tnl_ops = tnl_ops;

        tnl_vport->mutable = kmalloc(sizeof(struct tnl_mutable_config), GFP_KERNEL);
        if (!tnl_vport->mutable) {
                err = -ENOMEM;
                goto error_free_vport;
        }

        vport_gen_rand_ether_addr(tnl_vport->mutable->eth_addr);
        tnl_vport->mutable->mtu = ETH_DATA_LEN;

        err = set_config(config, tnl_ops, NULL, tnl_vport->mutable);
        if (err)
                goto error_free_mutable;

        err = add_port(vport);
        if (err)
                goto error_free_mutable;

        return vport;

error_free_mutable:
        kfree(tnl_vport->mutable);
error_free_vport:
        vport_free(vport);
error:
        return ERR_PTR(err);
}

int tnl_modify(struct vport *vport, const void __user *config)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        struct tnl_mutable_config *mutable;
        int err;
        bool update_hash = false;

        mutable = kmemdup(tnl_vport->mutable, sizeof(struct tnl_mutable_config), GFP_KERNEL);
        if (!mutable) {
                err = -ENOMEM;
                goto error;
        }

        err = set_config(config, tnl_vport->tnl_ops, vport, mutable);
        if (err)
                goto error_free;

        /*
         * Only remove the port from the hash table if something that would
         * affect the lookup has changed.
         */
        if (tnl_vport->mutable->port_config.saddr != mutable->port_config.saddr ||
            tnl_vport->mutable->port_config.daddr != mutable->port_config.daddr ||
            tnl_vport->mutable->port_config.in_key != mutable->port_config.in_key ||
            (tnl_vport->mutable->port_config.flags & TNL_F_IN_KEY_MATCH) !=
            (mutable->port_config.flags & TNL_F_IN_KEY_MATCH))
                update_hash = true;

        /*
         * This update is not atomic but the lookup uses the config, which
         * serves as an inherent double check.
         */
        if (update_hash) {
                err = del_port(vport);
                if (err)
                        goto error_free;
        }

        assign_config_rcu(vport, mutable);

        if (update_hash) {
                err = add_port(vport);
                if (err)
                        goto error_free;
        }

        return 0;

error_free:
        kfree(mutable);
error:
        return err;
}

static void free_port(struct rcu_head *rcu)
{
        struct tnl_vport *tnl_vport = container_of(rcu, struct tnl_vport, rcu);

        kfree(tnl_vport->mutable);
        vport_free(tnl_vport_to_vport(tnl_vport));
}

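/*
 * Tears down a tunnel vport.  The port is removed from the lookup table only
 * if it is still the port that a lookup on its own config would find, which
 * avoids removing somebody else's entry if this port was never successfully
 * added.  The vport itself is freed after an RCU grace period.
 */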
int tnl_destroy(struct vport *vport)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        const struct tnl_mutable_config *old_mutable;

        if (vport == tnl_find_port(tnl_vport->mutable->port_config.saddr,
                                   tnl_vport->mutable->port_config.daddr,
                                   tnl_vport->mutable->port_config.in_key,
                                   tnl_vport->mutable->tunnel_type,
                                   &old_mutable))
                del_port(vport);

        call_rcu(&tnl_vport->rcu, free_port);

        return 0;
}

int tnl_set_mtu(struct vport *vport, int mtu)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        struct tnl_mutable_config *mutable;

        mutable = kmemdup(tnl_vport->mutable, sizeof(struct tnl_mutable_config), GFP_KERNEL);
        if (!mutable)
                return -ENOMEM;

        mutable->mtu = mtu;
        assign_config_rcu(vport, mutable);

        return 0;
}

int tnl_set_addr(struct vport *vport, const unsigned char *addr)
{
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        struct tnl_mutable_config *mutable;

        mutable = kmemdup(tnl_vport->mutable, sizeof(struct tnl_mutable_config), GFP_KERNEL);
        if (!mutable)
                return -ENOMEM;

        memcpy(mutable->eth_addr, addr, ETH_ALEN);
        assign_config_rcu(vport, mutable);

        return 0;
}

const char *tnl_get_name(const struct vport *vport)
{
        const struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        return tnl_vport->name;
}

const unsigned char *tnl_get_addr(const struct vport *vport)
{
        const struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        return rcu_dereference(tnl_vport->mutable)->eth_addr;
}

int tnl_get_mtu(const struct vport *vport)
{
        const struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        return rcu_dereference(tnl_vport->mutable)->mtu;
}