/*
 * Stateless TCP Tunnel (STT) vport.
 *
 * Copyright (c) 2015 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>

#include <linux/delay.h>
#include <linux/flex_array.h>
#include <linux/if_vlan.h>
#include <linux/ipv6.h>
#include <linux/jhash.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/module.h>
#include <linux/net.h>
#include <linux/netfilter.h>
#include <linux/percpu.h>
#include <linux/skbuff.h>
#include <linux/tcp.h>
#include <linux/workqueue.h>

#include <net/dst_metadata.h>
#include <net/inet_ecn.h>
#include <net/ip_tunnels.h>
#include <net/ip6_checksum.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#define STT_NETDEV_VER	"0.1"
#define STT_DST_PORT 7471

/* We saw better performance when zero copy was skipped in the SLUB case,
 * so skip zero copy for SLUB.
 */
#define SKIP_ZERO_COPY
/* @list: Per-net list of STT ports.
 * @rcv: Callback invoked on STT packet receive.  STT reassembly can generate
 *	 multiple packets; in that case the first packet carries the tunnel
 *	 outer header and the rest are inner packet segments with no STT
 *	 header.
 * @rcv_data: user data.
 * @sock: Fake TCP socket for the STT port.
 */
struct stt_dev {
	struct net_device	*dev;
	struct net		*net;
	struct socket		*sock;
	__be16			dst_port;
	struct list_head	next;
	struct list_head	up_next;
};
#define STT_CSUM_VERIFIED	BIT(0)
#define STT_CSUM_PARTIAL	BIT(1)
#define STT_PROTO_IPV4		BIT(2)
#define STT_PROTO_TCP		BIT(3)
#define STT_PROTO_TYPES		(STT_PROTO_IPV4 | STT_PROTO_TCP)

#define SUPPORTED_GSO_TYPES (SKB_GSO_TCPV4 | SKB_GSO_UDP | SKB_GSO_DODGY | \
			     SKB_GSO_TCP_ECN | SKB_GSO_TCPV6)
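
/* Illustration (added note, not in the original sources): the two
 * STT_PROTO_* bits above encode four inner packet types, decoded by
 * set_offloads() further down:
 *
 *   STT_PROTO_IPV4 | STT_PROTO_TCP   TCP over IPv4
 *   STT_PROTO_TCP                    TCP over IPv6
 *   STT_PROTO_IPV4                   UDP over IPv4
 *   (neither bit set)                UDP over IPv6
 */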
/* The length and offset of a fragment are encoded in the sequence number.
 * STT_SEQ_LEN_SHIFT is the left shift needed to store the length.
 * STT_SEQ_OFFSET_MASK is the mask to extract the offset.
 */
#define STT_SEQ_LEN_SHIFT 16
#define STT_SEQ_OFFSET_MASK (BIT(STT_SEQ_LEN_SHIFT) - 1)
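
/* Illustrative helpers (added sketch, not part of the original file): a
 * fragment of a 1500-byte packet starting at offset 200 is sent with
 * seq = (1500 << STT_SEQ_LEN_SHIFT) | 200, and the receiver undoes the
 * packing as below; reassemble() open-codes the same two operations.
 */
static inline u32 stt_seq_tot_len(u32 seq)
{
	/* The upper 16 bits carry the total packet length. */
	return seq >> STT_SEQ_LEN_SHIFT;
}

static inline u32 stt_seq_frag_offset(u32 seq)
{
	/* The lower 16 bits carry this fragment's byte offset. */
	return seq & STT_SEQ_OFFSET_MASK;
}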
/* The maximum amount of memory used to store packets waiting to be reassembled
 * on a given CPU.  Once this threshold is exceeded we will begin freeing the
 * least recently used fragments.
 */
#define REASM_HI_THRESH		(4 * 1024 * 1024)
/* The target for the high memory evictor.  Once we have exceeded
 * REASM_HI_THRESH, we will continue freeing fragments until we hit
 * this limit.
 */
#define REASM_LO_THRESH		(3 * 1024 * 1024)
/* The length of time a given packet has to be reassembled from the time the
 * first fragment arrives.  Once this limit is exceeded it becomes available
 * for deletion.
 */
#define FRAG_EXP_TIME		(30 * HZ)
/* Number of hash entries.  Each entry has only a single slot to hold a packet
 * so if there are collisions, we will drop packets.  This is allocated
 * per-cpu and each entry consists of struct pkt_frag.
 */
#define FRAG_HASH_SHIFT		8
#define FRAG_HASH_ENTRIES	BIT(FRAG_HASH_SHIFT)
#define FRAG_HASH_SEGS		((sizeof(u32) * 8) / FRAG_HASH_SHIFT)

#define CLEAN_PERCPU_INTERVAL	(30 * HZ)
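
/* Illustrative sketch (added, not in the original file): a 32-bit hash is
 * consumed FRAG_HASH_SHIFT bits at a time, so each key can probe
 * FRAG_HASH_SEGS = 4 candidate buckets; lookup_frag() below walks them in
 * exactly this order.
 */
static inline u32 frag_hash_bucket(u32 hash, int seg)
{
	return (hash >> (seg * FRAG_HASH_SHIFT)) & (FRAG_HASH_ENTRIES - 1);
}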
struct pkt_key {
	__be32 saddr;
	__be32 daddr;
	__be32 pkt_seq;
	u32 mark;
};

struct pkt_frag {
	struct sk_buff *skbs;
	unsigned long timestamp;
	struct list_head lru_node;
	struct pkt_key key;
};

struct stt_percpu {
	struct flex_array *frag_hash;
	struct list_head frag_lru;
	unsigned int frag_mem_used;

	/* Protect frags table. */
	spinlock_t lock;
};

struct first_frag {
	struct sk_buff *last_skb;
	unsigned int mem_used;
	u16 tot_len;
	u16 rcvd_len;
	bool set_ecn_ce;
};

struct frag_skb_cb {
	u16 offset;

	/* Only valid for the first skb in the chain. */
	struct first_frag first;
};

#define FRAG_CB(skb) ((struct frag_skb_cb *)(skb)->cb)
/* per-network namespace private data for this module */
struct stt_net {
	struct list_head stt_list;
	struct list_head stt_up_list;	/* Devices which are in IFF_UP state. */
	int n_tunnels;
#ifdef HAVE_NF_REGISTER_NET_HOOK
	bool nf_hook_reg_done;
#endif
};

static int stt_net_id;
static struct stt_percpu __percpu *stt_percpu_data __read_mostly;
static u32 frag_hash_seed __read_mostly;

/* Protects sock-hash and refcounts. */
static DEFINE_MUTEX(stt_mutex);

static int n_tunnels;
static DEFINE_PER_CPU(u32, pkt_seq_counter);

static void clean_percpu(struct work_struct *work);
static DECLARE_DELAYED_WORK(clean_percpu_wq, clean_percpu);
static struct stt_dev *stt_find_up_dev(struct net *net, __be16 port)
{
	struct stt_net *sn = net_generic(net, stt_net_id);
	struct stt_dev *stt_dev;

	list_for_each_entry_rcu(stt_dev, &sn->stt_up_list, up_next) {
		if (stt_dev->dst_port == port)
			return stt_dev;
	}
	return NULL;
}
static __be32 ack_seq(void)
{
#if NR_CPUS <= 65536
	u32 pkt_seq, ack;

	pkt_seq = this_cpu_read(pkt_seq_counter);
	ack = pkt_seq << ilog2(NR_CPUS) | smp_processor_id();
	this_cpu_inc(pkt_seq_counter);

	return (__force __be32)ack;
#else
#error "Support for greater than 64k CPUs not implemented"
#endif
}
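
/* Example (added note): with NR_CPUS = 64, the CPU id occupies the low
 * ilog2(64) = 6 bits, so CPU 3 produces 0x03, 0x43, 0x83, ... while CPU 5
 * produces 0x05, 0x45, ..., and the streams never collide.  The receiver
 * uses this value as part of the reassembly key (pkt_key.pkt_seq).
 */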
static int clear_gso(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int err;

	if (shinfo->gso_type == 0 && shinfo->gso_size == 0 &&
	    shinfo->gso_segs == 0)
		return 0;

	err = skb_unclone(skb, GFP_ATOMIC);
	if (unlikely(err))
		return err;

	shinfo = skb_shinfo(skb);
	shinfo->gso_type = 0;
	shinfo->gso_size = 0;
	shinfo->gso_segs = 0;
	return 0;
}
static void copy_skb_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->protocol = from->protocol;
	to->tstamp = from->tstamp;
	to->priority = from->priority;
	to->mark = from->mark;
	to->vlan_tci = from->vlan_tci;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
	to->vlan_proto = from->vlan_proto;
#endif
	skb_copy_secmark(to, from);
}
static void update_headers(struct sk_buff *skb, bool head,
			   unsigned int l4_offset, unsigned int hdr_len,
			   bool ipv4, u32 tcp_seq)
{
	u16 old_len, new_len;
	__be32 delta;
	struct tcphdr *tcph;
	int gso_size;

	if (ipv4) {
		struct iphdr *iph = (struct iphdr *)(skb->data + ETH_HLEN);

		old_len = ntohs(iph->tot_len);
		new_len = skb->len - ETH_HLEN;
		iph->tot_len = htons(new_len);

		ip_send_check(iph);
	} else {
		struct ipv6hdr *ip6h = (struct ipv6hdr *)(skb->data + ETH_HLEN);

		old_len = ntohs(ip6h->payload_len);
		new_len = skb->len - ETH_HLEN - sizeof(struct ipv6hdr);
		ip6h->payload_len = htons(new_len);
	}

	tcph = (struct tcphdr *)(skb->data + l4_offset);
	if (!head)
		tcph->seq = htonl(tcp_seq);

	delta = htonl(~old_len + new_len);
	tcph->check = ~csum_fold((__force __wsum)((__force u32)tcph->check +
				 (__force u32)delta));

	gso_size = skb_shinfo(skb)->gso_size;
	if (gso_size && skb->len - hdr_len <= gso_size)
		BUG_ON(clear_gso(skb));
}
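
/* Worked example (added note): the checksum fix-up above is the RFC 1624
 * incremental update HC' = ~(~HC + ~m + m') applied to the changed length
 * field only.  E.g. if tot_len goes from 1500 (0x05dc) to 700 (0x02bc),
 * delta = htonl(~0x05dc + 0x02bc), and folding it into the old checksum
 * gives the same result as recomputing the sum from scratch.
 */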
static bool can_segment(struct sk_buff *head, bool ipv4, bool tcp, bool csum_partial)
{
	/* If no offloading is in use then we don't have enough information
	 * to process the headers.
	 */
	if (!csum_partial)
		goto linearize;

	/* Handling UDP packets requires IP fragmentation, which means that
	 * the L4 checksum can no longer be calculated by hardware (since the
	 * fragments are in different packets).  If we have to compute the
	 * checksum it's faster just to linearize, and large UDP packets are
	 * pretty uncommon anyway, so it's not worth dealing with for now.
	 */
	if (!tcp)
		goto linearize;

	if (ipv4) {
		struct iphdr *iph = (struct iphdr *)(head->data + ETH_HLEN);

		/* It's difficult to get the IP IDs exactly right here due to
		 * varying segment sizes and potentially multiple layers of
		 * segmentation.  IP ID isn't important when DF is set and DF
		 * is generally set for TCP packets, so just linearize if it's
		 * not.
		 */
		if (!(iph->frag_off & htons(IP_DF)))
			goto linearize;
	} else {
		struct ipv6hdr *ip6h = (struct ipv6hdr *)(head->data + ETH_HLEN);

		/* Jumbograms require more processing to update and we'll
		 * probably never see them, so just linearize.
		 */
		if (ip6h->payload_len == 0)
			goto linearize;
	}
	return true;

linearize:
	return false;
}
static int copy_headers(struct sk_buff *head, struct sk_buff *frag,
			int hdr_len)
{
	u16 csum_start;

	if (skb_cloned(frag) || skb_headroom(frag) < hdr_len) {
		int extra_head = hdr_len - skb_headroom(frag);

		extra_head = extra_head > 0 ? extra_head : 0;
		if (unlikely(pskb_expand_head(frag, extra_head, 0,
					      GFP_ATOMIC)))
			return -ENOMEM;
	}

	memcpy(__skb_push(frag, hdr_len), head->data, hdr_len);

	csum_start = head->csum_start - skb_headroom(head);
	frag->csum_start = skb_headroom(frag) + csum_start;
	frag->csum_offset = head->csum_offset;
	frag->ip_summed = head->ip_summed;

	skb_shinfo(frag)->gso_size = skb_shinfo(head)->gso_size;
	skb_shinfo(frag)->gso_type = skb_shinfo(head)->gso_type;
	skb_shinfo(frag)->gso_segs = 0;

	copy_skb_metadata(frag, head);
	return 0;
}
static int skb_list_segment(struct sk_buff *head, bool ipv4, int l4_offset)
{
	struct sk_buff *skb;
	struct tcphdr *tcph;
	int seg_len;
	int hdr_len;
	int tcp_len;
	u32 seq;

	if (unlikely(!pskb_may_pull(head, l4_offset + sizeof(*tcph))))
		return -ENOMEM;

	tcph = (struct tcphdr *)(head->data + l4_offset);
	tcp_len = tcph->doff * 4;
	hdr_len = l4_offset + tcp_len;

	if (unlikely((tcp_len < sizeof(struct tcphdr)) ||
		     (head->len < hdr_len)))
		return -EINVAL;

	if (unlikely(!pskb_may_pull(head, hdr_len)))
		return -ENOMEM;

	tcph = (struct tcphdr *)(head->data + l4_offset);
	/* Update header of each segment. */
	seq = ntohl(tcph->seq);
	seg_len = skb_pagelen(head) - hdr_len;

	skb = skb_shinfo(head)->frag_list;
	skb_shinfo(head)->frag_list = NULL;

	for (; skb; skb = skb->next) {
		int err;

		head->len -= skb->len;
		head->data_len -= skb->len;
		head->truesize -= skb->truesize;

		seq += seg_len;
		seg_len = skb->len;

		err = copy_headers(head, skb, hdr_len);
		if (err)
			return err;
		update_headers(skb, false, l4_offset, hdr_len, ipv4, seq);
	}
	update_headers(head, true, l4_offset, hdr_len, ipv4, 0);
	return 0;
}
#ifndef SKIP_ZERO_COPY
static struct sk_buff *normalize_frag_list(struct sk_buff *head,
					   struct sk_buff **skbp)
{
	struct sk_buff *skb = *skbp;
	struct sk_buff *last;

	do {
		struct sk_buff *frags;

		if (skb_shared(skb)) {
			struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

			if (unlikely(!nskb))
				return ERR_PTR(-ENOMEM);

			nskb->next = skb->next;
			consume_skb(skb);
			skb = nskb;
			*skbp = skb;
		}

		if (head) {
			head->len -= skb->len;
			head->data_len -= skb->len;
			head->truesize -= skb->truesize;
		}

		frags = skb_shinfo(skb)->frag_list;
		if (frags) {
			int err;

			err = skb_unclone(skb, GFP_ATOMIC);
			if (unlikely(err))
				return ERR_PTR(err);

			last = normalize_frag_list(skb, &frags);
			if (IS_ERR(last))
				return last;

			skb_shinfo(skb)->frag_list = NULL;
			last->next = skb->next;
			skb->next = frags;
		} else {
			last = skb;
		}

		skbp = &skb->next;
	} while ((skb = skb->next));

	return last;
}
/* Takes a linked list of skbs, which potentially contain frag_list
 * (whose members in turn potentially contain frag_lists, etc.) and
 * converts them into a single linear linked list.
 */
static int straighten_frag_list(struct sk_buff **skbp)
{
	struct sk_buff *err_skb;

	err_skb = normalize_frag_list(NULL, skbp);
	if (IS_ERR(err_skb))
		return PTR_ERR(err_skb);

	return 0;
}
static int coalesce_skb(struct sk_buff **headp)
{
	struct sk_buff *frag, *head, *prev;
	int err;

	err = straighten_frag_list(headp);
	if (unlikely(err))
		return err;
	head = *headp;

	/* Coalesce frag list. */
	prev = head;
	for (frag = head->next; frag; frag = frag->next) {
		bool headstolen;
		int delta;

		if (unlikely(skb_unclone(prev, GFP_ATOMIC)))
			return -ENOMEM;

		if (!skb_try_coalesce(prev, frag, &headstolen, &delta)) {
			prev = frag;
			continue;
		}

		prev->next = frag->next;
		frag->truesize -= delta;
		kfree_skb_partial(frag, headstolen);
		frag = prev;
	}

	if (!head->next)
		return 0;

	for (frag = head->next; frag; frag = frag->next) {
		head->len += frag->len;
		head->data_len += frag->len;
		head->truesize += frag->truesize;
	}

	skb_shinfo(head)->frag_list = head->next;
	head->next = NULL;
	return 0;
}
#else
static int coalesce_skb(struct sk_buff **headp)
{
	struct sk_buff *frag, *head = *headp, *next;
	int delta = FRAG_CB(head)->first.tot_len - skb_headlen(head);
	int err;

	if (unlikely(!head->next))
		return 0;

	err = pskb_expand_head(head, 0, delta, GFP_ATOMIC);
	if (unlikely(err))
		return err;

	if (unlikely(!__pskb_pull_tail(head, head->data_len)))
		BUG();

	for (frag = head->next; frag; frag = next) {
		skb_copy_bits(frag, 0, skb_put(head, frag->len), frag->len);
		next = frag->next;
		kfree_skb(frag);
	}

	head->next = NULL;
	head->truesize = SKB_TRUESIZE(head->len);
	return 0;
}
#endif
static int __try_to_segment(struct sk_buff *skb, bool csum_partial,
			    bool ipv4, bool tcp, int l4_offset)
{
	if (can_segment(skb, ipv4, tcp, csum_partial))
		return skb_list_segment(skb, ipv4, l4_offset);
	else
		return skb_linearize(skb);
}
static int try_to_segment(struct sk_buff *skb)
{
#ifdef SKIP_ZERO_COPY
	/* Since coalesce_skb() does not generate a frag-list, there is no
	 * need to segment the skb here.
	 */
	return 0;
#else
	struct stthdr *stth = stt_hdr(skb);
	bool csum_partial = !!(stth->flags & STT_CSUM_PARTIAL);
	bool ipv4 = !!(stth->flags & STT_PROTO_IPV4);
	bool tcp = !!(stth->flags & STT_PROTO_TCP);
	int l4_offset = stth->l4_offset;

	return __try_to_segment(skb, csum_partial, ipv4, tcp, l4_offset);
#endif
}
static int segment_skb(struct sk_buff **headp, bool csum_partial,
		       bool ipv4, bool tcp, int l4_offset)
{
#ifndef SKIP_ZERO_COPY
	int err;

	err = coalesce_skb(headp);
	if (err)
		return err;
#endif
	if (skb_shinfo(*headp)->frag_list)
		return __try_to_segment(*headp, csum_partial,
					ipv4, tcp, l4_offset);
	return 0;
}
static int __push_stt_header(struct sk_buff *skb, __be64 tun_id,
			     __be16 s_port, __be16 d_port,
			     __be32 saddr, __be32 dst,
			     __be16 l3_proto, u8 l4_proto,
			     int dst_mtu)
{
	int data_len = skb->len + sizeof(struct stthdr) + STT_ETH_PAD;
	unsigned short encap_mss;
	struct tcphdr *tcph;
	struct stthdr *stth;

	skb_push(skb, STT_HEADER_LEN);
	skb_reset_transport_header(skb);
	tcph = tcp_hdr(skb);
	memset(tcph, 0, STT_HEADER_LEN);
	stth = stt_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		stth->flags |= STT_CSUM_PARTIAL;

		stth->l4_offset = skb->csum_start -
					(skb_headroom(skb) + STT_HEADER_LEN);

		if (l3_proto == htons(ETH_P_IP))
			stth->flags |= STT_PROTO_IPV4;

		if (l4_proto == IPPROTO_TCP)
			stth->flags |= STT_PROTO_TCP;

		stth->mss = htons(skb_shinfo(skb)->gso_size);
	} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
		stth->flags |= STT_CSUM_VERIFIED;
	}

	stth->vlan_tci = htons(skb->vlan_tci);
	skb->vlan_tci = 0;
	put_unaligned(tun_id, &stth->key);

	tcph->source	= s_port;
	tcph->dest	= d_port;
	tcph->doff	= sizeof(struct tcphdr) / 4;
	tcph->ack	= 1;
	tcph->psh	= 1;
	tcph->window	= htons(USHRT_MAX);
	tcph->seq	= htonl(data_len << STT_SEQ_LEN_SHIFT);
	tcph->ack_seq	= ack_seq();
	tcph->check	= ~tcp_v4_check(skb->len, saddr, dst, 0);

	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
	skb->ip_summed = CHECKSUM_PARTIAL;

	encap_mss = dst_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	if (data_len > encap_mss) {
		if (unlikely(skb_unclone(skb, GFP_ATOMIC)))
			return -EINVAL;

		skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
		skb_shinfo(skb)->gso_size = encap_mss;
		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(data_len, encap_mss);
	} else {
		if (unlikely(clear_gso(skb)))
			return -EINVAL;
	}
	return 0;
}
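
/* Example (added note): for a path MTU of 1500, encap_mss is
 * 1500 - 20 - 20 = 1460 bytes.  An 8958-byte encapsulated frame then
 * leaves as a GSO "TCP" stream of DIV_ROUND_UP(8958, 1460) = 7 on-wire
 * segments, each re-using the fake TCP header built above.
 */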
static struct sk_buff *push_stt_header(struct sk_buff *head, __be64 tun_id,
				       __be16 s_port, __be16 d_port,
				       __be32 saddr, __be32 dst,
				       __be16 l3_proto, u8 l4_proto,
				       int dst_mtu)
{
	struct sk_buff *skb;

	if (skb_shinfo(head)->frag_list) {
		bool ipv4 = (l3_proto == htons(ETH_P_IP));
		bool tcp = (l4_proto == IPPROTO_TCP);
		bool csum_partial = (head->ip_summed == CHECKSUM_PARTIAL);
		int l4_offset = skb_transport_offset(head);

		/* Need to call skb_orphan() to report the correct true-size.
		 * Calling skb_orphan() in this layer is odd, but an SKB with
		 * a frag-list should not be associated with any socket, so
		 * skb_orphan() should be a no-op. */
		skb_orphan(head);
		if (unlikely(segment_skb(&head, csum_partial,
					 ipv4, tcp, l4_offset)))
			goto error;
	}

	for (skb = head; skb; skb = skb->next) {
		if (__push_stt_header(skb, tun_id, s_port, d_port, saddr, dst,
				      l3_proto, l4_proto, dst_mtu))
			goto error;
	}

	return head;
error:
	kfree_skb_list(head);
	return NULL;
}
static int stt_can_offload(struct sk_buff *skb, __be16 l3_proto, u8 l4_proto)
{
	if (skb_is_gso(skb) && skb->ip_summed != CHECKSUM_PARTIAL) {
		int csum_offset;
		__sum16 *csum;
		int len;

		if (l4_proto == IPPROTO_TCP)
			csum_offset = offsetof(struct tcphdr, check);
		else if (l4_proto == IPPROTO_UDP)
			csum_offset = offsetof(struct udphdr, check);
		else
			return 0;

		len = skb->len - skb_transport_offset(skb);
		csum = (__sum16 *)(skb_transport_header(skb) + csum_offset);

		if (unlikely(!pskb_may_pull(skb, skb_transport_offset(skb) +
						 csum_offset + sizeof(*csum))))
			return -ENOMEM;

		if (l3_proto == htons(ETH_P_IP)) {
			struct iphdr *iph = ip_hdr(skb);

			*csum = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
						   len, l4_proto, 0);
		} else if (l3_proto == htons(ETH_P_IPV6)) {
			struct ipv6hdr *ip6h = ipv6_hdr(skb);

			*csum = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
						 len, l4_proto, 0);
		} else {
			return 0;
		}
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = csum_offset;
		skb->ip_summed = CHECKSUM_PARTIAL;
	}

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		/* Assume receiver can only offload TCP/UDP over IPv4/6,
		 * and require 802.1Q VLANs to be accelerated.
		 */
		if (l3_proto != htons(ETH_P_IP) &&
		    l3_proto != htons(ETH_P_IPV6))
			return 0;

		if (l4_proto != IPPROTO_TCP && l4_proto != IPPROTO_UDP)
			return 0;

		/* L4 offset must fit in a 1-byte field. */
		if (skb->csum_start - skb_headroom(skb) > 255)
			return 0;

		if (skb_shinfo(skb)->gso_type & ~SUPPORTED_GSO_TYPES)
			return 0;
	}
	/* Total size of encapsulated packet must fit in 16 bits. */
	if (skb->len + STT_HEADER_LEN + sizeof(struct iphdr) > 65535)
		return 0;

#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
	if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q))
		return 0;
#endif
	return 1;
}
static bool need_linearize(const struct sk_buff *skb)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	if (unlikely(shinfo->frag_list))
		return true;

	/* Generally speaking we should linearize if there are paged frags.
	 * However, if all of the refcounts are 1 we know nobody else can
	 * change them from underneath us and we can skip the linearization.
	 */
	for (i = 0; i < shinfo->nr_frags; i++)
		if (unlikely(page_count(skb_frag_page(&shinfo->frags[i])) > 1))
			return true;

	return false;
}
static struct sk_buff *handle_offloads(struct sk_buff *skb, int min_headroom)
{
	int err;

#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
	if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q)) {

		min_headroom += VLAN_HLEN;
		if (skb_headroom(skb) < min_headroom) {
			int head_delta = SKB_DATA_ALIGN(min_headroom -
							skb_headroom(skb) + 16);

			err = pskb_expand_head(skb, max_t(int, head_delta, 0),
					       0, GFP_ATOMIC);
			if (unlikely(err))
				goto error;
		}

		skb = __vlan_hwaccel_push_inside(skb);
		if (!skb) {
			err = -ENOMEM;
			goto error;
		}
	}
#endif

	if (skb_is_gso(skb)) {
		struct sk_buff *nskb;
		char cb[sizeof(skb->cb)];

		memcpy(cb, skb->cb, sizeof(cb));

		nskb = __skb_gso_segment(skb, 0, false);
		if (IS_ERR(nskb)) {
			err = PTR_ERR(nskb);
			goto error;
		}

		consume_skb(skb);
		skb = nskb;
		while (nskb) {
			memcpy(nskb->cb, cb, sizeof(cb));
			nskb = nskb->next;
		}
	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
		/* Pages aren't locked and could change at any time.
		 * If this happens after we compute the checksum, the
		 * checksum will be wrong.  We linearize now to avoid
		 * this problem.
		 */
		if (unlikely(need_linearize(skb))) {
			err = __skb_linearize(skb);
			if (unlikely(err))
				goto error;
		}

		err = skb_checksum_help(skb);
		if (unlikely(err))
			goto error;
	}
	skb->ip_summed = CHECKSUM_NONE;

	return skb;
error:
	kfree_skb(skb);
	return ERR_PTR(err);
}
static void skb_list_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src,
			  __be32 dst, __u8 tos, __u8 ttl, __be16 df)
{
	while (skb) {
		struct sk_buff *next = skb->next;

		if (next)
			dst_clone(&rt->dst);

		skb->next = NULL;
		iptunnel_xmit(NULL, rt, skb, src, dst, IPPROTO_TCP,
			      tos, ttl, df, false);

		skb = next;
	}
}
static u8 parse_ipv6_l4_proto(struct sk_buff *skb)
{
	unsigned int nh_ofs = skb_network_offset(skb);
	int payload_ofs;
	struct ipv6hdr *nh;
	uint8_t nexthdr;
	__be16 frag_off;

	if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct ipv6hdr))))
		return 0;

	nh = ipv6_hdr(skb);
	nexthdr = nh->nexthdr;
	payload_ofs = (u8 *)(nh + 1) - skb->data;

	payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off);
	if (unlikely(payload_ofs < 0))
		return 0;

	return nexthdr;
}
static u8 skb_get_l4_proto(struct sk_buff *skb, __be16 l3_proto)
{
	if (l3_proto == htons(ETH_P_IP)) {
		unsigned int nh_ofs = skb_network_offset(skb);

		if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct iphdr))))
			return 0;

		return ip_hdr(skb)->protocol;
	} else if (l3_proto == htons(ETH_P_IPV6)) {
		return parse_ipv6_l4_proto(skb);
	}
	return 0;
}
static int stt_xmit_skb(struct sk_buff *skb, struct rtable *rt,
			__be32 src, __be32 dst, __u8 tos,
			__u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
			__be64 tun_id)
{
	struct ethhdr *eh = eth_hdr(skb);
	int ret = 0, min_headroom;
	__be16 inner_l3_proto;
	u8 inner_l4_proto;

	inner_l3_proto = eh->h_proto;
	inner_l4_proto = skb_get_l4_proto(skb, inner_l3_proto);

	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
			+ STT_HEADER_LEN + sizeof(struct iphdr);

	if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
		int head_delta = SKB_DATA_ALIGN(min_headroom -
						skb_headroom(skb) + 16);

		ret = pskb_expand_head(skb, max_t(int, head_delta, 0),
				       0, GFP_ATOMIC);
		if (unlikely(ret))
			goto err_free_rt;
	}

	ret = stt_can_offload(skb, inner_l3_proto, inner_l4_proto);
	if (ret < 0)
		goto err_free_rt;
	if (!ret) {
		skb = handle_offloads(skb, min_headroom);
		if (IS_ERR(skb)) {
			ret = PTR_ERR(skb);
			skb = NULL;
			goto err_free_rt;
		}
	}

	ret = 0;
	while (skb) {
		struct sk_buff *next_skb = skb->next;

		skb->next = NULL;

		if (next_skb)
			dst_clone(&rt->dst);

		/* Push STT and TCP header. */
		skb = push_stt_header(skb, tun_id, src_port, dst_port, src,
				      dst, inner_l3_proto, inner_l4_proto,
				      dst_mtu(&rt->dst));
		if (unlikely(!skb)) {
			ip_rt_put(rt);
			goto next;
		}

		/* Push IP header. */
		skb_list_xmit(rt, skb, src, dst, tos, ttl, df);

next:
		skb = next_skb;
	}

	return 0;

err_free_rt:
	ip_rt_put(rt);
	kfree_skb(skb);
	return ret;
}
static struct rtable *stt_get_rt(struct sk_buff *skb,
				 struct net_device *dev,
				 struct flowi4 *fl,
				 const struct ip_tunnel_key *key)
{
	struct net *net = dev_net(dev);

	/* Route lookup */
	memset(fl, 0, sizeof(*fl));
	fl->daddr = key->u.ipv4.dst;
	fl->saddr = key->u.ipv4.src;
	fl->flowi4_tos = RT_TOS(key->tos);
	fl->flowi4_mark = skb->mark;
	fl->flowi4_proto = IPPROTO_TCP;

	return ip_route_output_key(net, fl);
}
netdev_tx_t ovs_stt_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct stt_dev *stt_dev = netdev_priv(dev);
	struct net *net = stt_dev->net;
	__be16 dport = stt_dev->dst_port;
	struct ip_tunnel_key *tun_key;
	struct ip_tunnel_info *tun_info;
	struct rtable *rt;
	struct flowi4 fl;
	__be16 sport;
	__be16 df;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info))
		goto error;

	tun_key = &tun_info->key;

	rt = stt_get_rt(skb, dev, &fl, tun_key);
	if (IS_ERR(rt))
		goto error;

	df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
	sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);

	stt_xmit_skb(skb, rt, fl.saddr, tun_key->u.ipv4.dst,
		     tun_key->tos, tun_key->ttl,
		     df, sport, dport, tun_key->tun_id);
	return NETDEV_TX_OK;
error:
	kfree_skb(skb);
	dev->stats.tx_errors++;
	return NETDEV_TX_OK;
}
EXPORT_SYMBOL(ovs_stt_xmit);
static void free_frag(struct stt_percpu *stt_percpu,
		      struct pkt_frag *frag)
{
	stt_percpu->frag_mem_used -= FRAG_CB(frag->skbs)->first.mem_used;
	kfree_skb_list(frag->skbs);
	list_del(&frag->lru_node);
	frag->skbs = NULL;
}
static void evict_frags(struct stt_percpu *stt_percpu)
{
	while (!list_empty(&stt_percpu->frag_lru) &&
	       stt_percpu->frag_mem_used > REASM_LO_THRESH) {
		struct pkt_frag *frag;

		frag = list_first_entry(&stt_percpu->frag_lru,
					struct pkt_frag, lru_node);
		free_frag(stt_percpu, frag);
	}
}
static bool pkt_key_match(struct net *net,
			  const struct pkt_frag *a, const struct pkt_key *b)
{
	return a->key.saddr == b->saddr && a->key.daddr == b->daddr &&
	       a->key.pkt_seq == b->pkt_seq && a->key.mark == b->mark &&
	       net_eq(dev_net(a->skbs->dev), net);
}
static u32 pkt_key_hash(const struct net *net, const struct pkt_key *key)
{
	u32 initval = frag_hash_seed ^ (u32)(unsigned long)net ^ key->mark;

	return jhash_3words((__force u32)key->saddr, (__force u32)key->daddr,
			    (__force u32)key->pkt_seq, initval);
}
static struct pkt_frag *lookup_frag(struct net *net,
				    struct stt_percpu *stt_percpu,
				    const struct pkt_key *key, u32 hash)
{
	struct pkt_frag *frag, *victim_frag = NULL;
	int i;

	for (i = 0; i < FRAG_HASH_SEGS; i++) {
		frag = flex_array_get(stt_percpu->frag_hash,
				      hash & (FRAG_HASH_ENTRIES - 1));
		if (frag->skbs &&
		    time_before(jiffies, frag->timestamp + FRAG_EXP_TIME) &&
		    pkt_key_match(net, frag, key))
			return frag;

		if (!victim_frag ||
		    (victim_frag->skbs &&
		     (!frag->skbs ||
		      time_before(frag->timestamp, victim_frag->timestamp))))
			victim_frag = frag;

		hash >>= FRAG_HASH_SHIFT;
	}

	if (victim_frag->skbs)
		free_frag(stt_percpu, victim_frag);

	return victim_frag;
}
#ifdef SKIP_ZERO_COPY
static int __copy_skb(struct sk_buff *to, struct sk_buff *from,
		      int *delta, bool *headstolen)
{
	int err;

	if (unlikely(to->next))
		return -EINVAL;

	if (unlikely(FRAG_CB(to)->offset))
		return -EINVAL;

	if (unlikely(skb_unclone(to, GFP_ATOMIC)))
		return -ENOMEM;

	if (skb_try_coalesce(to, from, headstolen, delta))
		return 0;

	*headstolen = false;
	err = pskb_expand_head(to, 0, to->data_len + from->len, GFP_ATOMIC);
	if (unlikely(err))
		return err;

	if (unlikely(!__pskb_pull_tail(to, to->data_len)))
		BUG();

	skb_copy_bits(from, 0, skb_put(to, from->len), from->len);

	*delta = from->len;
	to->truesize += from->len;
	return 0;
}
#else
static int __copy_skb(struct sk_buff *to, struct sk_buff *from,
		      int *delta, bool *headstolen)
{
	*headstolen = false;
	return -EINVAL;
}
#endif
static struct sk_buff *reassemble(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *tcph = tcp_hdr(skb);
	u32 seq = ntohl(tcph->seq);
	struct stt_percpu *stt_percpu;
	struct sk_buff *last_skb, *copied_skb = NULL;
	struct pkt_frag *frag;
	struct pkt_key key;
	int tot_len, delta = skb->truesize;
	bool headstolen;
	u32 hash;

	tot_len = seq >> STT_SEQ_LEN_SHIFT;
	FRAG_CB(skb)->offset = seq & STT_SEQ_OFFSET_MASK;

	if (unlikely(skb->len == 0))
		goto out_free;

	if (unlikely(FRAG_CB(skb)->offset + skb->len > tot_len))
		goto out_free;

	if (tot_len == skb->len)
		goto out;

	key.saddr = iph->saddr;
	key.daddr = iph->daddr;
	key.pkt_seq = tcph->ack_seq;
	key.mark = skb->mark;
	hash = pkt_key_hash(dev_net(skb->dev), &key);

	stt_percpu = per_cpu_ptr(stt_percpu_data, smp_processor_id());

	spin_lock(&stt_percpu->lock);

	if (unlikely(stt_percpu->frag_mem_used + skb->truesize > REASM_HI_THRESH))
		evict_frags(stt_percpu);

	frag = lookup_frag(dev_net(skb->dev), stt_percpu, &key, hash);
	if (!frag->skbs) {
		frag->skbs = skb;
		frag->key = key;
		frag->timestamp = jiffies;
		FRAG_CB(skb)->first.last_skb = skb;
		FRAG_CB(skb)->first.mem_used = skb->truesize;
		FRAG_CB(skb)->first.tot_len = tot_len;
		FRAG_CB(skb)->first.rcvd_len = skb->len;
		FRAG_CB(skb)->first.set_ecn_ce = false;
		list_add_tail(&frag->lru_node, &stt_percpu->frag_lru);
		stt_percpu->frag_mem_used += skb->truesize;
		skb = NULL;
		goto unlock;
	}

	/* Optimize for the common case where fragments are received in-order
	 * and not overlapping.
	 */
	last_skb = FRAG_CB(frag->skbs)->first.last_skb;
	if (likely(FRAG_CB(last_skb)->offset + last_skb->len ==
		   FRAG_CB(skb)->offset)) {
		if (!__copy_skb(frag->skbs, skb, &delta, &headstolen)) {
			copied_skb = skb;
		} else {
			last_skb->next = skb;
			FRAG_CB(frag->skbs)->first.last_skb = skb;
		}
	} else {
		struct sk_buff *prev = NULL, *next;

		for (next = frag->skbs; next; next = next->next) {
			if (FRAG_CB(next)->offset >= FRAG_CB(skb)->offset)
				break;
			prev = next;
		}

		/* Overlapping fragments aren't allowed.  We shouldn't start
		 * before the end of the previous fragment.
		 */
		if (prev &&
		    FRAG_CB(prev)->offset + prev->len > FRAG_CB(skb)->offset)
			goto unlock_free;

		/* We also shouldn't end after the beginning of the next
		 * fragment.
		 */
		if (next &&
		    FRAG_CB(skb)->offset + skb->len > FRAG_CB(next)->offset)
			goto unlock_free;

		if (prev) {
			prev->next = skb;
		} else {
			FRAG_CB(skb)->first = FRAG_CB(frag->skbs)->first;
			frag->skbs = skb;
		}

		if (next)
			skb->next = next;
		else
			FRAG_CB(frag->skbs)->first.last_skb = skb;
	}

	FRAG_CB(frag->skbs)->first.set_ecn_ce |= INET_ECN_is_ce(iph->tos);
	FRAG_CB(frag->skbs)->first.rcvd_len += skb->len;
	stt_percpu->frag_mem_used += delta;
	FRAG_CB(frag->skbs)->first.mem_used += delta;

	if (FRAG_CB(frag->skbs)->first.tot_len ==
	    FRAG_CB(frag->skbs)->first.rcvd_len) {
		struct sk_buff *frag_head = frag->skbs;

		frag_head->tstamp = skb->tstamp;
		if (FRAG_CB(frag_head)->first.set_ecn_ce)
			INET_ECN_set_ce(frag_head);

		list_del(&frag->lru_node);
		stt_percpu->frag_mem_used -= FRAG_CB(frag_head)->first.mem_used;
		frag->skbs = NULL;
		skb = frag_head;
	} else {
		list_move_tail(&frag->lru_node, &stt_percpu->frag_lru);
		skb = NULL;
	}

	if (copied_skb)
		kfree_skb_partial(copied_skb, headstolen);
	goto unlock;

unlock_free:
	kfree_skb(skb);
	skb = NULL;
unlock:
	spin_unlock(&stt_percpu->lock);
	return skb;
out_free:
	kfree_skb(skb);
	skb = NULL;
out:
	return skb;
}
static bool validate_checksum(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	if (skb_csum_unnecessary(skb))
		return true;

	if (skb->ip_summed == CHECKSUM_COMPLETE &&
	    !tcp_v4_check(skb->len, iph->saddr, iph->daddr, skb->csum))
		return true;

	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, skb->len,
				       IPPROTO_TCP, 0);

	return __tcp_checksum_complete(skb) == 0;
}
static bool set_offloads(struct sk_buff *skb)
{
	struct stthdr *stth = stt_hdr(skb);
	unsigned short gso_type = 0;
	int l3_header_size;
	int l4_header_size;
	u16 csum_offset;
	u8 proto_type;

	if (stth->vlan_tci)
		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
				       ntohs(stth->vlan_tci));

	if (!(stth->flags & STT_CSUM_PARTIAL)) {
		if (stth->flags & STT_CSUM_VERIFIED)
			skb->ip_summed = CHECKSUM_UNNECESSARY;
		else
			skb->ip_summed = CHECKSUM_NONE;

		return clear_gso(skb) == 0;
	}

	proto_type = stth->flags & STT_PROTO_TYPES;

	switch (proto_type) {
	case (STT_PROTO_IPV4 | STT_PROTO_TCP):
		/* TCP/IPv4 */
		csum_offset = offsetof(struct tcphdr, check);
		gso_type = SKB_GSO_TCPV4;
		l3_header_size = sizeof(struct iphdr);
		l4_header_size = sizeof(struct tcphdr);
		skb->protocol = htons(ETH_P_IP);
		break;
	case STT_PROTO_TCP:
		/* TCP/IPv6 */
		csum_offset = offsetof(struct tcphdr, check);
		gso_type = SKB_GSO_TCPV6;
		l3_header_size = sizeof(struct ipv6hdr);
		l4_header_size = sizeof(struct tcphdr);
		skb->protocol = htons(ETH_P_IPV6);
		break;
	case STT_PROTO_IPV4:
		/* UDP/IPv4 */
		csum_offset = offsetof(struct udphdr, check);
		gso_type = SKB_GSO_UDP;
		l3_header_size = sizeof(struct iphdr);
		l4_header_size = sizeof(struct udphdr);
		skb->protocol = htons(ETH_P_IP);
		break;
	default:
		/* UDP/IPv6 */
		csum_offset = offsetof(struct udphdr, check);
		gso_type = SKB_GSO_UDP;
		l3_header_size = sizeof(struct ipv6hdr);
		l4_header_size = sizeof(struct udphdr);
		skb->protocol = htons(ETH_P_IPV6);
	}

	if (unlikely(stth->l4_offset < ETH_HLEN + l3_header_size))
		return false;

	if (unlikely(!pskb_may_pull(skb, stth->l4_offset + l4_header_size)))
		return false;

	stth = stt_hdr(skb);

	skb->csum_start = skb_headroom(skb) + stth->l4_offset;
	skb->csum_offset = csum_offset;
	skb->ip_summed = CHECKSUM_PARTIAL;

	if (stth->mss) {
		if (unlikely(skb_unclone(skb, GFP_ATOMIC)))
			return false;

		skb_shinfo(skb)->gso_type = gso_type | SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_size = ntohs(stth->mss);
		skb_shinfo(skb)->gso_segs = 0;
	} else {
		if (unlikely(clear_gso(skb)))
			return false;
	}

	return true;
}
*dev
, struct sk_buff
*skb
,
1396 struct metadata_dst
*tun_dst
)
1398 struct sk_buff
*next
;
1404 ovs_dst_hold((struct dst_entry
*)tun_dst
);
1405 ovs_skb_dst_set(next
, (struct dst_entry
*)tun_dst
);
1407 ovs_ip_tunnel_rcv(dev
, skb
, tun_dst
);
1408 } while ((skb
= next
));
#ifndef USE_UPSTREAM_TUNNEL
static int __stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
{
	struct metadata_dst tun_dst;

	ovs_ip_tun_rx_dst(&tun_dst, skb, TUNNEL_KEY | TUNNEL_CSUM,
			  get_unaligned(&stt_hdr(skb)->key), 0);
	tun_dst.u.tun_info.key.tp_src = tcp_hdr(skb)->source;
	tun_dst.u.tun_info.key.tp_dst = tcp_hdr(skb)->dest;

	rcv_list(stt_dev->dev, skb, &tun_dst);
	return 0;
}
#else
static int __stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
{
	struct metadata_dst *tun_dst;
	__be16 flags;
	__be64 tun_id;

	flags = TUNNEL_KEY | TUNNEL_CSUM;
	tun_id = get_unaligned(&stt_hdr(skb)->key);
	tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
	if (!tun_dst)
		return -ENOMEM;
	tun_dst->u.tun_info.key.tp_src = tcp_hdr(skb)->source;
	tun_dst->u.tun_info.key.tp_dst = tcp_hdr(skb)->dest;

	rcv_list(stt_dev->dev, skb, tun_dst);
	return 0;
}
#endif
static void stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
{
	int err;

	if (unlikely(!validate_checksum(skb)))
		goto drop;

	__skb_pull(skb, sizeof(struct tcphdr));
	skb = reassemble(skb);
	if (!skb)
		return;

	if (skb->next && coalesce_skb(&skb))
		goto drop;

	err = iptunnel_pull_header(skb,
				   sizeof(struct stthdr) + STT_ETH_PAD,
				   htons(ETH_P_TEB),
				   !net_eq(stt_dev->net, dev_net(stt_dev->dev)));
	if (unlikely(err))
		goto drop;

	if (unlikely(stt_hdr(skb)->version != 0))
		goto drop;

	if (unlikely(!set_offloads(skb)))
		goto drop;

	if (skb_shinfo(skb)->frag_list && try_to_segment(skb))
		goto drop;

	err = __stt_rcv(stt_dev, skb);
	if (err)
		goto drop;
	return;
drop:
	/* Consume bad packet */
	kfree_skb_list(skb);
	stt_dev->dev->stats.rx_errors++;
}
static void tcp_sock_release(struct socket *sock)
{
	kernel_sock_shutdown(sock, SHUT_RDWR);
	sock_release(sock);
}
static int tcp_sock_create4(struct net *net, __be16 port,
			    struct socket **sockp)
{
	struct sockaddr_in tcp_addr;
	struct socket *sock = NULL;
	int err;

	err = sock_create_kern(net, AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
	if (err < 0)
		goto error;

	memset(&tcp_addr, 0, sizeof(tcp_addr));
	tcp_addr.sin_family = AF_INET;
	tcp_addr.sin_addr.s_addr = htonl(INADDR_ANY);
	tcp_addr.sin_port = port;
	err = kernel_bind(sock, (struct sockaddr *)&tcp_addr,
			  sizeof(tcp_addr));
	if (err < 0)
		goto error;

	*sockp = sock;
	return 0;

error:
	if (sock)
		tcp_sock_release(sock);
	*sockp = NULL;
	return err;
}
static void schedule_clean_percpu(void)
{
	schedule_delayed_work(&clean_percpu_wq, CLEAN_PERCPU_INTERVAL);
}
static void clean_percpu(struct work_struct *work)
{
	int i;

	for_each_possible_cpu(i) {
		struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
		int j;

		for (j = 0; j < FRAG_HASH_ENTRIES; j++) {
			struct pkt_frag *frag;

			frag = flex_array_get(stt_percpu->frag_hash, j);
			if (!frag->skbs ||
			    time_before(jiffies, frag->timestamp + FRAG_EXP_TIME))
				continue;

			spin_lock_bh(&stt_percpu->lock);

			if (frag->skbs &&
			    time_after(jiffies, frag->timestamp + FRAG_EXP_TIME))
				free_frag(stt_percpu, frag);

			spin_unlock_bh(&stt_percpu->lock);
		}
	}
	schedule_clean_percpu();
}
#ifdef HAVE_NF_HOOKFN_ARG_OPS
#define FIRST_PARAM const struct nf_hook_ops *ops
#else
#ifdef HAVE_NF_HOOKFN_ARG_PRIV
#define FIRST_PARAM void *priv
#else
#define FIRST_PARAM unsigned int hooknum
#endif
#endif

#ifdef HAVE_NF_HOOK_STATE
#if RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,0)
/* RHEL nfhook hacks. */
#ifndef __GENKSYMS__
#define LAST_PARAM const struct net_device *in, const struct net_device *out, \
		   const struct nf_hook_state *state
#else
#define LAST_PARAM const struct net_device *in, const struct net_device *out, \
		   int (*okfn)(struct sk_buff *)
#endif
#else
#define LAST_PARAM const struct nf_hook_state *state
#endif
#else
#define LAST_PARAM const struct net_device *in, const struct net_device *out, \
		   int (*okfn)(struct sk_buff *)
#endif
static unsigned int nf_ip_hook(FIRST_PARAM, struct sk_buff *skb, LAST_PARAM)
{
	struct stt_dev *stt_dev;
	int ip_hdr_len;

	if (ip_hdr(skb)->protocol != IPPROTO_TCP)
		return NF_ACCEPT;

	ip_hdr_len = ip_hdrlen(skb);
	if (unlikely(!pskb_may_pull(skb, ip_hdr_len + sizeof(struct tcphdr))))
		return NF_ACCEPT;

	skb_set_transport_header(skb, ip_hdr_len);

	stt_dev = stt_find_up_dev(dev_net(skb->dev), tcp_hdr(skb)->dest);
	if (!stt_dev)
		return NF_ACCEPT;

	__skb_pull(skb, ip_hdr_len);
	stt_rcv(stt_dev, skb);
	return NF_STOLEN;
}
static struct nf_hook_ops nf_hook_ops __read_mostly = {
	.hook		= nf_ip_hook,
#ifdef HAVE_NF_HOOKS_OPS_OWNER
	.owner		= THIS_MODULE,
#endif
	.pf		= NFPROTO_IPV4,
	.hooknum	= NF_INET_LOCAL_IN,
	.priority	= INT_MAX,
};
static int stt_start(struct net *net)
{
	struct stt_net *sn = net_generic(net, stt_net_id);
	int err;
	int i;

	if (n_tunnels)
		goto skip_init;

	get_random_bytes(&frag_hash_seed, sizeof(u32));

	stt_percpu_data = alloc_percpu(struct stt_percpu);
	if (!stt_percpu_data) {
		err = -ENOMEM;
		goto error;
	}

	for_each_possible_cpu(i) {
		struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
		struct flex_array *frag_hash;

		spin_lock_init(&stt_percpu->lock);
		INIT_LIST_HEAD(&stt_percpu->frag_lru);
		get_random_bytes(&per_cpu(pkt_seq_counter, i), sizeof(u32));

		frag_hash = flex_array_alloc(sizeof(struct pkt_frag),
					     FRAG_HASH_ENTRIES,
					     GFP_KERNEL | __GFP_ZERO);
		if (!frag_hash) {
			err = -ENOMEM;
			goto free_percpu;
		}
		stt_percpu->frag_hash = frag_hash;

		err = flex_array_prealloc(stt_percpu->frag_hash, 0,
					  FRAG_HASH_ENTRIES,
					  GFP_KERNEL | __GFP_ZERO);
		if (err)
			goto free_percpu;
	}
	schedule_clean_percpu();

skip_init:
	n_tunnels++;

	if (sn->n_tunnels) {
		sn->n_tunnels++;
		return 0;
	}

#ifdef HAVE_NF_REGISTER_NET_HOOK
	/* On kernels that support per-net nf-hooks, nf_register_hook()
	 * takes the rtnl lock, which results in a deadlock while creating
	 * an stt device.  Therefore use the per-net registration API.
	 */
	if (sn->nf_hook_reg_done)
		goto out;

	err = nf_register_net_hook(net, &nf_hook_ops);
	if (err)
		goto dec_n_tunnel;
	sn->nf_hook_reg_done = true;
#else
	/* Register STT only on the very first STT device addition. */
	if (!list_empty(&nf_hook_ops.list))
		goto out;

	err = nf_register_hook(&nf_hook_ops);
	if (err)
		goto dec_n_tunnel;
#endif
out:
	sn->n_tunnels++;
	return 0;

dec_n_tunnel:
	n_tunnels--;
	if (n_tunnels)
		goto error;
free_percpu:
	for_each_possible_cpu(i) {
		struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);

		if (stt_percpu->frag_hash)
			flex_array_free(stt_percpu->frag_hash);
	}
	free_percpu(stt_percpu_data);
error:
	return err;
}
static void stt_cleanup(struct net *net)
{
	struct stt_net *sn = net_generic(net, stt_net_id);
	int i;

	sn->n_tunnels--;
	n_tunnels--;
	if (n_tunnels)
		return;

	cancel_delayed_work_sync(&clean_percpu_wq);
	for_each_possible_cpu(i) {
		struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
		int j;

		for (j = 0; j < FRAG_HASH_ENTRIES; j++) {
			struct pkt_frag *frag;

			frag = flex_array_get(stt_percpu->frag_hash, j);
			kfree_skb_list(frag->skbs);
			frag->skbs = NULL;
		}

		flex_array_free(stt_percpu->frag_hash);
	}

	free_percpu(stt_percpu_data);
}
static netdev_tx_t stt_dev_xmit(struct sk_buff *skb, struct net_device *dev)
{
#ifdef USE_UPSTREAM_TUNNEL
	return ovs_stt_xmit(skb);
#else
	/* Drop all packets coming from the networking stack: the OVS CB is
	 * not initialized for these packets.
	 */
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
#endif
}
/* Setup stats when device is created */
static int stt_init(struct net_device *dev)
{
	dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static void stt_uninit(struct net_device *dev)
{
	free_percpu(dev->tstats);
}
static int stt_open(struct net_device *dev)
{
	struct stt_dev *stt = netdev_priv(dev);
	struct net *net = stt->net;
	struct stt_net *sn = net_generic(net, stt_net_id);
	int err;

	err = stt_start(net);
	if (err)
		return err;

	err = tcp_sock_create4(net, stt->dst_port, &stt->sock);
	if (err)
		return err;
	list_add_rcu(&stt->up_next, &sn->stt_up_list);
	return 0;
}
static int stt_stop(struct net_device *dev)
{
	struct stt_dev *stt_dev = netdev_priv(dev);
	struct net *net = stt_dev->net;

	list_del_rcu(&stt_dev->up_next);
	synchronize_net();
	tcp_sock_release(stt_dev->sock);
	stt_dev->sock = NULL;
	stt_cleanup(net);
	return 0;
}
static int __stt_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	int max_mtu = IP_MAX_MTU - STT_HEADER_LEN - sizeof(struct iphdr)
		      - dev->hard_header_len;

	if (new_mtu < 68)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}

static int stt_change_mtu(struct net_device *dev, int new_mtu)
{
	return __stt_change_mtu(dev, new_mtu, true);
}
int ovs_stt_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	struct stt_dev *stt_dev = netdev_priv(dev);
	struct net *net = stt_dev->net;
	__be16 dport = stt_dev->dst_port;
	struct flowi4 fl4;
	struct rtable *rt;

	if (ip_tunnel_info_af(info) != AF_INET)
		return -EINVAL;

	rt = stt_get_rt(skb, dev, &fl4, &info->key);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	ip_rt_put(rt);

	info->key.u.ipv4.src = fl4.saddr;
	info->key.tp_src = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
	info->key.tp_dst = dport;
	return 0;
}
EXPORT_SYMBOL_GPL(ovs_stt_fill_metadata_dst);
static const struct net_device_ops stt_netdev_ops = {
	.ndo_init		= stt_init,
	.ndo_uninit		= stt_uninit,
	.ndo_open		= stt_open,
	.ndo_stop		= stt_stop,
	.ndo_start_xmit		= stt_dev_xmit,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_change_mtu		= stt_change_mtu,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_set_mac_address	= eth_mac_addr,
#ifdef USE_UPSTREAM_TUNNEL
#ifdef HAVE_NDO_FILL_METADATA_DST
	.ndo_fill_metadata_dst	= stt_fill_metadata_dst,
#endif
#endif
};
static void stt_get_drvinfo(struct net_device *dev,
			    struct ethtool_drvinfo *drvinfo)
{
	strlcpy(drvinfo->version, STT_NETDEV_VER, sizeof(drvinfo->version));
	strlcpy(drvinfo->driver, "stt", sizeof(drvinfo->driver));
}
static const struct ethtool_ops stt_ethtool_ops = {
	.get_drvinfo	= stt_get_drvinfo,
	.get_link	= ethtool_op_get_link,
};

/* Info for udev, that this is a virtual tunnel endpoint */
static struct device_type stt_type = {
	.name = "stt",
};
/* Initialize the device structure. */
static void stt_setup(struct net_device *dev)
{
	ether_setup(dev);

	dev->netdev_ops = &stt_netdev_ops;
	dev->ethtool_ops = &stt_ethtool_ops;
	dev->destructor = free_netdev;

	SET_NETDEV_DEVTYPE(dev, &stt_type);

	dev->features    |= NETIF_F_LLTX | NETIF_F_NETNS_LOCAL;
	dev->features    |= NETIF_F_SG | NETIF_F_HW_CSUM;
	dev->features    |= NETIF_F_RXCSUM;
	dev->features    |= NETIF_F_GSO_SOFTWARE;

	dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
	dev->hw_features |= NETIF_F_GSO_SOFTWARE;

#ifdef USE_UPSTREAM_TUNNEL
	netif_keep_dst(dev);
#endif
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
	eth_hw_addr_random(dev);
}
static const struct nla_policy stt_policy[IFLA_STT_MAX + 1] = {
	[IFLA_STT_PORT]	= { .type = NLA_U16 },
};
static int stt_validate(struct nlattr *tb[], struct nlattr *data[])
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;

		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	return 0;
}
static struct stt_dev *find_dev(struct net *net, __be16 dst_port)
{
	struct stt_net *sn = net_generic(net, stt_net_id);
	struct stt_dev *dev;

	list_for_each_entry(dev, &sn->stt_list, next) {
		if (dev->dst_port == dst_port)
			return dev;
	}
	return NULL;
}
*net
, struct net_device
*dev
,
1936 struct stt_net
*sn
= net_generic(net
, stt_net_id
);
1937 struct stt_dev
*stt
= netdev_priv(dev
);
1943 stt
->dst_port
= dst_port
;
1945 if (find_dev(net
, dst_port
))
1948 err
= __stt_change_mtu(dev
, IP_MAX_MTU
, false);
1952 err
= register_netdevice(dev
);
1956 list_add(&stt
->next
, &sn
->stt_list
);
static int stt_newlink(struct net *net, struct net_device *dev,
		       struct nlattr *tb[], struct nlattr *data[])
{
	__be16 dst_port = htons(STT_DST_PORT);

	if (data[IFLA_STT_PORT])
		dst_port = nla_get_be16(data[IFLA_STT_PORT]);

	return stt_configure(net, dev, dst_port);
}
static void stt_dellink(struct net_device *dev, struct list_head *head)
{
	struct stt_dev *stt = netdev_priv(dev);

	list_del(&stt->next);
	unregister_netdevice_queue(dev, head);
}
static size_t stt_get_size(const struct net_device *dev)
{
	return nla_total_size(sizeof(__be32));  /* IFLA_STT_PORT */
}
static int stt_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct stt_dev *stt = netdev_priv(dev);

	if (nla_put_be16(skb, IFLA_STT_PORT, stt->dst_port))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
static struct rtnl_link_ops stt_link_ops __read_mostly = {
	.kind		= "stt",
	.maxtype	= IFLA_STT_MAX,
	.policy		= stt_policy,
	.priv_size	= sizeof(struct stt_dev),
	.setup		= stt_setup,
	.validate	= stt_validate,
	.newlink	= stt_newlink,
	.dellink	= stt_dellink,
	.get_size	= stt_get_size,
	.fill_info	= stt_fill_info,
};
struct net_device *ovs_stt_dev_create_fb(struct net *net, const char *name,
					 u8 name_assign_type, u16 dst_port)
{
	struct nlattr *tb[IFLA_MAX + 1];
	struct net_device *dev;
	int err;

	memset(tb, 0, sizeof(tb));
	dev = rtnl_create_link(net, (char *) name, name_assign_type,
			       &stt_link_ops, tb);
	if (IS_ERR(dev))
		return dev;

	err = stt_configure(net, dev, htons(dst_port));
	if (err) {
		free_netdev(dev);
		return ERR_PTR(err);
	}
	return dev;
}
EXPORT_SYMBOL_GPL(ovs_stt_dev_create_fb);
static int stt_init_net(struct net *net)
{
	struct stt_net *sn = net_generic(net, stt_net_id);

	INIT_LIST_HEAD(&sn->stt_list);
	INIT_LIST_HEAD(&sn->stt_up_list);
#ifdef HAVE_NF_REGISTER_NET_HOOK
	sn->nf_hook_reg_done = false;
#endif
	return 0;
}
static void stt_exit_net(struct net *net)
{
	struct stt_net *sn = net_generic(net, stt_net_id);
	struct stt_dev *stt, *next;
	struct net_device *dev, *aux;
	LIST_HEAD(list);

#ifdef HAVE_NF_REGISTER_NET_HOOK
	/* Ideally this should be done from stt_stop(), but on some kernels
	 * the nf-unreg operation needs the RTNL lock, which can cause a
	 * deadlock.  So it is done from here instead. */
	if (sn->nf_hook_reg_done)
		nf_unregister_net_hook(net, &nf_hook_ops);
#endif

	rtnl_lock();

	/* gather any stt devices that were moved into this ns */
	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == &stt_link_ops)
			unregister_netdevice_queue(dev, &list);

	list_for_each_entry_safe(stt, next, &sn->stt_list, next) {
		/* If stt->dev is in the same netns, it was already added
		 * to the list by the loop above.
		 */
		if (!net_eq(dev_net(stt->dev), net))
			unregister_netdevice_queue(stt->dev, &list);
	}

	/* unregister the devices gathered above */
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
static struct pernet_operations stt_net_ops = {
	.init = stt_init_net,
	.exit = stt_exit_net,
	.id   = &stt_net_id,
	.size = sizeof(struct stt_net),
};
int stt_init_module(void)
{
	int rc;

	rc = register_pernet_subsys(&stt_net_ops);
	if (rc)
		goto out1;

	rc = rtnl_link_register(&stt_link_ops);
	if (rc)
		goto out2;

	INIT_LIST_HEAD(&nf_hook_ops.list);
	pr_info("STT tunneling driver\n");
	return 0;
out2:
	unregister_pernet_subsys(&stt_net_ops);
out1:
	return rc;
}
void stt_cleanup_module(void)
{
#ifndef HAVE_NF_REGISTER_NET_HOOK
	if (!list_empty(&nf_hook_ops.list))
		nf_unregister_hook(&nf_hook_ops);
#endif
	rtnl_link_unregister(&stt_link_ops);
	unregister_pernet_subsys(&stt_net_ops);
}