/*
 * Stateless TCP Tunnel (STT) vport.
 *
 * Copyright (c) 2015 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>

#include <linux/delay.h>
#include <linux/if_vlan.h>
#include <linux/ipv6.h>
#include <linux/jhash.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/module.h>
#include <linux/net.h>
#include <linux/netfilter.h>
#include <linux/percpu.h>
#include <linux/skbuff.h>
#include <linux/tcp.h>
#include <linux/workqueue.h>

#include <net/dst_metadata.h>
#include <net/inet_ecn.h>
#include <net/ip_tunnels.h>
#include <net/ip6_checksum.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

#define STT_NETDEV_VER	"0.1"
#define STT_DST_PORT	7471

#ifdef CONFIG_SLUB
/*
 * We saw better performance when skipping zero copy with SLUB,
 * so skip zero copy in that case.
 */
#define SKIP_ZERO_COPY
#endif
/* @dev: Backing net_device for this STT port.
 * @next: Per-net list of STT ports.
 * @up_next: Per-net list of ports that are in the IFF_UP state.
 * @rcv: Callback invoked on STT packet receipt.  STT reassembly can generate
 * multiple packets; in that case the first packet carries the tunnel outer
 * header and the rest are inner packet segments with no STT header.
 * @rcv_data: User data for the callback.
 * @sock: Fake TCP socket for the STT port.
 */
struct stt_dev {
	struct net_device	*dev;
	struct net		*net;
	struct socket		*sock;
	__be16			dst_port;
	struct list_head	next;
	struct list_head	up_next;
};

#define STT_CSUM_VERIFIED	BIT(0)
#define STT_CSUM_PARTIAL	BIT(1)
#define STT_PROTO_IPV4		BIT(2)
#define STT_PROTO_TCP		BIT(3)
#define STT_PROTO_TYPES		(STT_PROTO_IPV4 | STT_PROTO_TCP)

#ifdef HAVE_SKB_GSO_UDP
#define SUPPORTED_GSO_TYPES (SKB_GSO_TCPV4 | SKB_GSO_UDP | SKB_GSO_DODGY | \
			     SKB_GSO_TCPV6)
#else
#define SUPPORTED_GSO_TYPES (SKB_GSO_TCPV4 | SKB_GSO_DODGY | \
			     SKB_GSO_TCPV6)
#endif
/* The length and offset of a fragment are encoded in the sequence number.
 * STT_SEQ_LEN_SHIFT is the left shift needed to store the length.
 * STT_SEQ_OFFSET_MASK is the mask to extract the offset.
 */
#define STT_SEQ_LEN_SHIFT	16
#define STT_SEQ_OFFSET_MASK	(BIT(STT_SEQ_LEN_SHIFT) - 1)
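/* Illustrative example (added; values are hypothetical): a 1400-byte packet
 * sent as a single fragment at offset 0 goes out with
 * seq = (1400 << STT_SEQ_LEN_SHIFT) | 0, and the receiver recovers
 * tot_len = seq >> STT_SEQ_LEN_SHIFT and offset = seq & STT_SEQ_OFFSET_MASK,
 * exactly as done in push_stt_header() and reassemble() below.
 */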
/* The maximum amount of memory used to store packets waiting to be reassembled
 * on a given CPU.  Once this threshold is exceeded we will begin freeing the
 * least recently used fragments.
 */
#define REASM_HI_THRESH (4 * 1024 * 1024)
/* The target for the high memory evictor.  Once we have exceeded
 * REASM_HI_THRESH, we will continue freeing fragments until we hit
 * this limit.
 */
#define REASM_LO_THRESH (3 * 1024 * 1024)
/* The length of time a given packet has to be reassembled from the time the
 * first fragment arrives.  Once this limit is exceeded it becomes available
 * for cleaning.
 */
#define FRAG_EXP_TIME (30 * HZ)
/* Number of hash entries.  Each entry has only a single slot to hold a packet
 * so if there are collisions, we will drop packets.  This is allocated
 * per-cpu and each entry consists of struct pkt_frag.
 */
#define FRAG_HASH_SHIFT		8
#define FRAG_HASH_ENTRIES	BIT(FRAG_HASH_SHIFT)
#define FRAG_HASH_SEGS		((sizeof(u32) * 8) / FRAG_HASH_SHIFT)

#define CLEAN_PERCPU_INTERVAL (30 * HZ)

struct pkt_key {
	__be32 saddr;
	__be32 daddr;
	__be32 pkt_seq;
	u32 mark;
};

struct pkt_frag {
	struct sk_buff *skbs;
	unsigned long timestamp;
	struct list_head lru_node;
	struct pkt_key key;
};

struct stt_percpu {
	struct pkt_frag *frag_hash;
	struct list_head frag_lru;
	unsigned int frag_mem_used;

	/* Protect frags table. */
	spinlock_t lock;
};

struct first_frag {
	struct sk_buff *last_skb;
	unsigned int mem_used;
	u16 tot_len;
	u16 rcvd_len;
	bool set_ecn_ce;
};

struct frag_skb_cb {
	u16 offset;

	/* Only valid for the first skb in the chain. */
	struct first_frag first;
};
#define FRAG_CB(skb) ((struct frag_skb_cb *)(skb)->cb)
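/* Note (added for clarity): FRAG_CB() keeps per-fragment reassembly state in
 * skb->cb, the 48-byte scratch area every sk_buff already carries, so no
 * extra allocation is needed per fragment; struct frag_skb_cb must therefore
 * fit within sizeof(skb->cb).
 */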
/* per-network namespace private data for this module */
struct stt_net {
	struct list_head stt_list;
	struct list_head stt_up_list;	/* Devices which are in IFF_UP state. */
	int n_tunnels;
#ifdef HAVE_NF_REGISTER_NET_HOOK
	bool nf_hook_reg_done;
#endif
};

static int stt_net_id;

static struct stt_percpu __percpu *stt_percpu_data __read_mostly;
static u32 frag_hash_seed __read_mostly;

/* Protects sock-hash and refcounts. */
static DEFINE_MUTEX(stt_mutex);

static int n_tunnels;
static DEFINE_PER_CPU(u32, pkt_seq_counter);

static void clean_percpu(struct work_struct *work);
static DECLARE_DELAYED_WORK(clean_percpu_wq, clean_percpu);

static struct stt_dev *stt_find_up_dev(struct net *net, __be16 port)
{
	struct stt_net *sn = net_generic(net, stt_net_id);
	struct stt_dev *stt_dev;

	list_for_each_entry_rcu(stt_dev, &sn->stt_up_list, up_next) {
		if (stt_dev->dst_port == port)
			return stt_dev;
	}
	return NULL;
}
static __be32 ack_seq(void)
{
#if NR_CPUS <= 65536
	u32 pkt_seq, ack;

	pkt_seq = this_cpu_read(pkt_seq_counter);
	ack = pkt_seq << ilog2(NR_CPUS) | smp_processor_id();
	this_cpu_inc(pkt_seq_counter);

	return (__force __be32)ack;
#else
#error "Support for greater than 64k CPUs not implemented"
#endif
}
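/* Illustrative example (added; numbers are hypothetical): with NR_CPUS = 64,
 * the 10th packet sent from CPU 3 gets ack = (10 << 6) | 3.  Folding the CPU
 * id into the low bits makes the per-packet identifier unique across CPUs
 * without a shared counter, which reassemble() relies on when it keys
 * fragments by tcph->ack_seq.
 */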
static int clear_gso(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int err;

	if (shinfo->gso_type == 0 && shinfo->gso_size == 0 &&
	    shinfo->gso_segs == 0)
		return 0;

	err = skb_unclone(skb, GFP_ATOMIC);
	if (unlikely(err))
		return err;

	shinfo = skb_shinfo(skb);
	shinfo->gso_type = 0;
	shinfo->gso_size = 0;
	shinfo->gso_segs = 0;
	return 0;
}

static void copy_skb_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->protocol = from->protocol;
	to->tstamp = from->tstamp;
	to->priority = from->priority;
	to->mark = from->mark;
	to->vlan_tci = from->vlan_tci;
	to->vlan_proto = from->vlan_proto;
	skb_copy_secmark(to, from);
}
static void update_headers(struct sk_buff *skb, bool head,
			   unsigned int l4_offset, unsigned int hdr_len,
			   bool ipv4, u32 tcp_seq)
{
	u16 old_len, new_len;
	__be32 delta;
	struct tcphdr *tcph;
	int gso_size;

	if (ipv4) {
		struct iphdr *iph = (struct iphdr *)(skb->data + ETH_HLEN);

		old_len = ntohs(iph->tot_len);
		new_len = skb->len - ETH_HLEN;
		iph->tot_len = htons(new_len);
		ip_send_check(iph);
	} else {
		struct ipv6hdr *ip6h = (struct ipv6hdr *)(skb->data + ETH_HLEN);

		old_len = ntohs(ip6h->payload_len);
		new_len = skb->len - ETH_HLEN - sizeof(struct ipv6hdr);
		ip6h->payload_len = htons(new_len);
	}

	tcph = (struct tcphdr *)(skb->data + l4_offset);
	if (!head)
		tcph->seq = htonl(tcp_seq);

	/* Fold the length change into the existing TCP checksum. */
	delta = htonl(~old_len + new_len);
	tcph->check = ~csum_fold((__force __wsum)((__force u32)tcph->check +
						  (__force u32)delta));

	gso_size = skb_shinfo(skb)->gso_size;
	if (gso_size && skb->len - hdr_len <= gso_size)
		BUG_ON(clear_gso(skb));
}
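/* Background note (added): the adjustment above follows RFC 1624 incremental
 * checksum update: for a 16-bit field change m -> m' the new checksum is
 * HC' = ~(~HC + ~m + m').  With example values old_len = 3000 and
 * new_len = 1500, delta = htonl(~3000 + 1500) is folded into tcph->check,
 * avoiding a full recompute over the payload.
 */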
static bool can_segment(struct sk_buff *head, bool ipv4, bool tcp, bool csum_partial)
{
	/* If no offloading is in use then we don't have enough information
	 * to process the headers.
	 */
	if (!csum_partial)
		goto linearize;

	/* Handling UDP packets requires IP fragmentation, which means that
	 * the L4 checksum can no longer be calculated by hardware (since the
	 * fragments are in different packets).  If we have to compute the
	 * checksum it's faster just to linearize, and large UDP packets are
	 * pretty uncommon anyway, so it's not worth dealing with for now.
	 */
	if (!tcp)
		goto linearize;

	if (ipv4) {
		struct iphdr *iph = (struct iphdr *)(head->data + ETH_HLEN);

		/* It's difficult to get the IP IDs exactly right here due to
		 * varying segment sizes and potentially multiple layers of
		 * segmentation.  IP ID isn't important when DF is set and DF
		 * is generally set for TCP packets, so just linearize if it's
		 * not.
		 */
		if (!(iph->frag_off & htons(IP_DF)))
			goto linearize;
	} else {
		struct ipv6hdr *ip6h = (struct ipv6hdr *)(head->data + ETH_HLEN);

		/* Jumbograms require more processing to update and we'll
		 * probably never see them, so just linearize.
		 */
		if (ip6h->payload_len == 0)
			goto linearize;
	}
	return true;

linearize:
	return false;
}

static int copy_headers(struct sk_buff *head, struct sk_buff *frag,
			int hdr_len)
{
	u16 csum_start;

	if (skb_cloned(frag) || skb_headroom(frag) < hdr_len) {
		int extra_head = hdr_len - skb_headroom(frag);

		extra_head = extra_head > 0 ? extra_head : 0;
		if (unlikely(pskb_expand_head(frag, extra_head, 0,
					      GFP_ATOMIC)))
			return -ENOMEM;
	}

	memcpy(__skb_push(frag, hdr_len), head->data, hdr_len);

	csum_start = head->csum_start - skb_headroom(head);
	frag->csum_start = skb_headroom(frag) + csum_start;
	frag->csum_offset = head->csum_offset;
	frag->ip_summed = head->ip_summed;

	skb_shinfo(frag)->gso_size = skb_shinfo(head)->gso_size;
	skb_shinfo(frag)->gso_type = skb_shinfo(head)->gso_type;
	skb_shinfo(frag)->gso_segs = 0;

	copy_skb_metadata(frag, head);
	return 0;
}
static int skb_list_segment(struct sk_buff *head, bool ipv4, int l4_offset)
{
	struct sk_buff *skb;
	struct tcphdr *tcph;
	int seg_len;
	int hdr_len;
	int tcp_len;
	u32 seq;

	if (unlikely(!pskb_may_pull(head, l4_offset + sizeof(*tcph))))
		return -ENOMEM;

	tcph = (struct tcphdr *)(head->data + l4_offset);
	tcp_len = tcph->doff * 4;
	hdr_len = l4_offset + tcp_len;

	if (unlikely((tcp_len < sizeof(struct tcphdr)) ||
		     (head->len < hdr_len)))
		return -EINVAL;

	if (unlikely(!pskb_may_pull(head, hdr_len)))
		return -ENOMEM;

	tcph = (struct tcphdr *)(head->data + l4_offset);
	/* Update header of each segment. */
	seq = ntohl(tcph->seq);
	seg_len = skb_pagelen(head) - hdr_len;

	skb = skb_shinfo(head)->frag_list;
	skb_shinfo(head)->frag_list = NULL;

	for (; skb; skb = skb->next) {
		int err;

		head->len -= skb->len;
		head->data_len -= skb->len;
		head->truesize -= skb->truesize;

		seq += seg_len;
		seg_len = skb->len;

		err = copy_headers(head, skb, hdr_len);
		if (err)
			return err;
		update_headers(skb, false, l4_offset, hdr_len, ipv4, seq);
	}
	update_headers(head, true, l4_offset, hdr_len, ipv4, 0);
	return 0;
}
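/* Worked example (added; sizes are illustrative): if the head carries
 * seq = 1000 with 1448 bytes of paged data and the frag_list holds two more
 * 1448-byte skbs, the loop above stamps the first frag_list member with
 * seq 1000 + 1448 = 2448 and the second with 2448 + 1448 = 3896, then copies
 * the head's L2/L3/L4 headers in front of each, reproducing what hardware
 * TSO would have emitted.
 */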
#ifndef SKIP_ZERO_COPY
static struct sk_buff *normalize_frag_list(struct sk_buff *head,
					   struct sk_buff **skbp)
{
	struct sk_buff *skb = *skbp;
	struct sk_buff *last;

	do {
		struct sk_buff *frags;

		if (skb_shared(skb)) {
			struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

			if (unlikely(!nskb))
				return ERR_PTR(-ENOMEM);

			nskb->next = skb->next;
			consume_skb(skb);
			skb = nskb;
			*skbp = skb;
		}

		if (head) {
			head->len -= skb->len;
			head->data_len -= skb->len;
			head->truesize -= skb->truesize;
		}

		frags = skb_shinfo(skb)->frag_list;
		if (frags) {
			int err;

			err = skb_unclone(skb, GFP_ATOMIC);
			if (unlikely(err))
				return ERR_PTR(err);

			last = normalize_frag_list(skb, &frags);
			if (IS_ERR(last))
				return last;

			skb_shinfo(skb)->frag_list = NULL;
			last->next = skb->next;
			skb->next = frags;
		} else {
			last = skb;
		}

		skbp = &skb->next;
	} while ((skb = skb->next));

	return last;
}

/* Takes a linked list of skbs, which potentially contain frag_list
 * (whose members in turn potentially contain frag_lists, etc.) and
 * converts them into a single linear linked list.
 */
static int straighten_frag_list(struct sk_buff **skbp)
{
	struct sk_buff *err_skb;

	err_skb = normalize_frag_list(NULL, skbp);
	if (IS_ERR(err_skb))
		return PTR_ERR(err_skb);

	return 0;
}

static int coalesce_skb(struct sk_buff **headp)
{
	struct sk_buff *frag, *head, *prev;
	int err;

	err = straighten_frag_list(headp);
	if (unlikely(err))
		return err;
	head = *headp;

	/* Coalesce frag list. */
	prev = head;
	for (frag = head->next; frag; frag = frag->next) {
		bool headstolen;
		int delta;

		if (unlikely(skb_unclone(prev, GFP_ATOMIC)))
			return -ENOMEM;

		if (!skb_try_coalesce(prev, frag, &headstolen, &delta)) {
			prev = frag;
			continue;
		}

		prev->next = frag->next;
		frag->truesize -= delta;
		kfree_skb_partial(frag, headstolen);
		frag = prev;
	}

	/* Absorb anything that could not be coalesced as a frag_list. */
	for (frag = head->next; frag; frag = frag->next) {
		head->len += frag->len;
		head->data_len += frag->len;
		head->truesize += frag->truesize;
	}

	skb_shinfo(head)->frag_list = head->next;
	head->next = NULL;
	return 0;
}
#else
static int coalesce_skb(struct sk_buff **headp)
{
	struct sk_buff *frag, *head = *headp, *next;
	int delta = FRAG_CB(head)->first.tot_len - skb_headlen(head);
	int err;

	if (unlikely(!head->next))
		return 0;

	err = pskb_expand_head(head, 0, delta, GFP_ATOMIC);
	if (unlikely(err))
		return err;

	if (unlikely(!__pskb_pull_tail(head, head->data_len)))
		return -ENOMEM;

	for (frag = head->next; frag; frag = next) {
		skb_copy_bits(frag, 0, skb_put(head, frag->len), frag->len);
		next = frag->next;
		kfree_skb(frag);
	}

	head->next = NULL;
	head->truesize = SKB_TRUESIZE(head->len);
	return 0;
}
#endif

static int __try_to_segment(struct sk_buff *skb, bool csum_partial,
			    bool ipv4, bool tcp, int l4_offset)
{
	if (can_segment(skb, ipv4, tcp, csum_partial))
		return skb_list_segment(skb, ipv4, l4_offset);
	else
		return skb_linearize(skb);
}

static int try_to_segment(struct sk_buff *skb)
{
	struct stthdr *stth = stt_hdr(skb);
	bool csum_partial = !!(stth->flags & STT_CSUM_PARTIAL);
	bool ipv4 = !!(stth->flags & STT_PROTO_IPV4);
	bool tcp = !!(stth->flags & STT_PROTO_TCP);
	int l4_offset = stth->l4_offset;

	return __try_to_segment(skb, csum_partial, ipv4, tcp, l4_offset);
}

static int segment_skb(struct sk_buff **headp, bool csum_partial,
		       bool ipv4, bool tcp, int l4_offset)
{
#ifndef SKIP_ZERO_COPY
	int err;

	err = coalesce_skb(headp);
	if (err)
		return err;
#endif

	if (skb_shinfo(*headp)->frag_list)
		return __try_to_segment(*headp, csum_partial,
					ipv4, tcp, l4_offset);
	return 0;
}
static int __push_stt_header(struct sk_buff *skb, __be64 tun_id,
			     __be16 s_port, __be16 d_port,
			     __be32 saddr, __be32 dst,
			     __be16 l3_proto, u8 l4_proto,
			     int dst_mtu)
{
	int data_len = skb->len + sizeof(struct stthdr) + STT_ETH_PAD;
	unsigned short encap_mss;
	struct tcphdr *tcph;
	struct stthdr *stth;

	skb_push(skb, STT_HEADER_LEN);
	skb_reset_transport_header(skb);
	tcph = tcp_hdr(skb);
	memset(tcph, 0, STT_HEADER_LEN);
	stth = stt_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		stth->flags |= STT_CSUM_PARTIAL;

		stth->l4_offset = skb->csum_start -
					(skb_headroom(skb) + STT_HEADER_LEN);

		if (l3_proto == htons(ETH_P_IP))
			stth->flags |= STT_PROTO_IPV4;

		if (l4_proto == IPPROTO_TCP)
			stth->flags |= STT_PROTO_TCP;

		stth->mss = htons(skb_shinfo(skb)->gso_size);
	} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
		stth->flags |= STT_CSUM_VERIFIED;
	}

	stth->vlan_tci = htons(skb->vlan_tci);
	skb->vlan_tci = 0;
	put_unaligned(tun_id, &stth->key);

	tcph->source	= s_port;
	tcph->dest	= d_port;
	tcph->doff	= sizeof(struct tcphdr) / 4;
	tcph->ack	= 1;
	tcph->psh	= 1;
	tcph->window	= htons(USHRT_MAX);
	tcph->seq	= htonl(data_len << STT_SEQ_LEN_SHIFT);
	tcph->ack_seq	= ack_seq();
	tcph->check	= ~tcp_v4_check(skb->len, saddr, dst, 0);

	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
	skb->ip_summed = CHECKSUM_PARTIAL;

	encap_mss = dst_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	if (data_len > encap_mss) {
		if (unlikely(skb_unclone(skb, GFP_ATOMIC)))
			return -ENOMEM;

		skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
		skb_shinfo(skb)->gso_size = encap_mss;
		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(data_len, encap_mss);
	} else {
		if (unlikely(clear_gso(skb)))
			return -ENOMEM;
	}
	return 0;
}
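/* Summary note (added): the "TCP" header built above is a facade that lets
 * NICs apply their ordinary TCP offloads to tunnel traffic.  Its fields are
 * reused rather than honored: seq carries (payload length << 16) | fragment
 * offset, ack_seq carries the per-CPU packet id from ack_seq(), and window
 * is pinned at USHRT_MAX.  No handshake ever takes place, which is why the
 * receive path intercepts packets at the LOCAL_IN netfilter hook instead of
 * using a real TCP socket.
 */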
static struct sk_buff *push_stt_header(struct sk_buff *head, __be64 tun_id,
				       __be16 s_port, __be16 d_port,
				       __be32 saddr, __be32 dst,
				       __be16 l3_proto, u8 l4_proto,
				       int dst_mtu)
{
	struct sk_buff *skb;

	if (skb_shinfo(head)->frag_list) {
		bool ipv4 = (l3_proto == htons(ETH_P_IP));
		bool tcp = (l4_proto == IPPROTO_TCP);
		bool csum_partial = (head->ip_summed == CHECKSUM_PARTIAL);
		int l4_offset = skb_transport_offset(head);

		/* Need to call skb_orphan() to report correct true-size.
		 * Calling skb_orphan() in this layer is odd, but an SKB with
		 * a frag-list should not be associated with any socket, so
		 * skb_orphan() should be a no-op. */
		skb_orphan(head);
		if (unlikely(segment_skb(&head, csum_partial,
					 ipv4, tcp, l4_offset)))
			goto free_frags;
	}

	for (skb = head; skb; skb = skb->next) {
		if (__push_stt_header(skb, tun_id, s_port, d_port, saddr, dst,
				      l3_proto, l4_proto, dst_mtu))
			goto free_frags;
	}

	return head;

free_frags:
	kfree_skb_list(head);
	return NULL;
}

static int stt_can_offload(struct sk_buff *skb, __be16 l3_proto, u8 l4_proto)
{
	if (skb_is_gso(skb) && skb->ip_summed != CHECKSUM_PARTIAL) {
		int csum_offset;
		__sum16 *csum;
		int len;

		if (l4_proto == IPPROTO_TCP)
			csum_offset = offsetof(struct tcphdr, check);
		else if (l4_proto == IPPROTO_UDP)
			csum_offset = offsetof(struct udphdr, check);
		else
			return 0;

		len = skb->len - skb_transport_offset(skb);
		csum = (__sum16 *)(skb_transport_header(skb) + csum_offset);

		if (unlikely(!pskb_may_pull(skb, skb_transport_offset(skb) +
						 csum_offset + sizeof(*csum))))
			return -ENOMEM;

		if (l3_proto == htons(ETH_P_IP)) {
			struct iphdr *iph = ip_hdr(skb);

			*csum = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
						   len, l4_proto, 0);
		} else if (l3_proto == htons(ETH_P_IPV6)) {
			struct ipv6hdr *ip6h = ipv6_hdr(skb);

			*csum = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
						 len, l4_proto, 0);
		} else {
			return 0;
		}

		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = csum_offset;
		skb->ip_summed = CHECKSUM_PARTIAL;
	}

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		/* Assume receiver can only offload TCP/UDP over IPv4/6,
		 * and require 802.1Q VLANs to be accelerated.
		 */
		if (l3_proto != htons(ETH_P_IP) &&
		    l3_proto != htons(ETH_P_IPV6))
			return 0;

		if (l4_proto != IPPROTO_TCP && l4_proto != IPPROTO_UDP)
			return 0;

		/* L4 offset must fit in a 1-byte field. */
		if (skb->csum_start - skb_headroom(skb) > 255)
			return 0;

		if (skb_shinfo(skb)->gso_type & ~SUPPORTED_GSO_TYPES)
			return 0;
	}

	/* Total size of encapsulated packet must fit in 16 bits. */
	if (skb->len + STT_HEADER_LEN + sizeof(struct iphdr) > 65535)
		return 0;

	if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q))
		return 0;
	return 1;
}
static bool need_linearize(const struct sk_buff *skb)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	if (unlikely(shinfo->frag_list))
		return true;

	/* Generally speaking we should linearize if there are paged frags.
	 * However, if all of the refcounts are 1 we know nobody else can
	 * change them from underneath us and we can skip the linearization.
	 */
	for (i = 0; i < shinfo->nr_frags; i++)
		if (unlikely(page_count(skb_frag_page(&shinfo->frags[i])) > 1))
			return true;

	return false;
}

static struct sk_buff *handle_offloads(struct sk_buff *skb, int min_headroom)
{
	int err;

	if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q)) {
		min_headroom += VLAN_HLEN;
		if (skb_headroom(skb) < min_headroom) {
			int head_delta = SKB_DATA_ALIGN(min_headroom -
							skb_headroom(skb) + 16);

			err = pskb_expand_head(skb, max_t(int, head_delta, 0),
					       0, GFP_ATOMIC);
			if (unlikely(err))
				goto error;
		}

		skb = __vlan_hwaccel_push_inside(skb);
		if (!skb) {
			err = -ENOMEM;
			goto error;
		}
	}

	if (skb_is_gso(skb)) {
		struct sk_buff *nskb;
		char cb[sizeof(skb->cb)];

		memcpy(cb, skb->cb, sizeof(cb));

		nskb = __skb_gso_segment(skb, 0, false);
		if (IS_ERR(nskb)) {
			err = PTR_ERR(nskb);
			goto error;
		}

		consume_skb(skb);
		skb = nskb;
		while (nskb) {
			memcpy(nskb->cb, cb, sizeof(cb));
			nskb = nskb->next;
		}
	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
		/* Pages aren't locked and could change at any time.
		 * If this happens after we compute the checksum, the
		 * checksum will be wrong.  We linearize now to avoid
		 * this problem.
		 */
		if (unlikely(need_linearize(skb))) {
			err = __skb_linearize(skb);
			if (unlikely(err))
				goto error;
		}

		err = skb_checksum_help(skb);
		if (unlikely(err))
			goto error;
	}
	skb->ip_summed = CHECKSUM_NONE;

	return skb;

error:
	kfree_skb(skb);
	return ERR_PTR(err);
}

static void skb_list_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src,
			  __be32 dst, __u8 tos, __u8 ttl, __be16 df)
{
	while (skb) {
		struct sk_buff *next = skb->next;

		if (next)
			dst_clone(&rt->dst);

		skb->next = NULL;
		iptunnel_xmit(NULL, rt, skb, src, dst, IPPROTO_TCP,
			      tos, ttl, df, false);

		skb = next;
	}
}
static u8 parse_ipv6_l4_proto(struct sk_buff *skb)
{
	unsigned int nh_ofs = skb_network_offset(skb);
	int payload_ofs;
	struct ipv6hdr *nh;
	uint8_t nexthdr;
	__be16 frag_off;

	if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct ipv6hdr))))
		return 0;

	nh = ipv6_hdr(skb);
	nexthdr = nh->nexthdr;
	payload_ofs = (u8 *)(nh + 1) - skb->data;

	payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off);
	if (unlikely(payload_ofs < 0))
		return 0;

	return nexthdr;
}

static u8 skb_get_l4_proto(struct sk_buff *skb, __be16 l3_proto)
{
	if (l3_proto == htons(ETH_P_IP)) {
		unsigned int nh_ofs = skb_network_offset(skb);

		if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct iphdr))))
			return 0;

		return ip_hdr(skb)->protocol;
	} else if (l3_proto == htons(ETH_P_IPV6)) {
		return parse_ipv6_l4_proto(skb);
	}
	return 0;
}
static int stt_xmit_skb(struct sk_buff *skb, struct rtable *rt,
			__be32 src, __be32 dst, __u8 tos,
			__u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
			__be64 tun_id)
{
	struct ethhdr *eh = eth_hdr(skb);
	int ret = 0, min_headroom;
	__be16 inner_l3_proto;
	u8 inner_l4_proto;

	inner_l3_proto = eh->h_proto;
	inner_l4_proto = skb_get_l4_proto(skb, inner_l3_proto);

	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
			+ STT_HEADER_LEN + sizeof(struct iphdr);

	if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
		int head_delta = SKB_DATA_ALIGN(min_headroom -
						skb_headroom(skb) + 16);

		ret = pskb_expand_head(skb, max_t(int, head_delta, 0),
				       0, GFP_ATOMIC);
		if (unlikely(ret))
			goto err_free_skb;
	}

	ret = stt_can_offload(skb, inner_l3_proto, inner_l4_proto);
	if (ret < 0)
		goto err_free_skb;
	if (!ret) {
		skb = handle_offloads(skb, min_headroom);
		if (IS_ERR(skb)) {
			ret = PTR_ERR(skb);
			skb = NULL;
			goto err_free_skb;
		}
	}

	while (skb) {
		struct sk_buff *next_skb = skb->next;

		skb->next = NULL;
		if (next_skb)
			dst_clone(&rt->dst);

		/* Push STT and TCP header. */
		skb = push_stt_header(skb, tun_id, src_port, dst_port, src,
				      dst, inner_l3_proto, inner_l4_proto,
				      dst_mtu(&rt->dst));
		if (unlikely(!skb)) {
			ret = -ENOMEM;
			skb = next_skb;
			goto err_free_skb;
		}

		/* Push IP header. */
		skb_list_xmit(rt, skb, src, dst, tos, ttl, df);

		skb = next_skb;
	}

	return 0;

err_free_skb:
	kfree_skb_list(skb);
	return ret;
}
static struct rtable *stt_get_rt(struct sk_buff *skb,
				 struct net_device *dev,
				 struct flowi4 *fl,
				 const struct ip_tunnel_key *key,
				 __be16 dport, __be16 sport)
{
	struct net *net = dev_net(dev);

	/* Route lookup. */
	memset(fl, 0, sizeof(*fl));
	fl->daddr = key->u.ipv4.dst;
	fl->saddr = key->u.ipv4.src;
	fl->flowi4_tos = RT_TOS(key->tos);
	fl->flowi4_mark = skb->mark;
	fl->flowi4_proto = IPPROTO_TCP;
	fl->fl4_dport = dport;
	fl->fl4_sport = sport;

	return ip_route_output_key(net, fl);
}
netdev_tx_t ovs_stt_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct stt_dev *stt_dev = netdev_priv(dev);
	struct net *net = stt_dev->net;
	__be16 dport = stt_dev->dst_port;
	struct ip_tunnel_key *tun_key;
	struct ip_tunnel_info *tun_info;
	struct rtable *rt;
	struct flowi4 fl;
	__be16 sport;
	__be16 df;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info))
		goto error;

	tun_key = &tun_info->key;

	sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
	rt = stt_get_rt(skb, dev, &fl, tun_key, dport, sport);
	if (IS_ERR(rt))
		goto error;

	df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;

	stt_xmit_skb(skb, rt, fl.saddr, tun_key->u.ipv4.dst,
		     tun_key->tos, tun_key->ttl,
		     df, sport, dport, tun_key->tun_id);
	return NETDEV_TX_OK;

error:
	kfree_skb(skb);
	dev->stats.tx_errors++;
	return NETDEV_TX_OK;
}
EXPORT_SYMBOL(ovs_stt_xmit);
static void free_frag(struct stt_percpu *stt_percpu,
		      struct pkt_frag *frag)
{
	stt_percpu->frag_mem_used -= FRAG_CB(frag->skbs)->first.mem_used;
	kfree_skb_list(frag->skbs);
	list_del(&frag->lru_node);
	frag->skbs = NULL;
}

static void evict_frags(struct stt_percpu *stt_percpu)
{
	while (!list_empty(&stt_percpu->frag_lru) &&
	       stt_percpu->frag_mem_used > REASM_LO_THRESH) {
		struct pkt_frag *frag;

		frag = list_first_entry(&stt_percpu->frag_lru,
					struct pkt_frag, lru_node);
		free_frag(stt_percpu, frag);
	}
}

static bool pkt_key_match(struct net *net,
			  const struct pkt_frag *a, const struct pkt_key *b)
{
	return a->key.saddr == b->saddr && a->key.daddr == b->daddr &&
	       a->key.pkt_seq == b->pkt_seq && a->key.mark == b->mark &&
	       net_eq(dev_net(a->skbs->dev), net);
}
static u32 pkt_key_hash(const struct net *net, const struct pkt_key *key)
{
	u32 initval = frag_hash_seed ^ (u32)(unsigned long)net ^ key->mark;

	return jhash_3words((__force u32)key->saddr, (__force u32)key->daddr,
			    (__force u32)key->pkt_seq, initval);
}
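/* Note (added): the single 32-bit hash computed here feeds up to
 * FRAG_HASH_SEGS probes in lookup_frag() below -- with FRAG_HASH_SHIFT = 8
 * that is four independent 8-bit slot indices, consumed one at a time via
 * hash >>= FRAG_HASH_SHIFT.  This gives limited open addressing over the
 * single-slot hash table without storing collision chains.
 */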
static struct pkt_frag *lookup_frag(struct net *net,
				    struct stt_percpu *stt_percpu,
				    const struct pkt_key *key, u32 hash)
{
	struct pkt_frag *frag, *victim_frag = NULL;
	int i;

	for (i = 0; i < FRAG_HASH_SEGS; i++) {
		frag = &stt_percpu->frag_hash[hash & (FRAG_HASH_ENTRIES - 1)];

		if (frag->skbs &&
		    time_before(jiffies, frag->timestamp + FRAG_EXP_TIME) &&
		    pkt_key_match(net, frag, key))
			return frag;

		if (!victim_frag ||
		    (victim_frag->skbs &&
		     (!frag->skbs ||
		      time_before(frag->timestamp, victim_frag->timestamp))))
			victim_frag = frag;

		hash >>= FRAG_HASH_SHIFT;
	}

	if (victim_frag->skbs)
		free_frag(stt_percpu, victim_frag);

	return victim_frag;
}

#ifdef SKIP_ZERO_COPY
static int __copy_skb(struct sk_buff *to, struct sk_buff *from,
		      int *delta, bool *headstolen)
{
	int err;

	if (unlikely(to->next))
		return -EINVAL;

	if (unlikely(FRAG_CB(to)->offset))
		return -EINVAL;

	if (unlikely(skb_unclone(to, GFP_ATOMIC)))
		return -ENOMEM;

	if (skb_try_coalesce(to, from, headstolen, delta))
		return 0;

	*headstolen = false;
	err = pskb_expand_head(to, 0, to->data_len + from->len, GFP_ATOMIC);
	if (unlikely(err))
		return err;

	if (unlikely(!__pskb_pull_tail(to, to->data_len)))
		return -ENOMEM;

	skb_copy_bits(from, 0, skb_put(to, from->len), from->len);

	*delta = from->len;
	to->truesize += from->len;
	return 0;
}
#else
static int __copy_skb(struct sk_buff *to, struct sk_buff *from,
		      int *delta, bool *headstolen)
{
	*headstolen = false;
	return -EINVAL;
}
#endif
static struct sk_buff *reassemble(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *tcph = tcp_hdr(skb);
	u32 seq = ntohl(tcph->seq);
	struct stt_percpu *stt_percpu;
	struct sk_buff *last_skb, *copied_skb = NULL;
	struct pkt_frag *frag;
	struct pkt_key key;
	int tot_len, delta = skb->truesize;
	bool headstolen;
	u32 hash;

	tot_len = seq >> STT_SEQ_LEN_SHIFT;
	FRAG_CB(skb)->offset = seq & STT_SEQ_OFFSET_MASK;

	if (unlikely(skb->len == 0))
		goto out_free;

	if (unlikely(FRAG_CB(skb)->offset + skb->len > tot_len))
		goto out_free;

	if (tot_len == skb->len)
		goto out;

	key.saddr = iph->saddr;
	key.daddr = iph->daddr;
	key.pkt_seq = tcph->ack_seq;
	key.mark = skb->mark;
	hash = pkt_key_hash(dev_net(skb->dev), &key);

	stt_percpu = per_cpu_ptr(stt_percpu_data, smp_processor_id());

	spin_lock(&stt_percpu->lock);

	if (unlikely(stt_percpu->frag_mem_used + skb->truesize > REASM_HI_THRESH))
		evict_frags(stt_percpu);

	frag = lookup_frag(dev_net(skb->dev), stt_percpu, &key, hash);
	if (!frag->skbs) {
		frag->skbs = skb;
		frag->key = key;
		frag->timestamp = jiffies;
		FRAG_CB(skb)->first.last_skb = skb;
		FRAG_CB(skb)->first.mem_used = skb->truesize;
		FRAG_CB(skb)->first.tot_len = tot_len;
		FRAG_CB(skb)->first.rcvd_len = skb->len;
		FRAG_CB(skb)->first.set_ecn_ce = false;
		list_add_tail(&frag->lru_node, &stt_percpu->frag_lru);
		stt_percpu->frag_mem_used += skb->truesize;

		skb = NULL;
		goto unlock;
	}

	/* Optimize for the common case where fragments are received in-order
	 * and not overlapping.
	 */
	last_skb = FRAG_CB(frag->skbs)->first.last_skb;
	if (likely(FRAG_CB(last_skb)->offset + last_skb->len ==
		   FRAG_CB(skb)->offset)) {
		if (!__copy_skb(frag->skbs, skb, &delta, &headstolen)) {
			copied_skb = skb;
		} else {
			last_skb->next = skb;
			FRAG_CB(frag->skbs)->first.last_skb = skb;
		}
	} else {
		struct sk_buff *prev = NULL, *next;

		for (next = frag->skbs; next; next = next->next) {
			if (FRAG_CB(next)->offset >= FRAG_CB(skb)->offset)
				break;
			prev = next;
		}

		/* Overlapping fragments aren't allowed.  We shouldn't start
		 * before the end of the previous fragment.
		 */
		if (prev &&
		    FRAG_CB(prev)->offset + prev->len > FRAG_CB(skb)->offset)
			goto unlock_free;

		/* We also shouldn't end after the beginning of the next
		 * fragment.
		 */
		if (next &&
		    FRAG_CB(skb)->offset + skb->len > FRAG_CB(next)->offset)
			goto unlock_free;

		if (prev) {
			prev->next = skb;
		} else {
			FRAG_CB(skb)->first = FRAG_CB(frag->skbs)->first;
			frag->skbs = skb;
		}

		if (next)
			skb->next = next;
		else
			FRAG_CB(frag->skbs)->first.last_skb = skb;
	}

	FRAG_CB(frag->skbs)->first.set_ecn_ce |= INET_ECN_is_ce(iph->tos);
	FRAG_CB(frag->skbs)->first.rcvd_len += skb->len;
	stt_percpu->frag_mem_used += delta;
	FRAG_CB(frag->skbs)->first.mem_used += delta;

	if (FRAG_CB(frag->skbs)->first.tot_len ==
	    FRAG_CB(frag->skbs)->first.rcvd_len) {
		struct sk_buff *frag_head = frag->skbs;

		frag_head->tstamp = skb->tstamp;
		if (FRAG_CB(frag_head)->first.set_ecn_ce)
			INET_ECN_set_ce(frag_head);

		list_del(&frag->lru_node);
		stt_percpu->frag_mem_used -= FRAG_CB(frag_head)->first.mem_used;
		frag->skbs = NULL;
		skb = frag_head;
	} else {
		list_move_tail(&frag->lru_node, &stt_percpu->frag_lru);
		skb = NULL;
	}

	if (copied_skb)
		kfree_skb_partial(copied_skb, headstolen);
	goto unlock;

unlock_free:
	kfree_skb(skb);
	skb = NULL;
unlock:
	spin_unlock(&stt_percpu->lock);
	return skb;

out_free:
	kfree_skb(skb);
	skb = NULL;
out:
	return skb;
}
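/* Walk-through (added; sizes are illustrative): suppose a 3000-byte STT frame
 * arrives as two 1500-byte TCP segments.  The first carries
 * seq = (3000 << 16) | 0; reassemble() stores it as a new pkt_frag with
 * rcvd_len = 1500.  The second carries seq = (3000 << 16) | 1500, extends the
 * chain (or is merged in place by __copy_skb()), and once rcvd_len reaches
 * tot_len the completed chain is returned and the hash slot is released.
 */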
static bool validate_checksum(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	if (skb_csum_unnecessary(skb))
		return true;

	if (skb->ip_summed == CHECKSUM_COMPLETE &&
	    !tcp_v4_check(skb->len, iph->saddr, iph->daddr, skb->csum))
		return true;

	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, skb->len,
				       IPPROTO_TCP, 0);

	return __skb_checksum_complete(skb) == 0;
}

static bool set_offloads(struct sk_buff *skb)
{
	struct stthdr *stth = stt_hdr(skb);
	unsigned int gso_type = 0;
	int l3_header_size;
	int l4_header_size;
	u16 csum_offset;
	u8 proto_type;

	if (stth->vlan_tci)
		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
				       ntohs(stth->vlan_tci));

	if (!(stth->flags & STT_CSUM_PARTIAL)) {
		if (stth->flags & STT_CSUM_VERIFIED)
			skb->ip_summed = CHECKSUM_UNNECESSARY;
		else
			skb->ip_summed = CHECKSUM_NONE;

		return clear_gso(skb) == 0;
	}

	proto_type = stth->flags & STT_PROTO_TYPES;

	switch (proto_type) {
	case (STT_PROTO_IPV4 | STT_PROTO_TCP):
		/* TCP/IPv4 */
		csum_offset = offsetof(struct tcphdr, check);
		gso_type = SKB_GSO_TCPV4;
		l3_header_size = sizeof(struct iphdr);
		l4_header_size = sizeof(struct tcphdr);
		skb->protocol = htons(ETH_P_IP);
		break;
	case STT_PROTO_TCP:
		/* TCP/IPv6 */
		csum_offset = offsetof(struct tcphdr, check);
		gso_type = SKB_GSO_TCPV6;
		l3_header_size = sizeof(struct ipv6hdr);
		l4_header_size = sizeof(struct tcphdr);
		skb->protocol = htons(ETH_P_IPV6);
		break;
	case STT_PROTO_IPV4:
		/* UDP/IPv4 */
		csum_offset = offsetof(struct udphdr, check);
#ifdef HAVE_SKB_GSO_UDP
		gso_type = SKB_GSO_UDP;
#endif
		l3_header_size = sizeof(struct iphdr);
		l4_header_size = sizeof(struct udphdr);
		skb->protocol = htons(ETH_P_IP);
		break;
	default:
		/* UDP/IPv6 */
		csum_offset = offsetof(struct udphdr, check);
#ifdef HAVE_SKB_GSO_UDP
		gso_type = SKB_GSO_UDP;
#endif
		l3_header_size = sizeof(struct ipv6hdr);
		l4_header_size = sizeof(struct udphdr);
		skb->protocol = htons(ETH_P_IPV6);
	}

	if (unlikely(stth->l4_offset < ETH_HLEN + l3_header_size))
		return false;

	if (unlikely(!pskb_may_pull(skb, stth->l4_offset + l4_header_size)))
		return false;

	stth = stt_hdr(skb);

	skb->csum_start = skb_headroom(skb) + stth->l4_offset;
	skb->csum_offset = csum_offset;
	skb->ip_summed = CHECKSUM_PARTIAL;

	if (stth->mss) {
		if (unlikely(skb_unclone(skb, GFP_ATOMIC)))
			return false;

		skb_shinfo(skb)->gso_type = gso_type | SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_size = ntohs(stth->mss);
		skb_shinfo(skb)->gso_segs = 0;
	} else {
		if (unlikely(clear_gso(skb)))
			return false;
	}

	return true;
}

static void rcv_list(struct net_device *dev, struct sk_buff *skb,
		     struct metadata_dst *tun_dst)
{
	struct sk_buff *next;

	do {
		next = skb->next;
		skb->next = NULL;
		if (next) {
			ovs_dst_hold((struct dst_entry *)tun_dst);
			ovs_skb_dst_set(next, (struct dst_entry *)tun_dst);
		}
		ovs_ip_tunnel_rcv(dev, skb, tun_dst);
	} while ((skb = next));
}
#ifndef USE_UPSTREAM_TUNNEL
static int __stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
{
	struct metadata_dst tun_dst;

	ovs_ip_tun_rx_dst(&tun_dst, skb, TUNNEL_KEY | TUNNEL_CSUM,
			  get_unaligned(&stt_hdr(skb)->key), 0);
	tun_dst.u.tun_info.key.tp_src = tcp_hdr(skb)->source;
	tun_dst.u.tun_info.key.tp_dst = tcp_hdr(skb)->dest;

	rcv_list(stt_dev->dev, skb, &tun_dst);
	return 0;
}
#else
static int __stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
{
	struct metadata_dst *tun_dst;
	__be16 flags;
	__be64 tun_id;

	flags = TUNNEL_KEY | TUNNEL_CSUM;
	tun_id = get_unaligned(&stt_hdr(skb)->key);
	tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
	if (!tun_dst)
		return -ENOMEM;
	tun_dst->u.tun_info.key.tp_src = tcp_hdr(skb)->source;
	tun_dst->u.tun_info.key.tp_dst = tcp_hdr(skb)->dest;

	rcv_list(stt_dev->dev, skb, tun_dst);
	return 0;
}
#endif

static void stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
{
	int err;

	if (unlikely(!validate_checksum(skb)))
		goto drop;

	__skb_pull(skb, sizeof(struct tcphdr));
	skb = reassemble(skb);
	if (!skb)
		return;

	if (skb->next && coalesce_skb(&skb))
		goto drop;

	err = iptunnel_pull_header(skb,
				   sizeof(struct stthdr) + STT_ETH_PAD,
				   htons(ETH_P_TEB),
				   !net_eq(stt_dev->net, dev_net(stt_dev->dev)));
	if (unlikely(err))
		goto drop;

	if (unlikely(stt_hdr(skb)->version != 0))
		goto drop;

	if (unlikely(!set_offloads(skb)))
		goto drop;

	if (skb_shinfo(skb)->frag_list && try_to_segment(skb))
		goto drop;

	err = __stt_rcv(stt_dev, skb);
	if (err)
		goto drop;
	return;

drop:
	/* Consume bad packet */
	kfree_skb_list(skb);
	stt_dev->dev->stats.rx_errors++;
}
static void tcp_sock_release(struct socket *sock)
{
	kernel_sock_shutdown(sock, SHUT_RDWR);
	sock_release(sock);
}

static int tcp_sock_create4(struct net *net, __be16 port,
			    struct socket **sockp)
{
	struct sockaddr_in tcp_addr;
	struct socket *sock = NULL;
	int err;

	err = sock_create_kern(net, AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
	if (err < 0)
		goto error;

	memset(&tcp_addr, 0, sizeof(tcp_addr));
	tcp_addr.sin_family = AF_INET;
	tcp_addr.sin_addr.s_addr = htonl(INADDR_ANY);
	tcp_addr.sin_port = port;
	err = kernel_bind(sock, (struct sockaddr *)&tcp_addr,
			  sizeof(tcp_addr));
	if (err < 0)
		goto error;

	*sockp = sock;
	return 0;

error:
	if (sock)
		tcp_sock_release(sock);
	*sockp = NULL;
	return err;
}

static void schedule_clean_percpu(void)
{
	schedule_delayed_work(&clean_percpu_wq, CLEAN_PERCPU_INTERVAL);
}
static void clean_percpu(struct work_struct *work)
{
	int i;

	for_each_possible_cpu(i) {
		struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
		int j;

		for (j = 0; j < FRAG_HASH_ENTRIES; j++) {
			struct pkt_frag *frag;

			frag = &stt_percpu->frag_hash[j];
			if (!frag->skbs ||
			    time_before(jiffies, frag->timestamp + FRAG_EXP_TIME))
				continue;

			spin_lock_bh(&stt_percpu->lock);

			if (frag->skbs &&
			    time_after(jiffies, frag->timestamp + FRAG_EXP_TIME))
				free_frag(stt_percpu, frag);

			spin_unlock_bh(&stt_percpu->lock);
		}
	}
	schedule_clean_percpu();
}
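/* Note (added): the expiry test above runs twice on purpose.  The first,
 * lock-free check cheaply skips the vast majority of empty or fresh slots;
 * only candidates take the per-CPU spinlock, where the condition is
 * re-checked because the receive path may have completed or replaced the
 * fragment in the meantime -- the classic double-checked pattern for
 * periodic garbage collection.
 */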
#ifdef HAVE_NF_HOOKFN_ARG_OPS
#define FIRST_PARAM const struct nf_hook_ops *ops
#else
#ifdef HAVE_NF_HOOKFN_ARG_PRIV
#define FIRST_PARAM void *priv
#else
#define FIRST_PARAM unsigned int hooknum
#endif
#endif

#ifdef HAVE_NF_HOOK_STATE
#if RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,0) && RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,0)
/* RHEL nfhook hacks. */
#ifndef __GENKSYMS__
#define LAST_PARAM const struct net_device *in, const struct net_device *out, \
		   const struct nf_hook_state *state
#else
#define LAST_PARAM const struct net_device *in, const struct net_device *out, \
		   int (*okfn)(struct sk_buff *)
#endif
#else
#define LAST_PARAM const struct nf_hook_state *state
#endif
#else
#define LAST_PARAM const struct net_device *in, const struct net_device *out, \
		   int (*okfn)(struct sk_buff *)
#endif

static unsigned int nf_ip_hook(FIRST_PARAM, struct sk_buff *skb, LAST_PARAM)
{
	struct stt_dev *stt_dev;
	int ip_hdr_len;

	if (ip_hdr(skb)->protocol != IPPROTO_TCP)
		return NF_ACCEPT;

	ip_hdr_len = ip_hdrlen(skb);
	if (unlikely(!pskb_may_pull(skb, ip_hdr_len + sizeof(struct tcphdr))))
		return NF_ACCEPT;

	skb_set_transport_header(skb, ip_hdr_len);

	stt_dev = stt_find_up_dev(dev_net(skb->dev), tcp_hdr(skb)->dest);
	if (!stt_dev)
		return NF_ACCEPT;

	__skb_pull(skb, ip_hdr_len);
	stt_rcv(stt_dev, skb);
	return NF_STOLEN;
}

static struct nf_hook_ops nf_hook_ops __read_mostly = {
	.hook		= nf_ip_hook,
#ifdef HAVE_NF_HOOKS_OPS_OWNER
	.owner		= THIS_MODULE,
#endif
	.pf		= NFPROTO_IPV4,
	.hooknum	= NF_INET_LOCAL_IN,
	.priority	= INT_MAX,
};
static int stt_start(struct net *net)
{
	struct stt_net *sn = net_generic(net, stt_net_id);
	int err;
	int i;

	if (n_tunnels) {
		n_tunnels++;
		goto out;
	}

	get_random_bytes(&frag_hash_seed, sizeof(u32));

	stt_percpu_data = alloc_percpu(struct stt_percpu);
	if (!stt_percpu_data) {
		err = -ENOMEM;
		goto error;
	}

	for_each_possible_cpu(i) {
		struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
		struct pkt_frag *frag_hash;

		spin_lock_init(&stt_percpu->lock);
		INIT_LIST_HEAD(&stt_percpu->frag_lru);
		get_random_bytes(&per_cpu(pkt_seq_counter, i), sizeof(u32));

		frag_hash = kvmalloc_array(sizeof(struct pkt_frag),
					   FRAG_HASH_ENTRIES,
					   GFP_KERNEL | __GFP_ZERO);
		if (!frag_hash) {
			err = -ENOMEM;
			goto free_percpu;
		}
		stt_percpu->frag_hash = frag_hash;
	}
	schedule_clean_percpu();
	n_tunnels++;

out:
	if (sn->n_tunnels) {
		sn->n_tunnels++;
		return 0;
	}
#ifdef HAVE_NF_REGISTER_NET_HOOK
	/* On kernels which support per-net nf-hooks, nf_register_hook() takes
	 * the rtnl-lock, which results in a dead lock in stt-dev-create.
	 * Therefore use the per-net registration API instead.
	 */
	if (sn->nf_hook_reg_done)
		goto done;

	err = nf_register_net_hook(net, &nf_hook_ops);
	if (!err)
		sn->nf_hook_reg_done = true;
#else
	/* Register STT only on very first STT device addition. */
	if (!list_empty(&nf_hook_ops.list))
		goto done;

	err = nf_register_hook(&nf_hook_ops);
#endif
	if (err)
		goto error;
done:
	sn->n_tunnels++;
	return 0;

free_percpu:
	for_each_possible_cpu(i) {
		struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);

		if (stt_percpu->frag_hash)
			kvfree(stt_percpu->frag_hash);
	}
	free_percpu(stt_percpu_data);

error:
	return err;
}

static void stt_cleanup(struct net *net)
{
	struct stt_net *sn = net_generic(net, stt_net_id);
	int i;

	sn->n_tunnels--;
	n_tunnels--;
	if (n_tunnels)
		return;

	cancel_delayed_work_sync(&clean_percpu_wq);
	for_each_possible_cpu(i) {
		struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
		int j;

		for (j = 0; j < FRAG_HASH_ENTRIES; j++) {
			struct pkt_frag *frag;

			frag = &stt_percpu->frag_hash[j];
			kfree_skb_list(frag->skbs);
		}

		kvfree(stt_percpu->frag_hash);
	}

	free_percpu(stt_percpu_data);
}
static netdev_tx_t stt_dev_xmit(struct sk_buff *skb, struct net_device *dev)
{
#ifdef USE_UPSTREAM_TUNNEL
	return ovs_stt_xmit(skb);
#else
	/* Drop all packets coming from the networking stack; the OVS-CB is
	 * not initialized for these packets.
	 */
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
#endif
}

/* Setup stats when device is created */
static int stt_init(struct net_device *dev)
{
	dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static void stt_uninit(struct net_device *dev)
{
	free_percpu(dev->tstats);
}

static int stt_open(struct net_device *dev)
{
	struct stt_dev *stt = netdev_priv(dev);
	struct net *net = stt->net;
	struct stt_net *sn = net_generic(net, stt_net_id);
	int err;

	err = stt_start(net);
	if (err)
		return err;

	err = tcp_sock_create4(net, stt->dst_port, &stt->sock);
	if (err)
		return err;

	list_add_rcu(&stt->up_next, &sn->stt_up_list);
	return 0;
}

static int stt_stop(struct net_device *dev)
{
	struct stt_dev *stt_dev = netdev_priv(dev);
	struct net *net = stt_dev->net;

	list_del_rcu(&stt_dev->up_next);
	synchronize_net();
	tcp_sock_release(stt_dev->sock);
	stt_dev->sock = NULL;
	stt_cleanup(net);
	return 0;
}
static int __stt_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	int max_mtu = IP_MAX_MTU - STT_HEADER_LEN - sizeof(struct iphdr)
		      - dev->hard_header_len;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}

static int stt_change_mtu(struct net_device *dev, int new_mtu)
{
	return __stt_change_mtu(dev, new_mtu, true);
}

int ovs_stt_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	struct stt_dev *stt_dev = netdev_priv(dev);
	struct net *net = stt_dev->net;
	__be16 dport = stt_dev->dst_port;
	__be16 sport;
	struct flowi4 fl4;
	struct rtable *rt;

	if (ip_tunnel_info_af(info) != AF_INET)
		return -EINVAL;

	sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
	rt = stt_get_rt(skb, dev, &fl4, &info->key, dport, sport);
	if (IS_ERR(rt))
		return PTR_ERR(rt);
	ip_rt_put(rt);

	info->key.u.ipv4.src = fl4.saddr;
	info->key.tp_src = sport;
	info->key.tp_dst = dport;
	return 0;
}
EXPORT_SYMBOL_GPL(ovs_stt_fill_metadata_dst);
static const struct net_device_ops stt_netdev_ops = {
	.ndo_init               = stt_init,
	.ndo_uninit             = stt_uninit,
	.ndo_open               = stt_open,
	.ndo_stop               = stt_stop,
	.ndo_start_xmit         = stt_dev_xmit,
	.ndo_get_stats64        = ip_tunnel_get_stats64,
#ifdef HAVE_RHEL7_MAX_MTU
	.ndo_size               = sizeof(struct net_device_ops),
	.extended.ndo_change_mtu = stt_change_mtu,
#else
	.ndo_change_mtu         = stt_change_mtu,
#endif
	.ndo_validate_addr      = eth_validate_addr,
	.ndo_set_mac_address    = eth_mac_addr,
#ifdef USE_UPSTREAM_TUNNEL
#ifdef HAVE_NDO_FILL_METADATA_DST
	.ndo_fill_metadata_dst  = stt_fill_metadata_dst,
#endif
#endif
};

static void stt_get_drvinfo(struct net_device *dev,
			    struct ethtool_drvinfo *drvinfo)
{
	strlcpy(drvinfo->version, STT_NETDEV_VER, sizeof(drvinfo->version));
	strlcpy(drvinfo->driver, "stt", sizeof(drvinfo->driver));
}

static const struct ethtool_ops stt_ethtool_ops = {
	.get_drvinfo    = stt_get_drvinfo,
	.get_link       = ethtool_op_get_link,
};

/* Info for udev, that this is a virtual tunnel endpoint */
static struct device_type stt_type = {
	.name = "stt",
};

/* Initialize the device structure. */
static void stt_setup(struct net_device *dev)
{
	ether_setup(dev);

	dev->netdev_ops = &stt_netdev_ops;
	dev->ethtool_ops = &stt_ethtool_ops;
#ifndef HAVE_NEEDS_FREE_NETDEV
	dev->destructor = free_netdev;
#else
	dev->needs_free_netdev = true;
#endif

	SET_NETDEV_DEVTYPE(dev, &stt_type);

	dev->features    |= NETIF_F_LLTX | NETIF_F_NETNS_LOCAL;
	dev->features    |= NETIF_F_SG | NETIF_F_HW_CSUM;
	dev->features    |= NETIF_F_RXCSUM;
	dev->features    |= NETIF_F_GSO_SOFTWARE;

	dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
	dev->hw_features |= NETIF_F_GSO_SOFTWARE;

#ifdef USE_UPSTREAM_TUNNEL
	netif_keep_dst(dev);
#endif
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
	eth_hw_addr_random(dev);
}
static const struct nla_policy stt_policy[IFLA_STT_MAX + 1] = {
	[IFLA_STT_PORT] = { .type = NLA_U16 },
};

#ifdef HAVE_RTNLOP_VALIDATE_WITH_EXTACK
static int stt_validate(struct nlattr *tb[], struct nlattr *data[],
			struct netlink_ext_ack __always_unused *extack)
#else
static int stt_validate(struct nlattr *tb[], struct nlattr *data[])
#endif
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;

		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	return 0;
}

static struct stt_dev *find_dev(struct net *net, __be16 dst_port)
{
	struct stt_net *sn = net_generic(net, stt_net_id);
	struct stt_dev *dev;

	list_for_each_entry(dev, &sn->stt_list, next) {
		if (dev->dst_port == dst_port)
			return dev;
	}
	return NULL;
}

static int stt_configure(struct net *net, struct net_device *dev,
			 __be16 dst_port)
{
	struct stt_net *sn = net_generic(net, stt_net_id);
	struct stt_dev *stt = netdev_priv(dev);
	int err;

	stt->net = net;
	stt->dev = dev;
	stt->dst_port = dst_port;

	if (find_dev(net, dst_port))
		return -EBUSY;

	err = __stt_change_mtu(dev, IP_MAX_MTU, false);
	if (err)
		return err;

	err = register_netdevice(dev);
	if (err)
		return err;

	list_add(&stt->next, &sn->stt_list);
	return 0;
}

#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS
static int stt_newlink(struct net *net, struct net_device *dev,
		       struct nlattr *tb[], struct nlattr *data[],
		       struct netlink_ext_ack __always_unused *extack)
#else
static int stt_newlink(struct net *net, struct net_device *dev,
		       struct nlattr *tb[], struct nlattr *data[])
#endif
{
	__be16 dst_port = htons(STT_DST_PORT);

	if (data[IFLA_STT_PORT])
		dst_port = nla_get_be16(data[IFLA_STT_PORT]);

	return stt_configure(net, dev, dst_port);
}

static void stt_dellink(struct net_device *dev, struct list_head *head)
{
	struct stt_dev *stt = netdev_priv(dev);

	list_del(&stt->next);
	unregister_netdevice_queue(dev, head);
}

static size_t stt_get_size(const struct net_device *dev)
{
	return nla_total_size(sizeof(__be32));	/* IFLA_STT_PORT */
}

static int stt_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct stt_dev *stt = netdev_priv(dev);

	if (nla_put_be16(skb, IFLA_STT_PORT, stt->dst_port))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static struct rtnl_link_ops stt_link_ops __read_mostly = {
	.kind		= "stt",
	.maxtype	= IFLA_STT_MAX,
	.policy		= stt_policy,
	.priv_size	= sizeof(struct stt_dev),
	.setup		= stt_setup,
	.validate	= stt_validate,
	.newlink	= stt_newlink,
	.dellink	= stt_dellink,
	.get_size	= stt_get_size,
	.fill_info	= stt_fill_info,
};
struct net_device *ovs_stt_dev_create_fb(struct net *net, const char *name,
					 u8 name_assign_type, u16 dst_port)
{
	struct nlattr *tb[IFLA_MAX + 1];
	struct net_device *dev;
	int err;

	memset(tb, 0, sizeof(tb));
	dev = rtnl_create_link(net, (char *) name, name_assign_type,
			       &stt_link_ops, tb);
	if (IS_ERR(dev))
		return dev;

	err = stt_configure(net, dev, htons(dst_port));
	if (err) {
		free_netdev(dev);
		return ERR_PTR(err);
	}
	return dev;
}
EXPORT_SYMBOL_GPL(ovs_stt_dev_create_fb);
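/* Usage note (added): ovs_stt_dev_create_fb() is the programmatic counterpart
 * of creating a link of kind "stt" over rtnetlink; the OVS datapath uses it
 * to create a backing device on demand.  It builds the link with an empty
 * attribute table and then binds it to the requested TCP port via
 * stt_configure(), so a failure after rtnl_create_link() must free the
 * half-constructed netdev, as done above.
 */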
static int stt_init_net(struct net *net)
{
	struct stt_net *sn = net_generic(net, stt_net_id);

	INIT_LIST_HEAD(&sn->stt_list);
	INIT_LIST_HEAD(&sn->stt_up_list);
#ifdef HAVE_NF_REGISTER_NET_HOOK
	sn->nf_hook_reg_done = false;
#endif
	return 0;
}

static void stt_exit_net(struct net *net)
{
	struct stt_net *sn = net_generic(net, stt_net_id);
	struct stt_dev *stt, *next;
	struct net_device *dev, *aux;
	LIST_HEAD(list);

#ifdef HAVE_NF_REGISTER_NET_HOOK
	/* Ideally this should be done from stt_stop(), but on some kernels
	 * the nf-unreg operation needs the RTNL lock, which can cause a
	 * deadlock, so it is done from here instead. */
	if (sn->nf_hook_reg_done)
		nf_unregister_net_hook(net, &nf_hook_ops);
#endif

	rtnl_lock();

	/* gather any stt devices that were moved into this ns */
	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == &stt_link_ops)
			unregister_netdevice_queue(dev, &list);

	list_for_each_entry_safe(stt, next, &sn->stt_list, next) {
		/* If stt->dev is in the same netns, it was already added
		 * to the list by the previous loop.
		 */
		if (!net_eq(dev_net(stt->dev), net))
			unregister_netdevice_queue(stt->dev, &list);
	}

	/* unregister the devices gathered above */
	unregister_netdevice_many(&list);
	rtnl_unlock();
}

static struct pernet_operations stt_net_ops = {
	.init = stt_init_net,
	.exit = stt_exit_net,
	.id   = &stt_net_id,
	.size = sizeof(struct stt_net),
};

int stt_init_module(void)
{
	int rc;

	rc = register_pernet_subsys(&stt_net_ops);
	if (rc)
		goto out1;

	rc = rtnl_link_register(&stt_link_ops);
	if (rc)
		goto out2;

#ifdef HAVE_LIST_IN_NF_HOOK_OPS
	INIT_LIST_HEAD(&nf_hook_ops.list);
#endif
	pr_info("STT tunneling driver\n");
	return 0;

out2:
	unregister_pernet_subsys(&stt_net_ops);
out1:
	pr_err("Error while initializing STT %d\n", rc);
	return rc;
}

void stt_cleanup_module(void)
{
#ifndef HAVE_NF_REGISTER_NET_HOOK
	if (!list_empty(&nf_hook_ops.list))
		nf_unregister_hook(&nf_hook_ops);
#endif
	rtnl_link_unregister(&stt_link_ops);
	unregister_pernet_subsys(&stt_net_ops);
}