]> git.proxmox.com Git - mirror_ovs.git/blob - datapath/linux/compat/stt.c
datapath: Add support for kernel 4.4
[mirror_ovs.git] / datapath / linux / compat / stt.c
1 /*
2 * Stateless TCP Tunnel (STT) vport.
3 *
4 * Copyright (c) 2015 Nicira, Inc.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13 #include <asm/unaligned.h>
14
15 #include <linux/delay.h>
16 #include <linux/flex_array.h>
17 #include <linux/if.h>
18 #include <linux/if_vlan.h>
19 #include <linux/ip.h>
20 #include <linux/ipv6.h>
21 #include <linux/jhash.h>
22 #include <linux/list.h>
23 #include <linux/log2.h>
24 #include <linux/module.h>
25 #include <linux/net.h>
26 #include <linux/netfilter.h>
27 #include <linux/percpu.h>
28 #include <linux/skbuff.h>
29 #include <linux/tcp.h>
30 #include <linux/workqueue.h>
31
32 #include <net/dst_metadata.h>
33 #include <net/icmp.h>
34 #include <net/inet_ecn.h>
35 #include <net/ip.h>
36 #include <net/ip_tunnels.h>
37 #include <net/ip6_checksum.h>
38 #include <net/net_namespace.h>
39 #include <net/netns/generic.h>
40 #include <net/sock.h>
41 #include <net/stt.h>
42 #include <net/tcp.h>
43 #include <net/udp.h>
44
45 #include "gso.h"
46 #include "compat.h"
47
/* Version string of this STT netdev implementation. */
48 #define STT_NETDEV_VER "0.1"
/* TCP port number for STT; presumably the default destination port of a
 * tunnel -- confirm against the users of this macro.
 */
49 #define STT_DST_PORT 7471
50
51 #ifdef OVS_STT
52 #ifdef CONFIG_SLUB
53 /*
 * Measurements showed better performance when the zero-copy path is
 * skipped on kernels built with the SLUB allocator, so the copying code
 * paths (selected by SKIP_ZERO_COPY) are used whenever CONFIG_SLUB is set.
 */
57 #define SKIP_ZERO_COPY
58 #endif
59
/* STT version number; presumably the value carried in the STT header --
 * confirm against the receive path.
 */
60 #define STT_VER 0
61
62 /* State kept for one STT device.
 *
 * @dev: net_device associated with this state (presumably the device
 *	whose private area holds this struct -- see netdev_priv() use in
 *	ovs_stt_xmit()).
 * @net: network namespace read on the transmit path.
 * @next: list linkage; presumably on the per-net list of STT ports
 *	(stt_net.stt_list) -- confirm against the registration code.
 * @up_next: linkage on the per-net list of devices that are up
 *	(stt_net.stt_up_list), walked by stt_find_up_dev().
 * @sock: Fake TCP socket for the STT port.
 * @dst_port: TCP port of this device in network byte order; matched by
 *	stt_find_up_dev() and used as the destination port on transmit.
 */
69 struct stt_dev {
70 struct net_device *dev;
71 struct net *net;
72 struct list_head next;
73 struct list_head up_next;
74 struct socket *sock;
75 __be16 dst_port;
76 };
77
/* Bits of the STT header 'flags' field as set by __push_stt_header():
 * CSUM_VERIFIED is set for CHECKSUM_UNNECESSARY packets, CSUM_PARTIAL for
 * CHECKSUM_PARTIAL packets, and the PROTO bits describe the inner L3/L4
 * protocol (IPv4 vs. not, TCP vs. not) of a CSUM_PARTIAL packet.
 */
78 #define STT_CSUM_VERIFIED BIT(0)
79 #define STT_CSUM_PARTIAL BIT(1)
80 #define STT_PROTO_IPV4 BIT(2)
81 #define STT_PROTO_TCP BIT(3)
82 #define STT_PROTO_TYPES (STT_PROTO_IPV4 | STT_PROTO_TCP)
83
/* GSO types that stt_can_offload() accepts for a CHECKSUM_PARTIAL packet;
 * any other gso_type bit makes it fall back to software handling.
 */
84 #define SUPPORTED_GSO_TYPES (SKB_GSO_TCPV4 | SKB_GSO_UDP | SKB_GSO_DODGY | \
85 SKB_GSO_TCPV6)
86
87 /* The length and offset of a fragment are encoded in the sequence number.
88 * STT_SEQ_LEN_SHIFT is the left shift needed to store the length.
89 * STT_SEQ_OFFSET_MASK is the mask to extract the offset.
90 */
91 #define STT_SEQ_LEN_SHIFT 16
92 #define STT_SEQ_OFFSET_MASK (BIT(STT_SEQ_LEN_SHIFT) - 1)
93
94 /* The maximum amount of memory used to store packets waiting to be reassembled
95 * on a given CPU. Once this threshold is exceeded we will begin freeing the
96 * least recently used fragments.
97 */
98 #define REASM_HI_THRESH (4 * 1024 * 1024)
99 /* The target for the high memory evictor. Once we have exceeded
100 * REASM_HI_THRESH, we will continue freeing fragments until we hit
101 * this limit.
102 */
103 #define REASM_LO_THRESH (3 * 1024 * 1024)
104 /* The length of time a given packet has to be reassembled from the time the
105 * first fragment arrives. Once this limit is exceeded it becomes available
106 * for cleaning.
107 */
108 #define FRAG_EXP_TIME (30 * HZ)
109 /* Number of hash entries. Each entry has only a single slot to hold a packet
110 * so if there are collisions, we will drop packets. This is allocated
111 * per-cpu and each entry consists of struct pkt_frag.
112 */
113 #define FRAG_HASH_SHIFT 8
114 #define FRAG_HASH_ENTRIES BIT(FRAG_HASH_SHIFT)
/* Number of FRAG_HASH_SHIFT-bit chunks in a 32-bit hash value; lookup_frag()
 * probes one table slot per chunk.
 */
115 #define FRAG_HASH_SEGS ((sizeof(u32) * 8) / FRAG_HASH_SHIFT)
116
/* Interval in jiffies; presumably the period of the clean_percpu delayed
 * work -- confirm where the work is scheduled.
 */
117 #define CLEAN_PERCPU_INTERVAL (30 * HZ)
118
/* Key identifying the fragments that belong to one STT frame during
 * reassembly. reassemble() fills it from the outer IP source/destination
 * addresses, the outer TCP ack_seq field and the skb mark.
 */
119 struct pkt_key {
120 __be32 saddr;
121 __be32 daddr;
122 __be32 pkt_seq;
123 u32 mark;
124 };
125
/* One slot of the per-cpu fragment hash table.
 *
 * @skbs: chain (linked through ->next) of the fragments collected so far;
 *	NULL when the slot is free.
 * @timestamp: jiffies value recorded when the first fragment was stored.
 * @lru_node: linkage on stt_percpu.frag_lru while the slot is in use.
 * @key: key of the frame being reassembled in this slot.
 */
126 struct pkt_frag {
127 struct sk_buff *skbs;
128 unsigned long timestamp;
129 struct list_head lru_node;
130 struct pkt_key key;
131 };
132
/* Per-cpu reassembly state.
 *
 * @frag_hash: flex_array of struct pkt_frag slots, indexed by hash bits.
 * @frag_lru: list of in-use slots; new entries are appended at the tail and
 *	evict_frags() frees from the head.
 * @frag_mem_used: memory currently accounted to queued fragments.
 */
133 struct stt_percpu {
134 struct flex_array *frag_hash;
135 struct list_head frag_lru;
136 unsigned int frag_mem_used;
137
138 /* Protect frags table. */
139 spinlock_t lock;
140 };
141
/* Per-frame bookkeeping stored in the control block of the first skb of a
 * fragment chain.
 *
 * @last_skb: last skb currently in the chain.
 * @mem_used: memory accounted for the chain; subtracted from
 *	stt_percpu.frag_mem_used by free_frag().
 * @tot_len: total frame length taken from the upper bits of the TCP
 *	sequence number.
 * @rcvd_len: number of payload bytes received so far.
 * @set_ecn_ce: true once any fragment arrived with the ECN CE mark.
 */
142 struct first_frag {
143 struct sk_buff *last_skb;
144 unsigned int mem_used;
145 u16 tot_len;
146 u16 rcvd_len;
147 bool set_ecn_ce;
148 };
149
/* Layout of skb->cb while an skb sits in the reassembly table.
 *
 * @offset: offset of this fragment within the frame, taken from the low
 *	bits of the TCP sequence number.
 */
150 struct frag_skb_cb {
151 u16 offset;
152
153 /* Only valid for the first skb in the chain. */
154 struct first_frag first;
155 };
156
/* Access the reassembly control block of @skb. */
157 #define FRAG_CB(skb) ((struct frag_skb_cb *)(skb)->cb)
158
159 /* per-network namespace private data for this module
 *
 * @stt_list: presumably all STT devices of the namespace -- confirm against
 *	the registration code.
 * @n_tunnels: per-namespace tunnel count (maintained outside this view).
 * @nf_hook_reg_done: only present with HAVE_NF_REGISTER_NET_HOOK; presumably
 *	records that the netfilter hook was registered for this namespace.
 */
160 struct stt_net {
161 struct list_head stt_list;
162 struct list_head stt_up_list; /* Devices which are in IFF_UP state. */
163 int n_tunnels;
164 #ifdef HAVE_NF_REGISTER_NET_HOOK
165 bool nf_hook_reg_done;
166 #endif
167 };
168
/* Id passed to net_generic() to obtain this module's struct stt_net. */
169 static int stt_net_id;
170
/* Per-cpu reassembly tables and the seed mixed into pkt_key_hash(). */
171 static struct stt_percpu __percpu *stt_percpu_data __read_mostly;
172 static u32 frag_hash_seed __read_mostly;
173
174 /* Protects sock-hash and refcounts. */
175 static DEFINE_MUTEX(stt_mutex);
176
/* Module-wide tunnel count (maintained outside this view). */
177 static int n_tunnels;
/* Per-cpu counter consumed by ack_seq() to number transmitted frames. */
178 static DEFINE_PER_CPU(u32, pkt_seq_counter);
179
/* Delayed work running clean_percpu(); defined later in the file. */
180 static void clean_percpu(struct work_struct *work);
181 static DECLARE_DELAYED_WORK(clean_percpu_wq, clean_percpu);
182
183 static struct stt_dev *stt_find_up_dev(struct net *net, __be16 port)
184 {
185 struct stt_net *sn = net_generic(net, stt_net_id);
186 struct stt_dev *stt_dev;
187
188 list_for_each_entry_rcu(stt_dev, &sn->stt_up_list, up_next) {
189 if (stt_dev->dst_port == port)
190 return stt_dev;
191 }
192 return NULL;
193 }
194
195 static __be32 ack_seq(void)
196 {
197 #if NR_CPUS <= 65536
198 u32 pkt_seq, ack;
199
200 pkt_seq = this_cpu_read(pkt_seq_counter);
201 ack = pkt_seq << ilog2(NR_CPUS) | smp_processor_id();
202 this_cpu_inc(pkt_seq_counter);
203
204 return (__force __be32)ack;
205 #else
206 #error "Support for greater than 64k CPUs not implemented"
207 #endif
208 }
209
210 static int clear_gso(struct sk_buff *skb)
211 {
212 struct skb_shared_info *shinfo = skb_shinfo(skb);
213 int err;
214
215 if (shinfo->gso_type == 0 && shinfo->gso_size == 0 &&
216 shinfo->gso_segs == 0)
217 return 0;
218
219 err = skb_unclone(skb, GFP_ATOMIC);
220 if (unlikely(err))
221 return err;
222
223 shinfo = skb_shinfo(skb);
224 shinfo->gso_type = 0;
225 shinfo->gso_size = 0;
226 shinfo->gso_segs = 0;
227 return 0;
228 }
229
230 static void copy_skb_metadata(struct sk_buff *to, struct sk_buff *from)
231 {
232 to->protocol = from->protocol;
233 to->tstamp = from->tstamp;
234 to->priority = from->priority;
235 to->mark = from->mark;
236 to->vlan_tci = from->vlan_tci;
237 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
238 to->vlan_proto = from->vlan_proto;
239 #endif
240 skb_copy_secmark(to, from);
241 }
242
243 static void update_headers(struct sk_buff *skb, bool head,
244 unsigned int l4_offset, unsigned int hdr_len,
245 bool ipv4, u32 tcp_seq)
246 {
247 u16 old_len, new_len;
248 __be32 delta;
249 struct tcphdr *tcph;
250 int gso_size;
251
252 if (ipv4) {
253 struct iphdr *iph = (struct iphdr *)(skb->data + ETH_HLEN);
254
255 old_len = ntohs(iph->tot_len);
256 new_len = skb->len - ETH_HLEN;
257 iph->tot_len = htons(new_len);
258
259 ip_send_check(iph);
260 } else {
261 struct ipv6hdr *ip6h = (struct ipv6hdr *)(skb->data + ETH_HLEN);
262
263 old_len = ntohs(ip6h->payload_len);
264 new_len = skb->len - ETH_HLEN - sizeof(struct ipv6hdr);
265 ip6h->payload_len = htons(new_len);
266 }
267
268 tcph = (struct tcphdr *)(skb->data + l4_offset);
269 if (!head) {
270 tcph->seq = htonl(tcp_seq);
271 tcph->cwr = 0;
272 }
273
274 if (skb->next) {
275 tcph->fin = 0;
276 tcph->psh = 0;
277 }
278
279 delta = htonl(~old_len + new_len);
280 tcph->check = ~csum_fold((__force __wsum)((__force u32)tcph->check +
281 (__force u32)delta));
282
283 gso_size = skb_shinfo(skb)->gso_size;
284 if (gso_size && skb->len - hdr_len <= gso_size)
285 BUG_ON(clear_gso(skb));
286 }
287
288 static bool can_segment(struct sk_buff *head, bool ipv4, bool tcp, bool csum_partial)
289 {
290 /* If no offloading is in use then we don't have enough information
291 * to process the headers.
292 */
293 if (!csum_partial)
294 goto linearize;
295
296 /* Handling UDP packets requires IP fragmentation, which means that
297 * the L4 checksum can no longer be calculated by hardware (since the
298 * fragments are in different packets. If we have to compute the
299 * checksum it's faster just to linearize and large UDP packets are
300 * pretty uncommon anyways, so it's not worth dealing with for now.
301 */
302 if (!tcp)
303 goto linearize;
304
305 if (ipv4) {
306 struct iphdr *iph = (struct iphdr *)(head->data + ETH_HLEN);
307
308 /* It's difficult to get the IP IDs exactly right here due to
309 * varying segment sizes and potentially multiple layers of
310 * segmentation. IP ID isn't important when DF is set and DF
311 * is generally set for TCP packets, so just linearize if it's
312 * not.
313 */
314 if (!(iph->frag_off & htons(IP_DF)))
315 goto linearize;
316 } else {
317 struct ipv6hdr *ip6h = (struct ipv6hdr *)(head->data + ETH_HLEN);
318
319 /* Jumbograms require more processing to update and we'll
320 * probably never see them, so just linearize.
321 */
322 if (ip6h->payload_len == 0)
323 goto linearize;
324 }
325 return true;
326
327 linearize:
328 return false;
329 }
330
331 static int copy_headers(struct sk_buff *head, struct sk_buff *frag,
332 int hdr_len)
333 {
334 u16 csum_start;
335
336 if (skb_cloned(frag) || skb_headroom(frag) < hdr_len) {
337 int extra_head = hdr_len - skb_headroom(frag);
338
339 extra_head = extra_head > 0 ? extra_head : 0;
340 if (unlikely(pskb_expand_head(frag, extra_head, 0,
341 GFP_ATOMIC)))
342 return -ENOMEM;
343 }
344
345 memcpy(__skb_push(frag, hdr_len), head->data, hdr_len);
346
347 csum_start = head->csum_start - skb_headroom(head);
348 frag->csum_start = skb_headroom(frag) + csum_start;
349 frag->csum_offset = head->csum_offset;
350 frag->ip_summed = head->ip_summed;
351
352 skb_shinfo(frag)->gso_size = skb_shinfo(head)->gso_size;
353 skb_shinfo(frag)->gso_type = skb_shinfo(head)->gso_type;
354 skb_shinfo(frag)->gso_segs = 0;
355
356 copy_skb_metadata(frag, head);
357 return 0;
358 }
359
360 static int skb_list_segment(struct sk_buff *head, bool ipv4, int l4_offset)
361 {
362 struct sk_buff *skb;
363 struct tcphdr *tcph;
364 int seg_len;
365 int hdr_len;
366 int tcp_len;
367 u32 seq;
368
369 if (unlikely(!pskb_may_pull(head, l4_offset + sizeof(*tcph))))
370 return -ENOMEM;
371
372 tcph = (struct tcphdr *)(head->data + l4_offset);
373 tcp_len = tcph->doff * 4;
374 hdr_len = l4_offset + tcp_len;
375
376 if (unlikely((tcp_len < sizeof(struct tcphdr)) ||
377 (head->len < hdr_len)))
378 return -EINVAL;
379
380 if (unlikely(!pskb_may_pull(head, hdr_len)))
381 return -ENOMEM;
382
383 tcph = (struct tcphdr *)(head->data + l4_offset);
384 /* Update header of each segment. */
385 seq = ntohl(tcph->seq);
386 seg_len = skb_pagelen(head) - hdr_len;
387
388 skb = skb_shinfo(head)->frag_list;
389 skb_shinfo(head)->frag_list = NULL;
390 head->next = skb;
391 for (; skb; skb = skb->next) {
392 int err;
393
394 head->len -= skb->len;
395 head->data_len -= skb->len;
396 head->truesize -= skb->truesize;
397
398 seq += seg_len;
399 seg_len = skb->len;
400 err = copy_headers(head, skb, hdr_len);
401 if (err)
402 return err;
403 update_headers(skb, false, l4_offset, hdr_len, ipv4, seq);
404 }
405 update_headers(head, true, l4_offset, hdr_len, ipv4, 0);
406 return 0;
407 }
408
409 #ifndef SKIP_ZERO_COPY
/* Flatten the skb list starting at *@skbp so that no member carries a
 * frag_list any more: the members of every frag_list are spliced into the
 * ->next chain directly behind their owner.
 *
 * @head: skb whose frag_list is being walked, or NULL at the top level.
 *	When set, the length and truesize of every visited skb are
 *	subtracted from it.
 * @skbp: location of the pointer to the first skb; updated when a shared
 *	skb has to be replaced by a clone.
 *
 * Returns the last skb of the flattened list, or an ERR_PTR() when cloning
 * or unsharing fails.
 */
410 static struct sk_buff *normalize_frag_list(struct sk_buff *head,
411 struct sk_buff **skbp)
412 {
413 struct sk_buff *skb = *skbp;
414 struct sk_buff *last;
415
416 do {
417 struct sk_buff *frags;
418
/* A shared skb is replaced by a private clone, which takes over its
 * position in the list; the reference to the original is dropped.
 */
419 if (skb_shared(skb)) {
420 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
421
422 if (unlikely(!nskb))
423 return ERR_PTR(-ENOMEM);
424
425 nskb->next = skb->next;
426 consume_skb(skb);
427 skb = nskb;
428 *skbp = skb;
429 }
430
/* This skb no longer counts towards the owner of the frag_list. */
431 if (head) {
432 head->len -= skb->len;
433 head->data_len -= skb->len;
434 head->truesize -= skb->truesize;
435 }
436
437 frags = skb_shinfo(skb)->frag_list;
438 if (frags) {
439 int err;
440
441 err = skb_unclone(skb, GFP_ATOMIC);
442 if (unlikely(err))
443 return ERR_PTR(err);
444
/* Flatten the nested list first, then splice it in between this skb
 * and its current successor.
 */
445 last = normalize_frag_list(skb, &frags);
446 if (IS_ERR(last))
447 return last;
448
449 skb_shinfo(skb)->frag_list = NULL;
450 last->next = skb->next;
451 skb->next = frags;
452 } else {
453 last = skb;
454 }
455
/* From here on the pointer to fix up on replacement is this skb's ->next. */
456 skbp = &skb->next;
457 } while ((skb = skb->next));
458
459 return last;
460 }
461
462 /* Takes a linked list of skbs, which potentially contain frag_list
463 * (whose members in turn potentially contain frag_lists, etc.) and
464 * converts them into a single linear linked list.
465 */
466 static int straighten_frag_list(struct sk_buff **skbp)
467 {
468 struct sk_buff *err_skb;
469
470 err_skb = normalize_frag_list(NULL, skbp);
471 if (IS_ERR(err_skb))
472 return PTR_ERR(err_skb);
473
474 return 0;
475 }
476
/* Zero-copy variant: merge the skb list at *@headp into as few skbs as
 * possible and hang whatever remains off the frag_list of the first skb.
 *
 * The list is flattened first, then each skb is merged into its
 * predecessor with skb_try_coalesce() where that is possible.
 *
 * Returns 0 on success or a negative errno; *@headp may be updated.
 */
477 static int coalesce_skb(struct sk_buff **headp)
478 {
479 struct sk_buff *frag, *head, *prev;
480 int err;
481
482 err = straighten_frag_list(headp);
483 if (unlikely(err))
484 return err;
485 head = *headp;
486
487 /* Coalesce frag list. */
488 prev = head;
489 for (frag = head->next; frag; frag = frag->next) {
490 bool headstolen;
491 int delta;
492
493 if (unlikely(skb_unclone(prev, GFP_ATOMIC)))
494 return -ENOMEM;
495
/* Not mergeable: keep this skb and try to merge later ones into it. */
496 if (!skb_try_coalesce(prev, frag, &headstolen, &delta)) {
497 prev = frag;
498 continue;
499 }
500
/* Merged: unlink the now empty skb and release it; the loop step then
 * continues with the successor of prev.
 */
501 prev->next = frag->next;
502 frag->len = 0;
503 frag->data_len = 0;
504 frag->truesize -= delta;
505 kfree_skb_partial(frag, headstolen);
506 frag = prev;
507 }
508
509 if (!head->next)
510 return 0;
511
/* Account the remaining skbs to head before moving them to its frag_list. */
512 for (frag = head->next; frag; frag = frag->next) {
513 head->len += frag->len;
514 head->data_len += frag->len;
515 head->truesize += frag->truesize;
516 }
517
518 skb_shinfo(head)->frag_list = head->next;
519 head->next = NULL;
520 return 0;
521 }
522 #else
523 static int coalesce_skb(struct sk_buff **headp)
524 {
525 struct sk_buff *frag, *head = *headp, *next;
526 int delta = FRAG_CB(head)->first.tot_len - skb_headlen(head);
527 int err;
528
529 if (unlikely(!head->next))
530 return 0;
531
532 err = pskb_expand_head(head, 0, delta, GFP_ATOMIC);
533 if (unlikely(err))
534 return err;
535
536 if (unlikely(!__pskb_pull_tail(head, head->data_len)))
537 BUG();
538
539 for (frag = head->next; frag; frag = next) {
540 skb_copy_bits(frag, 0, skb_put(head, frag->len), frag->len);
541 next = frag->next;
542 kfree_skb(frag);
543 }
544
545 head->next = NULL;
546 head->truesize = SKB_TRUESIZE(head->len);
547 return 0;
548 }
549 #endif
550
551 static int __try_to_segment(struct sk_buff *skb, bool csum_partial,
552 bool ipv4, bool tcp, int l4_offset)
553 {
554 if (can_segment(skb, ipv4, tcp, csum_partial))
555 return skb_list_segment(skb, ipv4, l4_offset);
556 else
557 return skb_linearize(skb);
558 }
559
560 static int try_to_segment(struct sk_buff *skb)
561 {
562 #ifdef SKIP_ZERO_COPY
563 /* coalesce_skb() since does not generate frag-list no need to
564 * linearize it here.
565 */
566 return 0;
567 #else
568 struct stthdr *stth = stt_hdr(skb);
569 bool csum_partial = !!(stth->flags & STT_CSUM_PARTIAL);
570 bool ipv4 = !!(stth->flags & STT_PROTO_IPV4);
571 bool tcp = !!(stth->flags & STT_PROTO_TCP);
572 int l4_offset = stth->l4_offset;
573
574 return __try_to_segment(skb, csum_partial, ipv4, tcp, l4_offset);
575 #endif
576 }
577
578 static int segment_skb(struct sk_buff **headp, bool csum_partial,
579 bool ipv4, bool tcp, int l4_offset)
580 {
581 #ifndef SKIP_ZERO_COPY
582 int err;
583
584 err = coalesce_skb(headp);
585 if (err)
586 return err;
587 #endif
588
589 if (skb_shinfo(*headp)->frag_list)
590 return __try_to_segment(*headp, csum_partial,
591 ipv4, tcp, l4_offset);
592 return 0;
593 }
594
595 static int __push_stt_header(struct sk_buff *skb, __be64 tun_id,
596 __be16 s_port, __be16 d_port,
597 __be32 saddr, __be32 dst,
598 __be16 l3_proto, u8 l4_proto,
599 int dst_mtu)
600 {
601 int data_len = skb->len + sizeof(struct stthdr) + STT_ETH_PAD;
602 unsigned short encap_mss;
603 struct tcphdr *tcph;
604 struct stthdr *stth;
605
606 skb_push(skb, STT_HEADER_LEN);
607 skb_reset_transport_header(skb);
608 tcph = tcp_hdr(skb);
609 memset(tcph, 0, STT_HEADER_LEN);
610 stth = stt_hdr(skb);
611
612 if (skb->ip_summed == CHECKSUM_PARTIAL) {
613 stth->flags |= STT_CSUM_PARTIAL;
614
615 stth->l4_offset = skb->csum_start -
616 (skb_headroom(skb) +
617 STT_HEADER_LEN);
618
619 if (l3_proto == htons(ETH_P_IP))
620 stth->flags |= STT_PROTO_IPV4;
621
622 if (l4_proto == IPPROTO_TCP)
623 stth->flags |= STT_PROTO_TCP;
624
625 stth->mss = htons(skb_shinfo(skb)->gso_size);
626 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
627 stth->flags |= STT_CSUM_VERIFIED;
628 }
629
630 stth->vlan_tci = htons(skb->vlan_tci);
631 skb->vlan_tci = 0;
632 put_unaligned(tun_id, &stth->key);
633
634 tcph->source = s_port;
635 tcph->dest = d_port;
636 tcph->doff = sizeof(struct tcphdr) / 4;
637 tcph->ack = 1;
638 tcph->psh = 1;
639 tcph->window = htons(USHRT_MAX);
640 tcph->seq = htonl(data_len << STT_SEQ_LEN_SHIFT);
641 tcph->ack_seq = ack_seq();
642 tcph->check = ~tcp_v4_check(skb->len, saddr, dst, 0);
643
644 skb->csum_start = skb_transport_header(skb) - skb->head;
645 skb->csum_offset = offsetof(struct tcphdr, check);
646 skb->ip_summed = CHECKSUM_PARTIAL;
647
648 encap_mss = dst_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
649 if (data_len > encap_mss) {
650 if (unlikely(skb_unclone(skb, GFP_ATOMIC)))
651 return -EINVAL;
652
653 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
654 skb_shinfo(skb)->gso_size = encap_mss;
655 skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(data_len, encap_mss);
656 } else {
657 if (unlikely(clear_gso(skb)))
658 return -EINVAL;
659 }
660 return 0;
661 }
662
663 static struct sk_buff *push_stt_header(struct sk_buff *head, __be64 tun_id,
664 __be16 s_port, __be16 d_port,
665 __be32 saddr, __be32 dst,
666 __be16 l3_proto, u8 l4_proto,
667 int dst_mtu)
668 {
669 struct sk_buff *skb;
670
671 if (skb_shinfo(head)->frag_list) {
672 bool ipv4 = (l3_proto == htons(ETH_P_IP));
673 bool tcp = (l4_proto == IPPROTO_TCP);
674 bool csum_partial = (head->ip_summed == CHECKSUM_PARTIAL);
675 int l4_offset = skb_transport_offset(head);
676
677 /* Need to call skb_orphan() to report currect true-size.
678 * calling skb_orphan() in this layer is odd but SKB with
679 * frag-list should not be associated with any socket, so
680 * skb-orphan should be no-op. */
681 skb_orphan(head);
682 if (unlikely(segment_skb(&head, csum_partial,
683 ipv4, tcp, l4_offset)))
684 goto error;
685 }
686
687 for (skb = head; skb; skb = skb->next) {
688 if (__push_stt_header(skb, tun_id, s_port, d_port, saddr, dst,
689 l3_proto, l4_proto, dst_mtu))
690 goto error;
691 }
692
693 return head;
694 error:
695 kfree_skb_list(head);
696 return NULL;
697 }
698
699 static int stt_can_offload(struct sk_buff *skb, __be16 l3_proto, u8 l4_proto)
700 {
701 if (skb_is_gso(skb) && skb->ip_summed != CHECKSUM_PARTIAL) {
702 int csum_offset;
703 __sum16 *csum;
704 int len;
705
706 if (l4_proto == IPPROTO_TCP)
707 csum_offset = offsetof(struct tcphdr, check);
708 else if (l4_proto == IPPROTO_UDP)
709 csum_offset = offsetof(struct udphdr, check);
710 else
711 return 0;
712
713 len = skb->len - skb_transport_offset(skb);
714 csum = (__sum16 *)(skb_transport_header(skb) + csum_offset);
715
716 if (unlikely(!pskb_may_pull(skb, skb_transport_offset(skb) +
717 csum_offset + sizeof(*csum))))
718 return -EINVAL;
719
720 if (l3_proto == htons(ETH_P_IP)) {
721 struct iphdr *iph = ip_hdr(skb);
722
723 *csum = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
724 len, l4_proto, 0);
725 } else if (l3_proto == htons(ETH_P_IPV6)) {
726 struct ipv6hdr *ip6h = ipv6_hdr(skb);
727
728 *csum = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
729 len, l4_proto, 0);
730 } else {
731 return 0;
732 }
733 skb->csum_start = skb_transport_header(skb) - skb->head;
734 skb->csum_offset = csum_offset;
735 skb->ip_summed = CHECKSUM_PARTIAL;
736 }
737
738 if (skb->ip_summed == CHECKSUM_PARTIAL) {
739 /* Assume receiver can only offload TCP/UDP over IPv4/6,
740 * and require 802.1Q VLANs to be accelerated.
741 */
742 if (l3_proto != htons(ETH_P_IP) &&
743 l3_proto != htons(ETH_P_IPV6))
744 return 0;
745
746 if (l4_proto != IPPROTO_TCP && l4_proto != IPPROTO_UDP)
747 return 0;
748
749 /* L4 offset must fit in a 1-byte field. */
750 if (skb->csum_start - skb_headroom(skb) > 255)
751 return 0;
752
753 if (skb_shinfo(skb)->gso_type & ~SUPPORTED_GSO_TYPES)
754 return 0;
755 }
756 /* Total size of encapsulated packet must fit in 16 bits. */
757 if (skb->len + STT_HEADER_LEN + sizeof(struct iphdr) > 65535)
758 return 0;
759
760 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
761 if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q))
762 return 0;
763 #endif
764 return 1;
765 }
766
767 static bool need_linearize(const struct sk_buff *skb)
768 {
769 struct skb_shared_info *shinfo = skb_shinfo(skb);
770 int i;
771
772 if (unlikely(shinfo->frag_list))
773 return true;
774
775 /* Generally speaking we should linearize if there are paged frags.
776 * However, if all of the refcounts are 1 we know nobody else can
777 * change them from underneath us and we can skip the linearization.
778 */
779 for (i = 0; i < shinfo->nr_frags; i++)
780 if (unlikely(page_count(skb_frag_page(&shinfo->frags[i])) > 1))
781 return true;
782
783 return false;
784 }
785
786 static struct sk_buff *handle_offloads(struct sk_buff *skb, int min_headroom)
787 {
788 int err;
789
790 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
791 if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q)) {
792
793 min_headroom += VLAN_HLEN;
794 if (skb_headroom(skb) < min_headroom) {
795 int head_delta = SKB_DATA_ALIGN(min_headroom -
796 skb_headroom(skb) + 16);
797
798 err = pskb_expand_head(skb, max_t(int, head_delta, 0),
799 0, GFP_ATOMIC);
800 if (unlikely(err))
801 goto error;
802 }
803
804 skb = __vlan_hwaccel_push_inside(skb);
805 if (!skb) {
806 err = -ENOMEM;
807 goto error;
808 }
809 }
810 #endif
811
812 if (skb_is_gso(skb)) {
813 struct sk_buff *nskb;
814 char cb[sizeof(skb->cb)];
815
816 memcpy(cb, skb->cb, sizeof(cb));
817
818 nskb = __skb_gso_segment(skb, 0, false);
819 if (IS_ERR(nskb)) {
820 err = PTR_ERR(nskb);
821 goto error;
822 }
823
824 consume_skb(skb);
825 skb = nskb;
826 while (nskb) {
827 memcpy(nskb->cb, cb, sizeof(cb));
828 nskb = nskb->next;
829 }
830 } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
831 /* Pages aren't locked and could change at any time.
832 * If this happens after we compute the checksum, the
833 * checksum will be wrong. We linearize now to avoid
834 * this problem.
835 */
836 if (unlikely(need_linearize(skb))) {
837 err = __skb_linearize(skb);
838 if (unlikely(err))
839 goto error;
840 }
841
842 err = skb_checksum_help(skb);
843 if (unlikely(err))
844 goto error;
845 }
846 skb->ip_summed = CHECKSUM_NONE;
847
848 return skb;
849 error:
850 kfree_skb(skb);
851 return ERR_PTR(err);
852 }
853
854 static void skb_list_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src,
855 __be32 dst, __u8 tos, __u8 ttl, __be16 df)
856 {
857 while (skb) {
858 struct sk_buff *next = skb->next;
859
860 if (next)
861 dst_clone(&rt->dst);
862
863 skb->next = NULL;
864 iptunnel_xmit(NULL, rt, skb, src, dst, IPPROTO_TCP,
865 tos, ttl, df, false);
866
867 skb = next;
868 }
869 }
870
871 static u8 parse_ipv6_l4_proto(struct sk_buff *skb)
872 {
873 unsigned int nh_ofs = skb_network_offset(skb);
874 int payload_ofs;
875 struct ipv6hdr *nh;
876 uint8_t nexthdr;
877 __be16 frag_off;
878
879 if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct ipv6hdr))))
880 return 0;
881
882 nh = ipv6_hdr(skb);
883 nexthdr = nh->nexthdr;
884 payload_ofs = (u8 *)(nh + 1) - skb->data;
885
886 payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off);
887 if (unlikely(payload_ofs < 0))
888 return 0;
889
890 return nexthdr;
891 }
892
893 static u8 skb_get_l4_proto(struct sk_buff *skb, __be16 l3_proto)
894 {
895 if (l3_proto == htons(ETH_P_IP)) {
896 unsigned int nh_ofs = skb_network_offset(skb);
897
898 if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct iphdr))))
899 return 0;
900
901 return ip_hdr(skb)->protocol;
902 } else if (l3_proto == htons(ETH_P_IPV6)) {
903 return parse_ipv6_l4_proto(skb);
904 }
905 return 0;
906 }
907
908 static int stt_xmit_skb(struct sk_buff *skb, struct rtable *rt,
909 __be32 src, __be32 dst, __u8 tos,
910 __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
911 __be64 tun_id)
912 {
913 struct ethhdr *eh = eth_hdr(skb);
914 int ret = 0, min_headroom;
915 __be16 inner_l3_proto;
916 u8 inner_l4_proto;
917
918 inner_l3_proto = eh->h_proto;
919 inner_l4_proto = skb_get_l4_proto(skb, inner_l3_proto);
920
921 min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
922 + STT_HEADER_LEN + sizeof(struct iphdr);
923
924 if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
925 int head_delta = SKB_DATA_ALIGN(min_headroom -
926 skb_headroom(skb) +
927 16);
928
929 ret = pskb_expand_head(skb, max_t(int, head_delta, 0),
930 0, GFP_ATOMIC);
931 if (unlikely(ret))
932 goto err_free_rt;
933 }
934
935 ret = stt_can_offload(skb, inner_l3_proto, inner_l4_proto);
936 if (ret < 0)
937 goto err_free_rt;
938 if (!ret) {
939 skb = handle_offloads(skb, min_headroom);
940 if (IS_ERR(skb)) {
941 ret = PTR_ERR(skb);
942 skb = NULL;
943 goto err_free_rt;
944 }
945 }
946
947 ret = 0;
948 while (skb) {
949 struct sk_buff *next_skb = skb->next;
950
951 skb->next = NULL;
952
953 if (next_skb)
954 dst_clone(&rt->dst);
955
956 /* Push STT and TCP header. */
957 skb = push_stt_header(skb, tun_id, src_port, dst_port, src,
958 dst, inner_l3_proto, inner_l4_proto,
959 dst_mtu(&rt->dst));
960 if (unlikely(!skb)) {
961 ip_rt_put(rt);
962 goto next;
963 }
964
965 /* Push IP header. */
966 skb_list_xmit(rt, skb, src, dst, tos, ttl, df);
967
968 next:
969 skb = next_skb;
970 }
971
972 return 0;
973
974 err_free_rt:
975 ip_rt_put(rt);
976 kfree_skb(skb);
977 return ret;
978 }
979
980 static struct rtable *stt_get_rt(struct sk_buff *skb,
981 struct net_device *dev,
982 struct flowi4 *fl,
983 const struct ip_tunnel_key *key)
984 {
985 struct net *net = dev_net(dev);
986
987 /* Route lookup */
988 memset(fl, 0, sizeof(*fl));
989 fl->daddr = key->u.ipv4.dst;
990 fl->saddr = key->u.ipv4.src;
991 fl->flowi4_tos = RT_TOS(key->tos);
992 fl->flowi4_mark = skb->mark;
993 fl->flowi4_proto = IPPROTO_TCP;
994
995 return ip_route_output_key(net, fl);
996 }
997
998 netdev_tx_t ovs_stt_xmit(struct sk_buff *skb)
999 {
1000 struct net_device *dev = skb->dev;
1001 struct stt_dev *stt_dev = netdev_priv(dev);
1002 struct net *net = stt_dev->net;
1003 __be16 dport = stt_dev->dst_port;
1004 struct ip_tunnel_key *tun_key;
1005 struct ip_tunnel_info *tun_info;
1006 struct rtable *rt;
1007 struct flowi4 fl;
1008 __be16 sport;
1009 __be16 df;
1010 int err;
1011
1012 tun_info = skb_tunnel_info(skb);
1013 if (unlikely(!tun_info)) {
1014 err = -EINVAL;
1015 goto error;
1016 }
1017
1018 tun_key = &tun_info->key;
1019
1020 rt = stt_get_rt(skb, dev, &fl, tun_key);
1021 if (IS_ERR(rt)) {
1022 err = PTR_ERR(rt);
1023 goto error;
1024 }
1025
1026 df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
1027 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
1028 skb->ignore_df = 1;
1029
1030 stt_xmit_skb(skb, rt, fl.saddr, tun_key->u.ipv4.dst,
1031 tun_key->tos, tun_key->ttl,
1032 df, sport, dport, tun_key->tun_id);
1033 return NETDEV_TX_OK;
1034 error:
1035 kfree_skb(skb);
1036 dev->stats.tx_errors++;
1037 return NETDEV_TX_OK;
1038 }
1039 EXPORT_SYMBOL(ovs_stt_xmit);
1040
1041 static void free_frag(struct stt_percpu *stt_percpu,
1042 struct pkt_frag *frag)
1043 {
1044 stt_percpu->frag_mem_used -= FRAG_CB(frag->skbs)->first.mem_used;
1045 kfree_skb_list(frag->skbs);
1046 list_del(&frag->lru_node);
1047 frag->skbs = NULL;
1048 }
1049
1050 static void evict_frags(struct stt_percpu *stt_percpu)
1051 {
1052 while (!list_empty(&stt_percpu->frag_lru) &&
1053 stt_percpu->frag_mem_used > REASM_LO_THRESH) {
1054 struct pkt_frag *frag;
1055
1056 frag = list_first_entry(&stt_percpu->frag_lru,
1057 struct pkt_frag,
1058 lru_node);
1059 free_frag(stt_percpu, frag);
1060 }
1061 }
1062
1063 static bool pkt_key_match(struct net *net,
1064 const struct pkt_frag *a, const struct pkt_key *b)
1065 {
1066 return a->key.saddr == b->saddr && a->key.daddr == b->daddr &&
1067 a->key.pkt_seq == b->pkt_seq && a->key.mark == b->mark &&
1068 net_eq(dev_net(a->skbs->dev), net);
1069 }
1070
1071 static u32 pkt_key_hash(const struct net *net, const struct pkt_key *key)
1072 {
1073 u32 initval = frag_hash_seed ^ (u32)(unsigned long)net ^ key->mark;
1074
1075 return jhash_3words((__force u32)key->saddr, (__force u32)key->daddr,
1076 (__force u32)key->pkt_seq, initval);
1077 }
1078
1079 static struct pkt_frag *lookup_frag(struct net *net,
1080 struct stt_percpu *stt_percpu,
1081 const struct pkt_key *key, u32 hash)
1082 {
1083 struct pkt_frag *frag, *victim_frag = NULL;
1084 int i;
1085
1086 for (i = 0; i < FRAG_HASH_SEGS; i++) {
1087 frag = flex_array_get(stt_percpu->frag_hash,
1088 hash & (FRAG_HASH_ENTRIES - 1));
1089
1090 if (frag->skbs &&
1091 time_before(jiffies, frag->timestamp + FRAG_EXP_TIME) &&
1092 pkt_key_match(net, frag, key))
1093 return frag;
1094
1095 if (!victim_frag ||
1096 (victim_frag->skbs &&
1097 (!frag->skbs ||
1098 time_before(frag->timestamp, victim_frag->timestamp))))
1099 victim_frag = frag;
1100
1101 hash >>= FRAG_HASH_SHIFT;
1102 }
1103
1104 if (victim_frag->skbs)
1105 free_frag(stt_percpu, victim_frag);
1106
1107 return victim_frag;
1108 }
1109
1110 #ifdef SKIP_ZERO_COPY
1111 static int __copy_skb(struct sk_buff *to, struct sk_buff *from,
1112 int *delta, bool *headstolen)
1113 {
1114 int err;
1115
1116 if (unlikely(to->next))
1117 return -EINVAL;
1118
1119 if (unlikely(FRAG_CB(to)->offset))
1120 return -EINVAL;
1121
1122 if (unlikely(skb_unclone(to, GFP_ATOMIC)))
1123 return -ENOMEM;
1124
1125 if (skb_try_coalesce(to, from, headstolen, delta))
1126 return 0;
1127
1128 *headstolen = false;
1129 err = pskb_expand_head(to, 0, to->data_len + from->len, GFP_ATOMIC);
1130 if (unlikely(err))
1131 return err;
1132
1133 if (unlikely(!__pskb_pull_tail(to, to->data_len)))
1134 BUG();
1135
1136 skb_copy_bits(from, 0, skb_put(to, from->len), from->len);
1137
1138 *delta = from->len;
1139 to->truesize += from->len;
1140 return 0;
1141 }
1142 #else
1143 static int __copy_skb(struct sk_buff *to, struct sk_buff *from,
1144 int *delta, bool *headstolen)
1145 {
1146 *headstolen = false;
1147 return -EINVAL;
1148 }
1149 #endif
1150
1151 static struct sk_buff *reassemble(struct sk_buff *skb)
1152 {
1153 struct iphdr *iph = ip_hdr(skb);
1154 struct tcphdr *tcph = tcp_hdr(skb);
1155 u32 seq = ntohl(tcph->seq);
1156 struct stt_percpu *stt_percpu;
1157 struct sk_buff *last_skb, *copied_skb = NULL;
1158 struct pkt_frag *frag;
1159 struct pkt_key key;
1160 int tot_len, delta = skb->truesize;
1161 bool headstolen;
1162 u32 hash;
1163
1164 tot_len = seq >> STT_SEQ_LEN_SHIFT;
1165 FRAG_CB(skb)->offset = seq & STT_SEQ_OFFSET_MASK;
1166
1167 if (unlikely(skb->len == 0))
1168 goto out_free;
1169
1170 if (unlikely(FRAG_CB(skb)->offset + skb->len > tot_len))
1171 goto out_free;
1172
1173 if (tot_len == skb->len)
1174 goto out;
1175
1176 key.saddr = iph->saddr;
1177 key.daddr = iph->daddr;
1178 key.pkt_seq = tcph->ack_seq;
1179 key.mark = skb->mark;
1180 hash = pkt_key_hash(dev_net(skb->dev), &key);
1181
1182 stt_percpu = per_cpu_ptr(stt_percpu_data, smp_processor_id());
1183
1184 spin_lock(&stt_percpu->lock);
1185
1186 if (unlikely(stt_percpu->frag_mem_used + skb->truesize > REASM_HI_THRESH))
1187 evict_frags(stt_percpu);
1188
1189 frag = lookup_frag(dev_net(skb->dev), stt_percpu, &key, hash);
1190 if (!frag->skbs) {
1191 frag->skbs = skb;
1192 frag->key = key;
1193 frag->timestamp = jiffies;
1194 FRAG_CB(skb)->first.last_skb = skb;
1195 FRAG_CB(skb)->first.mem_used = skb->truesize;
1196 FRAG_CB(skb)->first.tot_len = tot_len;
1197 FRAG_CB(skb)->first.rcvd_len = skb->len;
1198 FRAG_CB(skb)->first.set_ecn_ce = false;
1199 list_add_tail(&frag->lru_node, &stt_percpu->frag_lru);
1200 stt_percpu->frag_mem_used += skb->truesize;
1201 skb = NULL;
1202 goto unlock;
1203 }
1204
1205 /* Optimize for the common case where fragments are received in-order
1206 * and not overlapping.
1207 */
1208 last_skb = FRAG_CB(frag->skbs)->first.last_skb;
1209 if (likely(FRAG_CB(last_skb)->offset + last_skb->len ==
1210 FRAG_CB(skb)->offset)) {
1211
1212 if (!__copy_skb(frag->skbs, skb, &delta, &headstolen)) {
1213 copied_skb = skb;
1214 } else {
1215 last_skb->next = skb;
1216 FRAG_CB(frag->skbs)->first.last_skb = skb;
1217 }
1218 } else {
1219 struct sk_buff *prev = NULL, *next;
1220
1221 for (next = frag->skbs; next; next = next->next) {
1222 if (FRAG_CB(next)->offset >= FRAG_CB(skb)->offset)
1223 break;
1224 prev = next;
1225 }
1226
1227 /* Overlapping fragments aren't allowed. We shouldn't start
1228 * before the end of the previous fragment.
1229 */
1230 if (prev &&
1231 FRAG_CB(prev)->offset + prev->len > FRAG_CB(skb)->offset)
1232 goto unlock_free;
1233
1234 /* We also shouldn't end after the beginning of the next
1235 * fragment.
1236 */
1237 if (next &&
1238 FRAG_CB(skb)->offset + skb->len > FRAG_CB(next)->offset)
1239 goto unlock_free;
1240
1241 if (prev) {
1242 prev->next = skb;
1243 } else {
1244 FRAG_CB(skb)->first = FRAG_CB(frag->skbs)->first;
1245 frag->skbs = skb;
1246 }
1247
1248 if (next)
1249 skb->next = next;
1250 else
1251 FRAG_CB(frag->skbs)->first.last_skb = skb;
1252 }
1253
1254 FRAG_CB(frag->skbs)->first.set_ecn_ce |= INET_ECN_is_ce(iph->tos);
1255 FRAG_CB(frag->skbs)->first.rcvd_len += skb->len;
1256 stt_percpu->frag_mem_used += delta;
1257 FRAG_CB(frag->skbs)->first.mem_used += delta;
1258
1259 if (FRAG_CB(frag->skbs)->first.tot_len ==
1260 FRAG_CB(frag->skbs)->first.rcvd_len) {
1261 struct sk_buff *frag_head = frag->skbs;
1262
1263 frag_head->tstamp = skb->tstamp;
1264 if (FRAG_CB(frag_head)->first.set_ecn_ce)
1265 INET_ECN_set_ce(frag_head);
1266
1267 list_del(&frag->lru_node);
1268 stt_percpu->frag_mem_used -= FRAG_CB(frag_head)->first.mem_used;
1269 frag->skbs = NULL;
1270 skb = frag_head;
1271 } else {
1272 list_move_tail(&frag->lru_node, &stt_percpu->frag_lru);
1273 skb = NULL;
1274 }
1275
1276 if (copied_skb)
1277 kfree_skb_partial(copied_skb, headstolen);
1278 goto unlock;
1279
1280 unlock_free:
1281 kfree_skb(skb);
1282 skb = NULL;
1283 unlock:
1284 spin_unlock(&stt_percpu->lock);
1285 return skb;
1286 out_free:
1287 kfree_skb(skb);
1288 skb = NULL;
1289 out:
1290 return skb;
1291 }
1292
1293 static bool validate_checksum(struct sk_buff *skb)
1294 {
1295 struct iphdr *iph = ip_hdr(skb);
1296
1297 if (skb_csum_unnecessary(skb))
1298 return true;
1299
1300 if (skb->ip_summed == CHECKSUM_COMPLETE &&
1301 !tcp_v4_check(skb->len, iph->saddr, iph->daddr, skb->csum))
1302 return true;
1303
1304 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, skb->len,
1305 IPPROTO_TCP, 0);
1306
1307 return __tcp_checksum_complete(skb) == 0;
1308 }
1309
1310 static bool set_offloads(struct sk_buff *skb)
1311 {
1312 struct stthdr *stth = stt_hdr(skb);
1313 unsigned short gso_type;
1314 int l3_header_size;
1315 int l4_header_size;
1316 u16 csum_offset;
1317 u8 proto_type;
1318
1319 if (stth->vlan_tci)
1320 __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
1321 ntohs(stth->vlan_tci));
1322
1323 if (!(stth->flags & STT_CSUM_PARTIAL)) {
1324 if (stth->flags & STT_CSUM_VERIFIED)
1325 skb->ip_summed = CHECKSUM_UNNECESSARY;
1326 else
1327 skb->ip_summed = CHECKSUM_NONE;
1328
1329 return clear_gso(skb) == 0;
1330 }
1331
1332 proto_type = stth->flags & STT_PROTO_TYPES;
1333
1334 switch (proto_type) {
1335 case (STT_PROTO_IPV4 | STT_PROTO_TCP):
1336 /* TCP/IPv4 */
1337 csum_offset = offsetof(struct tcphdr, check);
1338 gso_type = SKB_GSO_TCPV4;
1339 l3_header_size = sizeof(struct iphdr);
1340 l4_header_size = sizeof(struct tcphdr);
1341 skb->protocol = htons(ETH_P_IP);
1342 break;
1343 case STT_PROTO_TCP:
1344 /* TCP/IPv6 */
1345 csum_offset = offsetof(struct tcphdr, check);
1346 gso_type = SKB_GSO_TCPV6;
1347 l3_header_size = sizeof(struct ipv6hdr);
1348 l4_header_size = sizeof(struct tcphdr);
1349 skb->protocol = htons(ETH_P_IPV6);
1350 break;
1351 case STT_PROTO_IPV4:
1352 /* UDP/IPv4 */
1353 csum_offset = offsetof(struct udphdr, check);
1354 gso_type = SKB_GSO_UDP;
1355 l3_header_size = sizeof(struct iphdr);
1356 l4_header_size = sizeof(struct udphdr);
1357 skb->protocol = htons(ETH_P_IP);
1358 break;
1359 default:
1360 /* UDP/IPv6 */
1361 csum_offset = offsetof(struct udphdr, check);
1362 gso_type = SKB_GSO_UDP;
1363 l3_header_size = sizeof(struct ipv6hdr);
1364 l4_header_size = sizeof(struct udphdr);
1365 skb->protocol = htons(ETH_P_IPV6);
1366 }
1367
1368 if (unlikely(stth->l4_offset < ETH_HLEN + l3_header_size))
1369 return false;
1370
1371 if (unlikely(!pskb_may_pull(skb, stth->l4_offset + l4_header_size)))
1372 return false;
1373
1374 stth = stt_hdr(skb);
1375
1376 skb->csum_start = skb_headroom(skb) + stth->l4_offset;
1377 skb->csum_offset = csum_offset;
1378 skb->ip_summed = CHECKSUM_PARTIAL;
1379
1380 if (stth->mss) {
1381 if (unlikely(skb_unclone(skb, GFP_ATOMIC)))
1382 return false;
1383
1384 skb_shinfo(skb)->gso_type = gso_type | SKB_GSO_DODGY;
1385 skb_shinfo(skb)->gso_size = ntohs(stth->mss);
1386 skb_shinfo(skb)->gso_segs = 0;
1387 } else {
1388 if (unlikely(clear_gso(skb)))
1389 return false;
1390 }
1391
1392 return true;
1393 }
1394
1395 static void rcv_list(struct net_device *dev, struct sk_buff *skb,
1396 struct metadata_dst *tun_dst)
1397 {
1398 struct sk_buff *next;
1399
1400 do {
1401 next = skb->next;
1402 skb->next = NULL;
1403 if (next) {
1404 ovs_dst_hold((struct dst_entry *)tun_dst);
1405 ovs_skb_dst_set(next, (struct dst_entry *)tun_dst);
1406 }
1407 ovs_ip_tunnel_rcv(dev, skb, tun_dst);
1408 } while ((skb = next));
1409 }
1410
1411 #ifndef USE_UPSTREAM_TUNNEL
1412 static int __stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
1413 {
1414 struct metadata_dst tun_dst;
1415
1416 ovs_ip_tun_rx_dst(&tun_dst, skb, TUNNEL_KEY | TUNNEL_CSUM,
1417 get_unaligned(&stt_hdr(skb)->key), 0);
1418 tun_dst.u.tun_info.key.tp_src = tcp_hdr(skb)->source;
1419 tun_dst.u.tun_info.key.tp_dst = tcp_hdr(skb)->dest;
1420
1421 rcv_list(stt_dev->dev, skb, &tun_dst);
1422 return 0;
1423 }
1424 #else
1425 static int __stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
1426 {
1427 struct metadata_dst *tun_dst;
1428 __be16 flags;
1429 __be64 tun_id;
1430
1431 flags = TUNNEL_KEY | TUNNEL_CSUM;
1432 tun_id = get_unaligned(&stt_hdr(skb)->key);
1433 tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
1434 if (!tun_dst)
1435 return -ENOMEM;
1436 tun_dst->u.tun_info.key.tp_src = tcp_hdr(skb)->source;
1437 tun_dst->u.tun_info.key.tp_dst = tcp_hdr(skb)->dest;
1438
1439 rcv_list(stt_dev->dev, skb, tun_dst);
1440 return 0;
1441 }
1442 #endif
1443
1444 static void stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
1445 {
1446 int err;
1447
1448 if (unlikely(!validate_checksum(skb)))
1449 goto drop;
1450
1451 __skb_pull(skb, sizeof(struct tcphdr));
1452 skb = reassemble(skb);
1453 if (!skb)
1454 return;
1455
1456 if (skb->next && coalesce_skb(&skb))
1457 goto drop;
1458
1459 err = iptunnel_pull_header(skb,
1460 sizeof(struct stthdr) + STT_ETH_PAD,
1461 htons(ETH_P_TEB),
1462 !net_eq(stt_dev->net, dev_net(stt_dev->dev)));
1463 if (unlikely(err))
1464 goto drop;
1465
1466 if (unlikely(stt_hdr(skb)->version != 0))
1467 goto drop;
1468
1469 if (unlikely(!set_offloads(skb)))
1470 goto drop;
1471
1472 if (skb_shinfo(skb)->frag_list && try_to_segment(skb))
1473 goto drop;
1474
1475 err = __stt_rcv(stt_dev, skb);
1476 if (err)
1477 goto drop;
1478 return;
1479 drop:
1480 /* Consume bad packet */
1481 kfree_skb_list(skb);
1482 stt_dev->dev->stats.rx_errors++;
1483 }
1484
1485 static void tcp_sock_release(struct socket *sock)
1486 {
1487 kernel_sock_shutdown(sock, SHUT_RDWR);
1488 sock_release(sock);
1489 }
1490
1491 static int tcp_sock_create4(struct net *net, __be16 port,
1492 struct socket **sockp)
1493 {
1494 struct sockaddr_in tcp_addr;
1495 struct socket *sock = NULL;
1496 int err;
1497
1498 err = sock_create_kern(net, AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
1499 if (err < 0)
1500 goto error;
1501
1502 memset(&tcp_addr, 0, sizeof(tcp_addr));
1503 tcp_addr.sin_family = AF_INET;
1504 tcp_addr.sin_addr.s_addr = htonl(INADDR_ANY);
1505 tcp_addr.sin_port = port;
1506 err = kernel_bind(sock, (struct sockaddr *)&tcp_addr,
1507 sizeof(tcp_addr));
1508 if (err < 0)
1509 goto error;
1510
1511 *sockp = sock;
1512 return 0;
1513
1514 error:
1515 if (sock)
1516 tcp_sock_release(sock);
1517 *sockp = NULL;
1518 return err;
1519 }
1520
1521 static void schedule_clean_percpu(void)
1522 {
1523 schedule_delayed_work(&clean_percpu_wq, CLEAN_PERCPU_INTERVAL);
1524 }
1525
1526 static void clean_percpu(struct work_struct *work)
1527 {
1528 int i;
1529
1530 for_each_possible_cpu(i) {
1531 struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
1532 int j;
1533
1534 for (j = 0; j < FRAG_HASH_ENTRIES; j++) {
1535 struct pkt_frag *frag;
1536
1537 frag = flex_array_get(stt_percpu->frag_hash, j);
1538 if (!frag->skbs ||
1539 time_before(jiffies, frag->timestamp + FRAG_EXP_TIME))
1540 continue;
1541
1542 spin_lock_bh(&stt_percpu->lock);
1543
1544 if (frag->skbs &&
1545 time_after(jiffies, frag->timestamp + FRAG_EXP_TIME))
1546 free_frag(stt_percpu, frag);
1547
1548 spin_unlock_bh(&stt_percpu->lock);
1549 }
1550 }
1551 schedule_clean_percpu();
1552 }
1553
1554 #ifdef HAVE_NF_HOOKFN_ARG_OPS
1555 #define FIRST_PARAM const struct nf_hook_ops *ops
1556 #else
1557 #ifdef HAVE_NF_HOOKFN_ARG_PRIV
1558 #define FIRST_PARAM void *priv
1559 #else
1560 #define FIRST_PARAM unsigned int hooknum
1561 #endif
1562 #endif
1563
1564 #ifdef HAVE_NF_HOOK_STATE
1565 #if RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,0)
1566 /* RHEL nfhook hacks. */
1567 #ifndef __GENKSYMS__
1568 #define LAST_PARAM const struct net_device *in, const struct net_device *out, \
1569 const struct nf_hook_state *state
1570 #else
1571 #define LAST_PARAM const struct net_device *in, const struct net_device *out, \
1572 int (*okfn)(struct sk_buff *)
1573 #endif
1574 #else
1575 #define LAST_PARAM const struct nf_hook_state *state
1576 #endif
1577 #else
1578 #define LAST_PARAM const struct net_device *in, const struct net_device *out, \
1579 int (*okfn)(struct sk_buff *)
1580 #endif
1581
1582 static unsigned int nf_ip_hook(FIRST_PARAM, struct sk_buff *skb, LAST_PARAM)
1583 {
1584 struct stt_dev *stt_dev;
1585 int ip_hdr_len;
1586
1587 if (ip_hdr(skb)->protocol != IPPROTO_TCP)
1588 return NF_ACCEPT;
1589
1590 ip_hdr_len = ip_hdrlen(skb);
1591 if (unlikely(!pskb_may_pull(skb, ip_hdr_len + sizeof(struct tcphdr))))
1592 return NF_ACCEPT;
1593
1594 skb_set_transport_header(skb, ip_hdr_len);
1595
1596 stt_dev = stt_find_up_dev(dev_net(skb->dev), tcp_hdr(skb)->dest);
1597 if (!stt_dev)
1598 return NF_ACCEPT;
1599
1600 __skb_pull(skb, ip_hdr_len);
1601 stt_rcv(stt_dev, skb);
1602 return NF_STOLEN;
1603 }
1604
1605 static struct nf_hook_ops nf_hook_ops __read_mostly = {
1606 .hook = nf_ip_hook,
1607 #ifdef HAVE_NF_HOOKS_OPS_OWNER
1608 .owner = THIS_MODULE,
1609 #endif
1610 .pf = NFPROTO_IPV4,
1611 .hooknum = NF_INET_LOCAL_IN,
1612 .priority = INT_MAX,
1613 };
1614
1615 static int stt_start(struct net *net)
1616 {
1617 struct stt_net *sn = net_generic(net, stt_net_id);
1618 int err;
1619 int i;
1620
1621 if (n_tunnels) {
1622 n_tunnels++;
1623 return 0;
1624 }
1625 get_random_bytes(&frag_hash_seed, sizeof(u32));
1626
1627 stt_percpu_data = alloc_percpu(struct stt_percpu);
1628 if (!stt_percpu_data) {
1629 err = -ENOMEM;
1630 goto error;
1631 }
1632
1633 for_each_possible_cpu(i) {
1634 struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
1635 struct flex_array *frag_hash;
1636
1637 spin_lock_init(&stt_percpu->lock);
1638 INIT_LIST_HEAD(&stt_percpu->frag_lru);
1639 get_random_bytes(&per_cpu(pkt_seq_counter, i), sizeof(u32));
1640
1641 frag_hash = flex_array_alloc(sizeof(struct pkt_frag),
1642 FRAG_HASH_ENTRIES,
1643 GFP_KERNEL | __GFP_ZERO);
1644 if (!frag_hash) {
1645 err = -ENOMEM;
1646 goto free_percpu;
1647 }
1648 stt_percpu->frag_hash = frag_hash;
1649
1650 err = flex_array_prealloc(stt_percpu->frag_hash, 0,
1651 FRAG_HASH_ENTRIES,
1652 GFP_KERNEL | __GFP_ZERO);
1653 if (err)
1654 goto free_percpu;
1655 }
1656 schedule_clean_percpu();
1657 n_tunnels++;
1658
1659 if (sn->n_tunnels) {
1660 sn->n_tunnels++;
1661 return 0;
1662 }
1663 #ifdef HAVE_NF_REGISTER_NET_HOOK
1664 /* On kernel which support per net nf-hook, nf_register_hook() takes
1665 * rtnl-lock, which results in dead lock in stt-dev-create. Therefore
1666 * use this new API.
1667 */
1668
1669 if (sn->nf_hook_reg_done)
1670 goto out;
1671
1672 err = nf_register_net_hook(net, &nf_hook_ops);
1673 if (!err)
1674 sn->nf_hook_reg_done = true;
1675 #else
1676 /* Register STT only on very first STT device addition. */
1677 if (!list_empty(&nf_hook_ops.list))
1678 goto out;
1679
1680 err = nf_register_hook(&nf_hook_ops);
1681 #endif
1682 if (err)
1683 goto dec_n_tunnel;
1684 out:
1685 sn->n_tunnels++;
1686 return 0;
1687
1688 dec_n_tunnel:
1689 n_tunnels--;
1690 free_percpu:
1691 for_each_possible_cpu(i) {
1692 struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
1693
1694 if (stt_percpu->frag_hash)
1695 flex_array_free(stt_percpu->frag_hash);
1696 }
1697
1698 free_percpu(stt_percpu_data);
1699
1700 error:
1701 return err;
1702 }
1703
1704 static void stt_cleanup(struct net *net)
1705 {
1706 struct stt_net *sn = net_generic(net, stt_net_id);
1707 int i;
1708
1709 sn->n_tunnels--;
1710 if (sn->n_tunnels)
1711 goto out;
1712 out:
1713 n_tunnels--;
1714 if (n_tunnels)
1715 return;
1716
1717 cancel_delayed_work_sync(&clean_percpu_wq);
1718 for_each_possible_cpu(i) {
1719 struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
1720 int j;
1721
1722 for (j = 0; j < FRAG_HASH_ENTRIES; j++) {
1723 struct pkt_frag *frag;
1724
1725 frag = flex_array_get(stt_percpu->frag_hash, j);
1726 kfree_skb_list(frag->skbs);
1727 }
1728
1729 flex_array_free(stt_percpu->frag_hash);
1730 }
1731
1732 free_percpu(stt_percpu_data);
1733 }
1734
1735 static netdev_tx_t stt_dev_xmit(struct sk_buff *skb, struct net_device *dev)
1736 {
1737 #ifdef USE_UPSTREAM_TUNNEL
1738 return ovs_stt_xmit(skb);
1739 #else
1740 /* Drop All packets coming from networking stack. OVS-CB is
1741 * not initialized for these packets.
1742 */
1743 dev_kfree_skb(skb);
1744 dev->stats.tx_dropped++;
1745 return NETDEV_TX_OK;
1746 #endif
1747 }
1748
1749 /* Setup stats when device is created */
1750 static int stt_init(struct net_device *dev)
1751 {
1752 dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1753 if (!dev->tstats)
1754 return -ENOMEM;
1755
1756 return 0;
1757 }
1758
1759 static void stt_uninit(struct net_device *dev)
1760 {
1761 free_percpu(dev->tstats);
1762 }
1763
1764 static int stt_open(struct net_device *dev)
1765 {
1766 struct stt_dev *stt = netdev_priv(dev);
1767 struct net *net = stt->net;
1768 struct stt_net *sn = net_generic(net, stt_net_id);
1769 int err;
1770
1771 err = stt_start(net);
1772 if (err)
1773 return err;
1774
1775 err = tcp_sock_create4(net, stt->dst_port, &stt->sock);
1776 if (err)
1777 return err;
1778 list_add_rcu(&stt->up_next, &sn->stt_up_list);
1779 return 0;
1780 }
1781
1782 static int stt_stop(struct net_device *dev)
1783 {
1784 struct stt_dev *stt_dev = netdev_priv(dev);
1785 struct net *net = stt_dev->net;
1786
1787 list_del_rcu(&stt_dev->up_next);
1788 synchronize_net();
1789 tcp_sock_release(stt_dev->sock);
1790 stt_dev->sock = NULL;
1791 stt_cleanup(net);
1792 return 0;
1793 }
1794
1795 static int __stt_change_mtu(struct net_device *dev, int new_mtu, bool strict)
1796 {
1797 int max_mtu = IP_MAX_MTU - STT_HEADER_LEN - sizeof(struct iphdr)
1798 - dev->hard_header_len;
1799
1800 if (new_mtu < 68)
1801 return -EINVAL;
1802
1803 if (new_mtu > max_mtu) {
1804 if (strict)
1805 return -EINVAL;
1806
1807 new_mtu = max_mtu;
1808 }
1809
1810 dev->mtu = new_mtu;
1811 return 0;
1812 }
1813
1814 static int stt_change_mtu(struct net_device *dev, int new_mtu)
1815 {
1816 return __stt_change_mtu(dev, new_mtu, true);
1817 }
1818
1819 int ovs_stt_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
1820 {
1821 struct ip_tunnel_info *info = skb_tunnel_info(skb);
1822 struct stt_dev *stt_dev = netdev_priv(dev);
1823 struct net *net = stt_dev->net;
1824 __be16 dport = stt_dev->dst_port;
1825 struct flowi4 fl4;
1826 struct rtable *rt;
1827
1828 if (ip_tunnel_info_af(info) != AF_INET)
1829 return -EINVAL;
1830
1831 rt = stt_get_rt(skb, dev, &fl4, &info->key);
1832 if (IS_ERR(rt))
1833 return PTR_ERR(rt);
1834
1835 ip_rt_put(rt);
1836
1837 info->key.u.ipv4.src = fl4.saddr;
1838 info->key.tp_src = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
1839 info->key.tp_dst = dport;
1840 return 0;
1841 }
1842 EXPORT_SYMBOL_GPL(ovs_stt_fill_metadata_dst);
1843
1844 static const struct net_device_ops stt_netdev_ops = {
1845 .ndo_init = stt_init,
1846 .ndo_uninit = stt_uninit,
1847 .ndo_open = stt_open,
1848 .ndo_stop = stt_stop,
1849 .ndo_start_xmit = stt_dev_xmit,
1850 .ndo_get_stats64 = ip_tunnel_get_stats64,
1851 .ndo_change_mtu = stt_change_mtu,
1852 .ndo_validate_addr = eth_validate_addr,
1853 .ndo_set_mac_address = eth_mac_addr,
1854 #ifdef USE_UPSTREAM_TUNNEL
1855 #ifdef HAVE_NDO_FILL_METADATA_DST
1856 .ndo_fill_metadata_dst = stt_fill_metadata_dst,
1857 #endif
1858 #endif
1859 };
1860
1861 static void stt_get_drvinfo(struct net_device *dev,
1862 struct ethtool_drvinfo *drvinfo)
1863 {
1864 strlcpy(drvinfo->version, STT_NETDEV_VER, sizeof(drvinfo->version));
1865 strlcpy(drvinfo->driver, "stt", sizeof(drvinfo->driver));
1866 }
1867
1868 static const struct ethtool_ops stt_ethtool_ops = {
1869 .get_drvinfo = stt_get_drvinfo,
1870 .get_link = ethtool_op_get_link,
1871 };
1872
1873 /* Info for udev, that this is a virtual tunnel endpoint */
1874 static struct device_type stt_type = {
1875 .name = "stt",
1876 };
1877
1878 /* Initialize the device structure. */
1879 static void stt_setup(struct net_device *dev)
1880 {
1881 ether_setup(dev);
1882
1883 dev->netdev_ops = &stt_netdev_ops;
1884 dev->ethtool_ops = &stt_ethtool_ops;
1885 dev->destructor = free_netdev;
1886
1887 SET_NETDEV_DEVTYPE(dev, &stt_type);
1888
1889 dev->features |= NETIF_F_LLTX | NETIF_F_NETNS_LOCAL;
1890 dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM;
1891 dev->features |= NETIF_F_RXCSUM;
1892 dev->features |= NETIF_F_GSO_SOFTWARE;
1893
1894 dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
1895 dev->hw_features |= NETIF_F_GSO_SOFTWARE;
1896
1897 #ifdef USE_UPSTREAM_TUNNEL
1898 netif_keep_dst(dev);
1899 #endif
1900 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
1901 eth_hw_addr_random(dev);
1902 }
1903
1904 static const struct nla_policy stt_policy[IFLA_STT_MAX + 1] = {
1905 [IFLA_STT_PORT] = { .type = NLA_U16 },
1906 };
1907
1908 static int stt_validate(struct nlattr *tb[], struct nlattr *data[])
1909 {
1910 if (tb[IFLA_ADDRESS]) {
1911 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1912 return -EINVAL;
1913
1914 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1915 return -EADDRNOTAVAIL;
1916 }
1917
1918 return 0;
1919 }
1920
1921 static struct stt_dev *find_dev(struct net *net, __be16 dst_port)
1922 {
1923 struct stt_net *sn = net_generic(net, stt_net_id);
1924 struct stt_dev *dev;
1925
1926 list_for_each_entry(dev, &sn->stt_list, next) {
1927 if (dev->dst_port == dst_port)
1928 return dev;
1929 }
1930 return NULL;
1931 }
1932
1933 static int stt_configure(struct net *net, struct net_device *dev,
1934 __be16 dst_port)
1935 {
1936 struct stt_net *sn = net_generic(net, stt_net_id);
1937 struct stt_dev *stt = netdev_priv(dev);
1938 int err;
1939
1940 stt->net = net;
1941 stt->dev = dev;
1942
1943 stt->dst_port = dst_port;
1944
1945 if (find_dev(net, dst_port))
1946 return -EBUSY;
1947
1948 err = __stt_change_mtu(dev, IP_MAX_MTU, false);
1949 if (err)
1950 return err;
1951
1952 err = register_netdevice(dev);
1953 if (err)
1954 return err;
1955
1956 list_add(&stt->next, &sn->stt_list);
1957 return 0;
1958 }
1959
1960 static int stt_newlink(struct net *net, struct net_device *dev,
1961 struct nlattr *tb[], struct nlattr *data[])
1962 {
1963 __be16 dst_port = htons(STT_DST_PORT);
1964
1965 if (data[IFLA_STT_PORT])
1966 dst_port = nla_get_be16(data[IFLA_STT_PORT]);
1967
1968 return stt_configure(net, dev, dst_port);
1969 }
1970
1971 static void stt_dellink(struct net_device *dev, struct list_head *head)
1972 {
1973 struct stt_dev *stt = netdev_priv(dev);
1974
1975 list_del(&stt->next);
1976 unregister_netdevice_queue(dev, head);
1977 }
1978
1979 static size_t stt_get_size(const struct net_device *dev)
1980 {
1981 return nla_total_size(sizeof(__be32)); /* IFLA_STT_PORT */
1982 }
1983
1984 static int stt_fill_info(struct sk_buff *skb, const struct net_device *dev)
1985 {
1986 struct stt_dev *stt = netdev_priv(dev);
1987
1988 if (nla_put_be16(skb, IFLA_STT_PORT, stt->dst_port))
1989 goto nla_put_failure;
1990
1991 return 0;
1992
1993 nla_put_failure:
1994 return -EMSGSIZE;
1995 }
1996
1997 static struct rtnl_link_ops stt_link_ops __read_mostly = {
1998 .kind = "stt",
1999 .maxtype = IFLA_STT_MAX,
2000 .policy = stt_policy,
2001 .priv_size = sizeof(struct stt_dev),
2002 .setup = stt_setup,
2003 .validate = stt_validate,
2004 .newlink = stt_newlink,
2005 .dellink = stt_dellink,
2006 .get_size = stt_get_size,
2007 .fill_info = stt_fill_info,
2008 };
2009
2010 struct net_device *ovs_stt_dev_create_fb(struct net *net, const char *name,
2011 u8 name_assign_type, u16 dst_port)
2012 {
2013 struct nlattr *tb[IFLA_MAX + 1];
2014 struct net_device *dev;
2015 int err;
2016
2017 memset(tb, 0, sizeof(tb));
2018 dev = rtnl_create_link(net, (char *) name, name_assign_type,
2019 &stt_link_ops, tb);
2020 if (IS_ERR(dev))
2021 return dev;
2022
2023 err = stt_configure(net, dev, htons(dst_port));
2024 if (err) {
2025 free_netdev(dev);
2026 return ERR_PTR(err);
2027 }
2028 return dev;
2029 }
2030 EXPORT_SYMBOL_GPL(ovs_stt_dev_create_fb);
2031
2032 static int stt_init_net(struct net *net)
2033 {
2034 struct stt_net *sn = net_generic(net, stt_net_id);
2035
2036 INIT_LIST_HEAD(&sn->stt_list);
2037 INIT_LIST_HEAD(&sn->stt_up_list);
2038 #ifdef HAVE_NF_REGISTER_NET_HOOK
2039 sn->nf_hook_reg_done = false;
2040 #endif
2041 return 0;
2042 }
2043
2044 static void stt_exit_net(struct net *net)
2045 {
2046 struct stt_net *sn = net_generic(net, stt_net_id);
2047 struct stt_dev *stt, *next;
2048 struct net_device *dev, *aux;
2049 LIST_HEAD(list);
2050
2051 #ifdef HAVE_NF_REGISTER_NET_HOOK
2052 /* Ideally this should be done from stt_stop(), But on some kernels
2053 * nf-unreg operation needs RTNL-lock, which can cause deallock.
2054 * So it is done from here. */
2055 if (sn->nf_hook_reg_done)
2056 nf_unregister_net_hook(net, &nf_hook_ops);
2057 #endif
2058
2059 rtnl_lock();
2060
2061 /* gather any stt devices that were moved into this ns */
2062 for_each_netdev_safe(net, dev, aux)
2063 if (dev->rtnl_link_ops == &stt_link_ops)
2064 unregister_netdevice_queue(dev, &list);
2065
2066 list_for_each_entry_safe(stt, next, &sn->stt_list, next) {
2067 /* If stt->dev is in the same netns, it was already added
2068 * to the stt by the previous loop.
2069 */
2070 if (!net_eq(dev_net(stt->dev), net))
2071 unregister_netdevice_queue(stt->dev, &list);
2072 }
2073
2074 /* unregister the devices gathered above */
2075 unregister_netdevice_many(&list);
2076 rtnl_unlock();
2077 }
2078
2079 static struct pernet_operations stt_net_ops = {
2080 .init = stt_init_net,
2081 .exit = stt_exit_net,
2082 .id = &stt_net_id,
2083 .size = sizeof(struct stt_net),
2084 };
2085
2086 int stt_init_module(void)
2087 {
2088 int rc;
2089
2090 rc = register_pernet_subsys(&stt_net_ops);
2091 if (rc)
2092 goto out1;
2093
2094 rc = rtnl_link_register(&stt_link_ops);
2095 if (rc)
2096 goto out2;
2097
2098 INIT_LIST_HEAD(&nf_hook_ops.list);
2099 pr_info("STT tunneling driver\n");
2100 return 0;
2101 out2:
2102 unregister_pernet_subsys(&stt_net_ops);
2103 out1:
2104 return rc;
2105 }
2106
2107 void stt_cleanup_module(void)
2108 {
2109 #ifndef HAVE_NF_REGISTER_NET_HOOK
2110 if (!list_empty(&nf_hook_ops.list))
2111 nf_unregister_hook(&nf_hook_ops);
2112 #endif
2113 rtnl_link_unregister(&stt_link_ops);
2114 unregister_pernet_subsys(&stt_net_ops);
2115 }
2116 #endif