1 /*
2 * Stateless TCP Tunnel (STT) vport.
3 *
4 * Copyright (c) 2015 Nicira, Inc.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13 #include <asm/unaligned.h>
14
15 #include <linux/delay.h>
16 #include <linux/if.h>
17 #include <linux/if_vlan.h>
18 #include <linux/ip.h>
19 #include <linux/ipv6.h>
20 #include <linux/jhash.h>
21 #include <linux/list.h>
22 #include <linux/log2.h>
23 #include <linux/module.h>
24 #include <linux/net.h>
25 #include <linux/netfilter.h>
26 #include <linux/percpu.h>
27 #include <linux/skbuff.h>
28 #include <linux/tcp.h>
29 #include <linux/workqueue.h>
30
31 #include <net/dst_metadata.h>
32 #include <net/icmp.h>
33 #include <net/inet_ecn.h>
34 #include <net/ip.h>
35 #include <net/ip_tunnels.h>
36 #include <net/ip6_checksum.h>
37 #include <net/net_namespace.h>
38 #include <net/netns/generic.h>
39 #include <net/sock.h>
40 #include <net/stt.h>
41 #include <net/tcp.h>
42 #include <net/udp.h>
43
44 #include "gso.h"
45 #include "compat.h"
46
47 #define STT_NETDEV_VER "0.1"
48 #define STT_DST_PORT 7471
49
50 #ifdef OVS_STT
51 #ifdef CONFIG_SLUB
52 /*
53  * We saw better performance when skipping zero copy with the SLUB
54  * allocator, so skip zero copy for the SLUB case.
55 */
56 #define SKIP_ZERO_COPY
57 #endif
58
59 #define STT_VER 0
60
61 /* @next: Entry on the per-net list of STT ports.
62 * @up_next: Entry on the per-net list of ports that are in IFF_UP state.
63 * @sock: Fake TCP socket for the STT port.
64 * @dst_port: Destination TCP port of the tunnel.
65 * On receive, STT reassembly can generate multiple packets: the first packet
66 * has the tunnel outer header, the rest are inner segments with no STT header.
67 */
68 struct stt_dev {
69 struct net_device *dev;
70 struct net *net;
71 struct list_head next;
72 struct list_head up_next;
73 struct socket *sock;
74 __be16 dst_port;
75 };
76
77 #define STT_CSUM_VERIFIED BIT(0)
78 #define STT_CSUM_PARTIAL BIT(1)
79 #define STT_PROTO_IPV4 BIT(2)
80 #define STT_PROTO_TCP BIT(3)
81 #define STT_PROTO_TYPES (STT_PROTO_IPV4 | STT_PROTO_TCP)
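
/* Summary of how set_offloads() interprets the STT_PROTO_* bits above when
 * STT_CSUM_PARTIAL is set on a received packet:
 *
 *	STT_PROTO_IPV4 | STT_PROTO_TCP  ->  TCP over IPv4
 *	STT_PROTO_TCP                   ->  TCP over IPv6
 *	STT_PROTO_IPV4                  ->  UDP over IPv4
 *	neither bit                     ->  UDP over IPv6
 */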
82
83 #ifdef HAVE_SKB_GSO_UDP
84 #define SUPPORTED_GSO_TYPES (SKB_GSO_TCPV4 | SKB_GSO_UDP | SKB_GSO_DODGY | \
85 SKB_GSO_TCPV6)
86 #else
87 #define SUPPORTED_GSO_TYPES (SKB_GSO_TCPV4 | SKB_GSO_DODGY | \
88 SKB_GSO_TCPV6)
89 #endif
90
91 /* The length and offset of a fragment are encoded in the sequence number.
92 * STT_SEQ_LEN_SHIFT is the left shift needed to store the length.
93 * STT_SEQ_OFFSET_MASK is the mask to extract the offset.
94 */
95 #define STT_SEQ_LEN_SHIFT 16
96 #define STT_SEQ_OFFSET_MASK (BIT(STT_SEQ_LEN_SHIFT) - 1)
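
/* Illustrative helpers (the names are hypothetical; the module open-codes
 * this packing in __push_stt_header() and reassemble()) showing how the
 * 16-bit total length and 16-bit offset share the 32-bit sequence number:
 *
 *	static inline u32 stt_seq_encode(u16 tot_len, u16 offset)
 *	{
 *		return ((u32)tot_len << STT_SEQ_LEN_SHIFT) | offset;
 *	}
 *
 *	static inline u16 stt_seq_tot_len(u32 seq)
 *	{
 *		return seq >> STT_SEQ_LEN_SHIFT;
 *	}
 *
 *	static inline u16 stt_seq_offset(u32 seq)
 *	{
 *		return seq & STT_SEQ_OFFSET_MASK;
 *	}
 */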
97
98 /* The maximum amount of memory used to store packets waiting to be reassembled
99 * on a given CPU. Once this threshold is exceeded we will begin freeing the
100 * least recently used fragments.
101 */
102 #define REASM_HI_THRESH (4 * 1024 * 1024)
103 /* The target for the high memory evictor. Once we have exceeded
104 * REASM_HI_THRESH, we will continue freeing fragments until we hit
105 * this limit.
106 */
107 #define REASM_LO_THRESH (3 * 1024 * 1024)
108 /* The length of time a given packet has to be reassembled from the time the
109 * first fragment arrives. Once this limit is exceeded it becomes available
110 * for cleaning.
111 */
112 #define FRAG_EXP_TIME (30 * HZ)
113 /* Number of hash entries. Each entry has only a single slot to hold a packet
114 * so if there are collisions, we will drop packets. This is allocated
115 * per-cpu and each entry consists of struct pkt_frag.
116 */
117 #define FRAG_HASH_SHIFT 8
118 #define FRAG_HASH_ENTRIES BIT(FRAG_HASH_SHIFT)
119 #define FRAG_HASH_SEGS ((sizeof(u32) * 8) / FRAG_HASH_SHIFT)
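
/* Worked example: with FRAG_HASH_SHIFT = 8, FRAG_HASH_ENTRIES is 256 and
 * FRAG_HASH_SEGS is 32 / 8 = 4, so lookup_frag() consumes the 32-bit jhash
 * eight bits at a time and probes up to four slots.  A hash of 0xA1B2C3D4,
 * for instance, probes slots 0xD4, 0xC3, 0xB2 and 0xA1; if none matches,
 * an empty slot or the oldest of those four is reused.
 */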
120
121 #define CLEAN_PERCPU_INTERVAL (30 * HZ)
122
123 struct pkt_key {
124 __be32 saddr;
125 __be32 daddr;
126 __be32 pkt_seq;
127 u32 mark;
128 };
129
130 struct pkt_frag {
131 struct sk_buff *skbs;
132 unsigned long timestamp;
133 struct list_head lru_node;
134 struct pkt_key key;
135 };
136
137 struct stt_percpu {
138 struct pkt_frag *frag_hash;
139 struct list_head frag_lru;
140 unsigned int frag_mem_used;
141
142 /* Protect frags table. */
143 spinlock_t lock;
144 };
145
146 struct first_frag {
147 struct sk_buff *last_skb;
148 unsigned int mem_used;
149 u16 tot_len;
150 u16 rcvd_len;
151 bool set_ecn_ce;
152 };
153
154 struct frag_skb_cb {
155 u16 offset;
156
157 /* Only valid for the first skb in the chain. */
158 struct first_frag first;
159 };
160
161 #define FRAG_CB(skb) ((struct frag_skb_cb *)(skb)->cb)
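
/* The per-fragment state above is carried in skb->cb, which is 48 bytes on
 * current kernels.  A compile-time guard one could place in stt_init_module()
 * to document that assumption (shown only as a sketch; it is not part of
 * this module):
 *
 *	BUILD_BUG_ON(sizeof(struct frag_skb_cb) >
 *		     sizeof(((struct sk_buff *)NULL)->cb));
 */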
162
163 /* per-network namespace private data for this module */
164 struct stt_net {
165 struct list_head stt_list;
166 struct list_head stt_up_list; /* Devices which are in IFF_UP state. */
167 int n_tunnels;
168 #ifdef HAVE_NF_REGISTER_NET_HOOK
169 bool nf_hook_reg_done;
170 #endif
171 };
172
173 static int stt_net_id;
174
175 static struct stt_percpu __percpu *stt_percpu_data __read_mostly;
176 static u32 frag_hash_seed __read_mostly;
177
178 /* Protects sock-hash and refcounts. */
179 static DEFINE_MUTEX(stt_mutex);
180
181 static int n_tunnels;
182 static DEFINE_PER_CPU(u32, pkt_seq_counter);
183
184 static void clean_percpu(struct work_struct *work);
185 static DECLARE_DELAYED_WORK(clean_percpu_wq, clean_percpu);
186
187 static struct stt_dev *stt_find_up_dev(struct net *net, __be16 port)
188 {
189 struct stt_net *sn = net_generic(net, stt_net_id);
190 struct stt_dev *stt_dev;
191
192 list_for_each_entry_rcu(stt_dev, &sn->stt_up_list, up_next) {
193 if (stt_dev->dst_port == port)
194 return stt_dev;
195 }
196 return NULL;
197 }
198
199 static __be32 ack_seq(void)
200 {
201 #if NR_CPUS <= 65536
202 u32 pkt_seq, ack;
203
204 pkt_seq = this_cpu_read(pkt_seq_counter);
205 ack = pkt_seq << ilog2(NR_CPUS) | smp_processor_id();
206 this_cpu_inc(pkt_seq_counter);
207
208 return (__force __be32)ack;
209 #else
210 #error "Support for greater than 64k CPUs not implemented"
211 #endif
212 }
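
/* Example: with NR_CPUS = 64, ilog2(NR_CPUS) is 6, so a per-CPU counter
 * value of 10 on CPU 3 yields ack = (10 << 6) | 3 = 0x283.  Folding the CPU
 * id into the low bits keeps ack_seq() values unique across CPUs without
 * locking; reassemble() later uses this value (as tcph->ack_seq) in the
 * per-packet reassembly key.
 */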
213
214 static int clear_gso(struct sk_buff *skb)
215 {
216 struct skb_shared_info *shinfo = skb_shinfo(skb);
217 int err;
218
219 if (shinfo->gso_type == 0 && shinfo->gso_size == 0 &&
220 shinfo->gso_segs == 0)
221 return 0;
222
223 err = skb_unclone(skb, GFP_ATOMIC);
224 if (unlikely(err))
225 return err;
226
227 shinfo = skb_shinfo(skb);
228 shinfo->gso_type = 0;
229 shinfo->gso_size = 0;
230 shinfo->gso_segs = 0;
231 return 0;
232 }
233
234 static void copy_skb_metadata(struct sk_buff *to, struct sk_buff *from)
235 {
236 to->protocol = from->protocol;
237 to->tstamp = from->tstamp;
238 to->priority = from->priority;
239 to->mark = from->mark;
240 to->vlan_tci = from->vlan_tci;
241 to->vlan_proto = from->vlan_proto;
242 skb_copy_secmark(to, from);
243 }
244
245 static void update_headers(struct sk_buff *skb, bool head,
246 unsigned int l4_offset, unsigned int hdr_len,
247 bool ipv4, u32 tcp_seq)
248 {
249 u16 old_len, new_len;
250 __be32 delta;
251 struct tcphdr *tcph;
252 int gso_size;
253
254 if (ipv4) {
255 struct iphdr *iph = (struct iphdr *)(skb->data + ETH_HLEN);
256
257 old_len = ntohs(iph->tot_len);
258 new_len = skb->len - ETH_HLEN;
259 iph->tot_len = htons(new_len);
260
261 ip_send_check(iph);
262 } else {
263 struct ipv6hdr *ip6h = (struct ipv6hdr *)(skb->data + ETH_HLEN);
264
265 old_len = ntohs(ip6h->payload_len);
266 new_len = skb->len - ETH_HLEN - sizeof(struct ipv6hdr);
267 ip6h->payload_len = htons(new_len);
268 }
269
270 tcph = (struct tcphdr *)(skb->data + l4_offset);
271 if (!head) {
272 tcph->seq = htonl(tcp_seq);
273 tcph->cwr = 0;
274 }
275
276 if (skb->next) {
277 tcph->fin = 0;
278 tcph->psh = 0;
279 }
280
281 delta = htonl(~old_len + new_len);
282 tcph->check = ~csum_fold((__force __wsum)((__force u32)tcph->check +
283 (__force u32)delta));
284
285 gso_size = skb_shinfo(skb)->gso_size;
286 if (gso_size && skb->len - hdr_len <= gso_size)
287 BUG_ON(clear_gso(skb));
288 }
289
290 static bool can_segment(struct sk_buff *head, bool ipv4, bool tcp, bool csum_partial)
291 {
292 /* If no offloading is in use then we don't have enough information
293 * to process the headers.
294 */
295 if (!csum_partial)
296 goto linearize;
297
298 /* Handling UDP packets requires IP fragmentation, which means that
299 * the L4 checksum can no longer be calculated by hardware (since the
300 * fragments are in different packets). If we have to compute the
301 * checksum it's faster just to linearize, and large UDP packets are
302 * pretty uncommon anyway, so it's not worth dealing with for now.
303 */
304 if (!tcp)
305 goto linearize;
306
307 if (ipv4) {
308 struct iphdr *iph = (struct iphdr *)(head->data + ETH_HLEN);
309
310 /* It's difficult to get the IP IDs exactly right here due to
311 * varying segment sizes and potentially multiple layers of
312 * segmentation. IP ID isn't important when DF is set and DF
313 * is generally set for TCP packets, so just linearize if it's
314 * not.
315 */
316 if (!(iph->frag_off & htons(IP_DF)))
317 goto linearize;
318 } else {
319 struct ipv6hdr *ip6h = (struct ipv6hdr *)(head->data + ETH_HLEN);
320
321 /* Jumbograms require more processing to update and we'll
322 * probably never see them, so just linearize.
323 */
324 if (ip6h->payload_len == 0)
325 goto linearize;
326 }
327 return true;
328
329 linearize:
330 return false;
331 }
332
333 static int copy_headers(struct sk_buff *head, struct sk_buff *frag,
334 int hdr_len)
335 {
336 u16 csum_start;
337
338 if (skb_cloned(frag) || skb_headroom(frag) < hdr_len) {
339 int extra_head = hdr_len - skb_headroom(frag);
340
341 extra_head = extra_head > 0 ? extra_head : 0;
342 if (unlikely(pskb_expand_head(frag, extra_head, 0,
343 GFP_ATOMIC)))
344 return -ENOMEM;
345 }
346
347 memcpy(__skb_push(frag, hdr_len), head->data, hdr_len);
348
349 csum_start = head->csum_start - skb_headroom(head);
350 frag->csum_start = skb_headroom(frag) + csum_start;
351 frag->csum_offset = head->csum_offset;
352 frag->ip_summed = head->ip_summed;
353
354 skb_shinfo(frag)->gso_size = skb_shinfo(head)->gso_size;
355 skb_shinfo(frag)->gso_type = skb_shinfo(head)->gso_type;
356 skb_shinfo(frag)->gso_segs = 0;
357
358 copy_skb_metadata(frag, head);
359 return 0;
360 }
361
362 static int skb_list_segment(struct sk_buff *head, bool ipv4, int l4_offset)
363 {
364 struct sk_buff *skb;
365 struct tcphdr *tcph;
366 int seg_len;
367 int hdr_len;
368 int tcp_len;
369 u32 seq;
370
371 if (unlikely(!pskb_may_pull(head, l4_offset + sizeof(*tcph))))
372 return -ENOMEM;
373
374 tcph = (struct tcphdr *)(head->data + l4_offset);
375 tcp_len = tcph->doff * 4;
376 hdr_len = l4_offset + tcp_len;
377
378 if (unlikely((tcp_len < sizeof(struct tcphdr)) ||
379 (head->len < hdr_len)))
380 return -EINVAL;
381
382 if (unlikely(!pskb_may_pull(head, hdr_len)))
383 return -ENOMEM;
384
385 tcph = (struct tcphdr *)(head->data + l4_offset);
386 /* Update header of each segment. */
387 seq = ntohl(tcph->seq);
388 seg_len = skb_pagelen(head) - hdr_len;
389
390 skb = skb_shinfo(head)->frag_list;
391 skb_shinfo(head)->frag_list = NULL;
392 head->next = skb;
393 for (; skb; skb = skb->next) {
394 int err;
395
396 head->len -= skb->len;
397 head->data_len -= skb->len;
398 head->truesize -= skb->truesize;
399
400 seq += seg_len;
401 seg_len = skb->len;
402 err = copy_headers(head, skb, hdr_len);
403 if (err)
404 return err;
405 update_headers(skb, false, l4_offset, hdr_len, ipv4, seq);
406 }
407 update_headers(head, true, l4_offset, hdr_len, ipv4, 0);
408 return 0;
409 }
410
411 #ifndef SKIP_ZERO_COPY
412 static struct sk_buff *normalize_frag_list(struct sk_buff *head,
413 struct sk_buff **skbp)
414 {
415 struct sk_buff *skb = *skbp;
416 struct sk_buff *last;
417
418 do {
419 struct sk_buff *frags;
420
421 if (skb_shared(skb)) {
422 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
423
424 if (unlikely(!nskb))
425 return ERR_PTR(-ENOMEM);
426
427 nskb->next = skb->next;
428 consume_skb(skb);
429 skb = nskb;
430 *skbp = skb;
431 }
432
433 if (head) {
434 head->len -= skb->len;
435 head->data_len -= skb->len;
436 head->truesize -= skb->truesize;
437 }
438
439 frags = skb_shinfo(skb)->frag_list;
440 if (frags) {
441 int err;
442
443 err = skb_unclone(skb, GFP_ATOMIC);
444 if (unlikely(err))
445 return ERR_PTR(err);
446
447 last = normalize_frag_list(skb, &frags);
448 if (IS_ERR(last))
449 return last;
450
451 skb_shinfo(skb)->frag_list = NULL;
452 last->next = skb->next;
453 skb->next = frags;
454 } else {
455 last = skb;
456 }
457
458 skbp = &skb->next;
459 } while ((skb = skb->next));
460
461 return last;
462 }
463
464 /* Takes a linked list of skbs, which potentially contain frag_list
465 * (whose members in turn potentially contain frag_lists, etc.) and
466 * converts them into a single linear linked list.
467 */
468 static int straighten_frag_list(struct sk_buff **skbp)
469 {
470 struct sk_buff *err_skb;
471
472 err_skb = normalize_frag_list(NULL, skbp);
473 if (IS_ERR(err_skb))
474 return PTR_ERR(err_skb);
475
476 return 0;
477 }
478
479 static int coalesce_skb(struct sk_buff **headp)
480 {
481 struct sk_buff *frag, *head, *prev;
482 int err;
483
484 err = straighten_frag_list(headp);
485 if (unlikely(err))
486 return err;
487 head = *headp;
488
489 /* Coalesce frag list. */
490 prev = head;
491 for (frag = head->next; frag; frag = frag->next) {
492 bool headstolen;
493 int delta;
494
495 if (unlikely(skb_unclone(prev, GFP_ATOMIC)))
496 return -ENOMEM;
497
498 if (!skb_try_coalesce(prev, frag, &headstolen, &delta)) {
499 prev = frag;
500 continue;
501 }
502
503 prev->next = frag->next;
504 frag->len = 0;
505 frag->data_len = 0;
506 frag->truesize -= delta;
507 kfree_skb_partial(frag, headstolen);
508 frag = prev;
509 }
510
511 if (!head->next)
512 return 0;
513
514 for (frag = head->next; frag; frag = frag->next) {
515 head->len += frag->len;
516 head->data_len += frag->len;
517 head->truesize += frag->truesize;
518 }
519
520 skb_shinfo(head)->frag_list = head->next;
521 head->next = NULL;
522 return 0;
523 }
524 #else
525 static int coalesce_skb(struct sk_buff **headp)
526 {
527 struct sk_buff *frag, *head = *headp, *next;
528 int delta = FRAG_CB(head)->first.tot_len - skb_headlen(head);
529 int err;
530
531 if (unlikely(!head->next))
532 return 0;
533
534 err = pskb_expand_head(head, 0, delta, GFP_ATOMIC);
535 if (unlikely(err))
536 return err;
537
538 if (unlikely(!__pskb_pull_tail(head, head->data_len)))
539 BUG();
540
541 for (frag = head->next; frag; frag = next) {
542 skb_copy_bits(frag, 0, skb_put(head, frag->len), frag->len);
543 next = frag->next;
544 kfree_skb(frag);
545 }
546
547 head->next = NULL;
548 head->truesize = SKB_TRUESIZE(head->len);
549 return 0;
550 }
551 #endif
552
553 static int __try_to_segment(struct sk_buff *skb, bool csum_partial,
554 bool ipv4, bool tcp, int l4_offset)
555 {
556 if (can_segment(skb, ipv4, tcp, csum_partial))
557 return skb_list_segment(skb, ipv4, l4_offset);
558 else
559 return skb_linearize(skb);
560 }
561
562 static int try_to_segment(struct sk_buff *skb)
563 {
564 struct stthdr *stth = stt_hdr(skb);
565 bool csum_partial = !!(stth->flags & STT_CSUM_PARTIAL);
566 bool ipv4 = !!(stth->flags & STT_PROTO_IPV4);
567 bool tcp = !!(stth->flags & STT_PROTO_TCP);
568 int l4_offset = stth->l4_offset;
569
570 return __try_to_segment(skb, csum_partial, ipv4, tcp, l4_offset);
571 }
572
573 static int segment_skb(struct sk_buff **headp, bool csum_partial,
574 bool ipv4, bool tcp, int l4_offset)
575 {
576 #ifndef SKIP_ZERO_COPY
577 int err;
578
579 err = coalesce_skb(headp);
580 if (err)
581 return err;
582 #endif
583
584 if (skb_shinfo(*headp)->frag_list)
585 return __try_to_segment(*headp, csum_partial,
586 ipv4, tcp, l4_offset);
587 return 0;
588 }
589
590 static int __push_stt_header(struct sk_buff *skb, __be64 tun_id,
591 __be16 s_port, __be16 d_port,
592 __be32 saddr, __be32 dst,
593 __be16 l3_proto, u8 l4_proto,
594 int dst_mtu)
595 {
596 int data_len = skb->len + sizeof(struct stthdr) + STT_ETH_PAD;
597 unsigned short encap_mss;
598 struct tcphdr *tcph;
599 struct stthdr *stth;
600
601 skb_push(skb, STT_HEADER_LEN);
602 skb_reset_transport_header(skb);
603 tcph = tcp_hdr(skb);
604 memset(tcph, 0, STT_HEADER_LEN);
605 stth = stt_hdr(skb);
606
607 if (skb->ip_summed == CHECKSUM_PARTIAL) {
608 stth->flags |= STT_CSUM_PARTIAL;
609
610 stth->l4_offset = skb->csum_start -
611 (skb_headroom(skb) +
612 STT_HEADER_LEN);
613
614 if (l3_proto == htons(ETH_P_IP))
615 stth->flags |= STT_PROTO_IPV4;
616
617 if (l4_proto == IPPROTO_TCP)
618 stth->flags |= STT_PROTO_TCP;
619
620 stth->mss = htons(skb_shinfo(skb)->gso_size);
621 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
622 stth->flags |= STT_CSUM_VERIFIED;
623 }
624
625 stth->vlan_tci = htons(skb->vlan_tci);
626 skb->vlan_tci = 0;
627 put_unaligned(tun_id, &stth->key);
628
629 tcph->source = s_port;
630 tcph->dest = d_port;
631 tcph->doff = sizeof(struct tcphdr) / 4;
632 tcph->ack = 1;
633 tcph->psh = 1;
634 tcph->window = htons(USHRT_MAX);
635 tcph->seq = htonl(data_len << STT_SEQ_LEN_SHIFT);
636 tcph->ack_seq = ack_seq();
637 tcph->check = ~tcp_v4_check(skb->len, saddr, dst, 0);
638
639 skb->csum_start = skb_transport_header(skb) - skb->head;
640 skb->csum_offset = offsetof(struct tcphdr, check);
641 skb->ip_summed = CHECKSUM_PARTIAL;
642
643 encap_mss = dst_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
644 if (data_len > encap_mss) {
645 if (unlikely(skb_unclone(skb, GFP_ATOMIC)))
646 return -EINVAL;
647
648 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
649 skb_shinfo(skb)->gso_size = encap_mss;
650 skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(data_len, encap_mss);
651 } else {
652 if (unlikely(clear_gso(skb)))
653 return -EINVAL;
654 }
655 return 0;
656 }
657
658 static struct sk_buff *push_stt_header(struct sk_buff *head, __be64 tun_id,
659 __be16 s_port, __be16 d_port,
660 __be32 saddr, __be32 dst,
661 __be16 l3_proto, u8 l4_proto,
662 int dst_mtu)
663 {
664 struct sk_buff *skb;
665
666 if (skb_shinfo(head)->frag_list) {
667 bool ipv4 = (l3_proto == htons(ETH_P_IP));
668 bool tcp = (l4_proto == IPPROTO_TCP);
669 bool csum_partial = (head->ip_summed == CHECKSUM_PARTIAL);
670 int l4_offset = skb_transport_offset(head);
671
672 /* Need to call skb_orphan() to report the correct truesize.
673 * Calling skb_orphan() in this layer is odd, but an SKB with
674 * a frag-list should not be associated with any socket, so
675 * skb_orphan() should be a no-op. */
676 skb_orphan(head);
677 if (unlikely(segment_skb(&head, csum_partial,
678 ipv4, tcp, l4_offset)))
679 goto error;
680 }
681
682 for (skb = head; skb; skb = skb->next) {
683 if (__push_stt_header(skb, tun_id, s_port, d_port, saddr, dst,
684 l3_proto, l4_proto, dst_mtu))
685 goto error;
686 }
687
688 return head;
689 error:
690 kfree_skb_list(head);
691 return NULL;
692 }
693
694 static int stt_can_offload(struct sk_buff *skb, __be16 l3_proto, u8 l4_proto)
695 {
696 if (skb_is_gso(skb) && skb->ip_summed != CHECKSUM_PARTIAL) {
697 int csum_offset;
698 __sum16 *csum;
699 int len;
700
701 if (l4_proto == IPPROTO_TCP)
702 csum_offset = offsetof(struct tcphdr, check);
703 else if (l4_proto == IPPROTO_UDP)
704 csum_offset = offsetof(struct udphdr, check);
705 else
706 return 0;
707
708 len = skb->len - skb_transport_offset(skb);
709 csum = (__sum16 *)(skb_transport_header(skb) + csum_offset);
710
711 if (unlikely(!pskb_may_pull(skb, skb_transport_offset(skb) +
712 csum_offset + sizeof(*csum))))
713 return -EINVAL;
714
715 if (l3_proto == htons(ETH_P_IP)) {
716 struct iphdr *iph = ip_hdr(skb);
717
718 *csum = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
719 len, l4_proto, 0);
720 } else if (l3_proto == htons(ETH_P_IPV6)) {
721 struct ipv6hdr *ip6h = ipv6_hdr(skb);
722
723 *csum = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
724 len, l4_proto, 0);
725 } else {
726 return 0;
727 }
728 skb->csum_start = skb_transport_header(skb) - skb->head;
729 skb->csum_offset = csum_offset;
730 skb->ip_summed = CHECKSUM_PARTIAL;
731 }
732
733 if (skb->ip_summed == CHECKSUM_PARTIAL) {
734 /* Assume receiver can only offload TCP/UDP over IPv4/6,
735 * and require 802.1Q VLANs to be accelerated.
736 */
737 if (l3_proto != htons(ETH_P_IP) &&
738 l3_proto != htons(ETH_P_IPV6))
739 return 0;
740
741 if (l4_proto != IPPROTO_TCP && l4_proto != IPPROTO_UDP)
742 return 0;
743
744 /* L4 offset must fit in a 1-byte field. */
745 if (skb->csum_start - skb_headroom(skb) > 255)
746 return 0;
747
748 if (skb_shinfo(skb)->gso_type & ~SUPPORTED_GSO_TYPES)
749 return 0;
750 }
751 /* Total size of encapsulated packet must fit in 16 bits. */
752 if (skb->len + STT_HEADER_LEN + sizeof(struct iphdr) > 65535)
753 return 0;
754
755 if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q))
756 return 0;
757 return 1;
758 }
759
760 static bool need_linearize(const struct sk_buff *skb)
761 {
762 struct skb_shared_info *shinfo = skb_shinfo(skb);
763 int i;
764
765 if (unlikely(shinfo->frag_list))
766 return true;
767
768 /* Generally speaking we should linearize if there are paged frags.
769 * However, if all of the refcounts are 1 we know nobody else can
770 * change them from underneath us and we can skip the linearization.
771 */
772 for (i = 0; i < shinfo->nr_frags; i++)
773 if (unlikely(page_count(skb_frag_page(&shinfo->frags[i])) > 1))
774 return true;
775
776 return false;
777 }
778
779 static struct sk_buff *handle_offloads(struct sk_buff *skb, int min_headroom)
780 {
781 int err;
782
783 if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q)) {
784
785 min_headroom += VLAN_HLEN;
786 if (skb_headroom(skb) < min_headroom) {
787 int head_delta = SKB_DATA_ALIGN(min_headroom -
788 skb_headroom(skb) + 16);
789
790 err = pskb_expand_head(skb, max_t(int, head_delta, 0),
791 0, GFP_ATOMIC);
792 if (unlikely(err))
793 goto error;
794 }
795
796 skb = __vlan_hwaccel_push_inside(skb);
797 if (!skb) {
798 err = -ENOMEM;
799 goto error;
800 }
801 }
802
803 if (skb_is_gso(skb)) {
804 struct sk_buff *nskb;
805 char cb[sizeof(skb->cb)];
806
807 memcpy(cb, skb->cb, sizeof(cb));
808
809 nskb = __skb_gso_segment(skb, 0, false);
810 if (IS_ERR(nskb)) {
811 err = PTR_ERR(nskb);
812 goto error;
813 }
814
815 consume_skb(skb);
816 skb = nskb;
817 while (nskb) {
818 memcpy(nskb->cb, cb, sizeof(cb));
819 nskb = nskb->next;
820 }
821 } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
822 /* Pages aren't locked and could change at any time.
823 * If this happens after we compute the checksum, the
824 * checksum will be wrong. We linearize now to avoid
825 * this problem.
826 */
827 if (unlikely(need_linearize(skb))) {
828 err = __skb_linearize(skb);
829 if (unlikely(err))
830 goto error;
831 }
832
833 err = skb_checksum_help(skb);
834 if (unlikely(err))
835 goto error;
836 }
837 skb->ip_summed = CHECKSUM_NONE;
838
839 return skb;
840 error:
841 kfree_skb(skb);
842 return ERR_PTR(err);
843 }
844
845 static void skb_list_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src,
846 __be32 dst, __u8 tos, __u8 ttl, __be16 df)
847 {
848 while (skb) {
849 struct sk_buff *next = skb->next;
850
851 if (next)
852 dst_clone(&rt->dst);
853
854 skb->next = NULL;
855 iptunnel_xmit(NULL, rt, skb, src, dst, IPPROTO_TCP,
856 tos, ttl, df, false);
857
858 skb = next;
859 }
860 }
861
862 static u8 parse_ipv6_l4_proto(struct sk_buff *skb)
863 {
864 unsigned int nh_ofs = skb_network_offset(skb);
865 int payload_ofs;
866 struct ipv6hdr *nh;
867 uint8_t nexthdr;
868 __be16 frag_off;
869
870 if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct ipv6hdr))))
871 return 0;
872
873 nh = ipv6_hdr(skb);
874 nexthdr = nh->nexthdr;
875 payload_ofs = (u8 *)(nh + 1) - skb->data;
876
877 payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off);
878 if (unlikely(payload_ofs < 0))
879 return 0;
880
881 return nexthdr;
882 }
883
884 static u8 skb_get_l4_proto(struct sk_buff *skb, __be16 l3_proto)
885 {
886 if (l3_proto == htons(ETH_P_IP)) {
887 unsigned int nh_ofs = skb_network_offset(skb);
888
889 if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct iphdr))))
890 return 0;
891
892 return ip_hdr(skb)->protocol;
893 } else if (l3_proto == htons(ETH_P_IPV6)) {
894 return parse_ipv6_l4_proto(skb);
895 }
896 return 0;
897 }
898
899 static int stt_xmit_skb(struct sk_buff *skb, struct rtable *rt,
900 __be32 src, __be32 dst, __u8 tos,
901 __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
902 __be64 tun_id)
903 {
904 struct ethhdr *eh = eth_hdr(skb);
905 int ret = 0, min_headroom;
906 __be16 inner_l3_proto;
907 u8 inner_l4_proto;
908
909 inner_l3_proto = eh->h_proto;
910 inner_l4_proto = skb_get_l4_proto(skb, inner_l3_proto);
911
912 min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
913 + STT_HEADER_LEN + sizeof(struct iphdr);
914
915 if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
916 int head_delta = SKB_DATA_ALIGN(min_headroom -
917 skb_headroom(skb) +
918 16);
919
920 ret = pskb_expand_head(skb, max_t(int, head_delta, 0),
921 0, GFP_ATOMIC);
922 if (unlikely(ret))
923 goto err_free_rt;
924 }
925
926 ret = stt_can_offload(skb, inner_l3_proto, inner_l4_proto);
927 if (ret < 0)
928 goto err_free_rt;
929 if (!ret) {
930 skb = handle_offloads(skb, min_headroom);
931 if (IS_ERR(skb)) {
932 ret = PTR_ERR(skb);
933 skb = NULL;
934 goto err_free_rt;
935 }
936 }
937
938 ret = 0;
939 while (skb) {
940 struct sk_buff *next_skb = skb->next;
941
942 skb->next = NULL;
943
944 if (next_skb)
945 dst_clone(&rt->dst);
946
947 /* Push STT and TCP header. */
948 skb = push_stt_header(skb, tun_id, src_port, dst_port, src,
949 dst, inner_l3_proto, inner_l4_proto,
950 dst_mtu(&rt->dst));
951 if (unlikely(!skb)) {
952 ip_rt_put(rt);
953 goto next;
954 }
955
956 /* Push IP header. */
957 skb_list_xmit(rt, skb, src, dst, tos, ttl, df);
958
959 next:
960 skb = next_skb;
961 }
962
963 return 0;
964
965 err_free_rt:
966 ip_rt_put(rt);
967 kfree_skb(skb);
968 return ret;
969 }
970
971 static struct rtable *stt_get_rt(struct sk_buff *skb,
972 struct net_device *dev,
973 struct flowi4 *fl,
974 const struct ip_tunnel_key *key,
975 __be16 dport, __be16 sport)
976 {
977 struct net *net = dev_net(dev);
978
979 /* Route lookup */
980 memset(fl, 0, sizeof(*fl));
981 fl->daddr = key->u.ipv4.dst;
982 fl->saddr = key->u.ipv4.src;
983 fl->flowi4_tos = RT_TOS(key->tos);
984 fl->flowi4_mark = skb->mark;
985 fl->flowi4_proto = IPPROTO_TCP;
986 fl->fl4_dport = dport;
987 fl->fl4_sport = sport;
988
989 return ip_route_output_key(net, fl);
990 }
991
992 netdev_tx_t ovs_stt_xmit(struct sk_buff *skb)
993 {
994 struct net_device *dev = skb->dev;
995 struct stt_dev *stt_dev = netdev_priv(dev);
996 struct net *net = stt_dev->net;
997 __be16 dport = stt_dev->dst_port;
998 struct ip_tunnel_key *tun_key;
999 struct ip_tunnel_info *tun_info;
1000 struct rtable *rt;
1001 struct flowi4 fl;
1002 __be16 sport;
1003 __be16 df;
1004 int err;
1005
1006 tun_info = skb_tunnel_info(skb);
1007 if (unlikely(!tun_info)) {
1008 err = -EINVAL;
1009 goto error;
1010 }
1011
1012 tun_key = &tun_info->key;
1013
1014 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
1015 rt = stt_get_rt(skb, dev, &fl, tun_key, dport, sport);
1016 if (IS_ERR(rt)) {
1017 err = PTR_ERR(rt);
1018 goto error;
1019 }
1020
1021 df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
1022 skb->ignore_df = 1;
1023
1024 stt_xmit_skb(skb, rt, fl.saddr, tun_key->u.ipv4.dst,
1025 tun_key->tos, tun_key->ttl,
1026 df, sport, dport, tun_key->tun_id);
1027 return NETDEV_TX_OK;
1028 error:
1029 kfree_skb(skb);
1030 dev->stats.tx_errors++;
1031 return err;
1032 }
1033 EXPORT_SYMBOL(ovs_stt_xmit);
1034
1035 static void free_frag(struct stt_percpu *stt_percpu,
1036 struct pkt_frag *frag)
1037 {
1038 stt_percpu->frag_mem_used -= FRAG_CB(frag->skbs)->first.mem_used;
1039 kfree_skb_list(frag->skbs);
1040 list_del(&frag->lru_node);
1041 frag->skbs = NULL;
1042 }
1043
1044 static void evict_frags(struct stt_percpu *stt_percpu)
1045 {
1046 while (!list_empty(&stt_percpu->frag_lru) &&
1047 stt_percpu->frag_mem_used > REASM_LO_THRESH) {
1048 struct pkt_frag *frag;
1049
1050 frag = list_first_entry(&stt_percpu->frag_lru,
1051 struct pkt_frag,
1052 lru_node);
1053 free_frag(stt_percpu, frag);
1054 }
1055 }
1056
1057 static bool pkt_key_match(struct net *net,
1058 const struct pkt_frag *a, const struct pkt_key *b)
1059 {
1060 return a->key.saddr == b->saddr && a->key.daddr == b->daddr &&
1061 a->key.pkt_seq == b->pkt_seq && a->key.mark == b->mark &&
1062 net_eq(dev_net(a->skbs->dev), net);
1063 }
1064
1065 static u32 pkt_key_hash(const struct net *net, const struct pkt_key *key)
1066 {
1067 u32 initval = frag_hash_seed ^ (u32)(unsigned long)net ^ key->mark;
1068
1069 return jhash_3words((__force u32)key->saddr, (__force u32)key->daddr,
1070 (__force u32)key->pkt_seq, initval);
1071 }
1072
1073 static struct pkt_frag *lookup_frag(struct net *net,
1074 struct stt_percpu *stt_percpu,
1075 const struct pkt_key *key, u32 hash)
1076 {
1077 struct pkt_frag *frag, *victim_frag = NULL;
1078 int i;
1079
1080 for (i = 0; i < FRAG_HASH_SEGS; i++) {
1081 frag = &stt_percpu->frag_hash[hash & (FRAG_HASH_ENTRIES - 1)];
1082
1083 if (frag->skbs &&
1084 time_before(jiffies, frag->timestamp + FRAG_EXP_TIME) &&
1085 pkt_key_match(net, frag, key))
1086 return frag;
1087
1088 if (!victim_frag ||
1089 (victim_frag->skbs &&
1090 (!frag->skbs ||
1091 time_before(frag->timestamp, victim_frag->timestamp))))
1092 victim_frag = frag;
1093
1094 hash >>= FRAG_HASH_SHIFT;
1095 }
1096
1097 if (victim_frag->skbs)
1098 free_frag(stt_percpu, victim_frag);
1099
1100 return victim_frag;
1101 }
1102
1103 #ifdef SKIP_ZERO_COPY
1104 static int __copy_skb(struct sk_buff *to, struct sk_buff *from,
1105 int *delta, bool *headstolen)
1106 {
1107 int err;
1108
1109 if (unlikely(to->next))
1110 return -EINVAL;
1111
1112 if (unlikely(FRAG_CB(to)->offset))
1113 return -EINVAL;
1114
1115 if (unlikely(skb_unclone(to, GFP_ATOMIC)))
1116 return -ENOMEM;
1117
1118 if (skb_try_coalesce(to, from, headstolen, delta))
1119 return 0;
1120
1121 *headstolen = false;
1122 err = pskb_expand_head(to, 0, to->data_len + from->len, GFP_ATOMIC);
1123 if (unlikely(err))
1124 return err;
1125
1126 if (unlikely(!__pskb_pull_tail(to, to->data_len)))
1127 BUG();
1128
1129 skb_copy_bits(from, 0, skb_put(to, from->len), from->len);
1130
1131 *delta = from->len;
1132 to->truesize += from->len;
1133 return 0;
1134 }
1135 #else
1136 static int __copy_skb(struct sk_buff *to, struct sk_buff *from,
1137 int *delta, bool *headstolen)
1138 {
1139 *headstolen = false;
1140 return -EINVAL;
1141 }
1142 #endif
1143
1144 static struct sk_buff *reassemble(struct sk_buff *skb)
1145 {
1146 struct iphdr *iph = ip_hdr(skb);
1147 struct tcphdr *tcph = tcp_hdr(skb);
1148 u32 seq = ntohl(tcph->seq);
1149 struct stt_percpu *stt_percpu;
1150 struct sk_buff *last_skb, *copied_skb = NULL;
1151 struct pkt_frag *frag;
1152 struct pkt_key key;
1153 int tot_len, delta = skb->truesize;
1154 bool headstolen;
1155 u32 hash;
1156
1157 tot_len = seq >> STT_SEQ_LEN_SHIFT;
1158 FRAG_CB(skb)->offset = seq & STT_SEQ_OFFSET_MASK;
1159
1160 if (unlikely(skb->len == 0))
1161 goto out_free;
1162
1163 if (unlikely(FRAG_CB(skb)->offset + skb->len > tot_len))
1164 goto out_free;
1165
1166 if (tot_len == skb->len)
1167 goto out;
1168
1169 key.saddr = iph->saddr;
1170 key.daddr = iph->daddr;
1171 key.pkt_seq = tcph->ack_seq;
1172 key.mark = skb->mark;
1173 hash = pkt_key_hash(dev_net(skb->dev), &key);
1174
1175 stt_percpu = per_cpu_ptr(stt_percpu_data, smp_processor_id());
1176
1177 spin_lock(&stt_percpu->lock);
1178
1179 if (unlikely(stt_percpu->frag_mem_used + skb->truesize > REASM_HI_THRESH))
1180 evict_frags(stt_percpu);
1181
1182 frag = lookup_frag(dev_net(skb->dev), stt_percpu, &key, hash);
1183 if (!frag->skbs) {
1184 frag->skbs = skb;
1185 frag->key = key;
1186 frag->timestamp = jiffies;
1187 FRAG_CB(skb)->first.last_skb = skb;
1188 FRAG_CB(skb)->first.mem_used = skb->truesize;
1189 FRAG_CB(skb)->first.tot_len = tot_len;
1190 FRAG_CB(skb)->first.rcvd_len = skb->len;
1191 FRAG_CB(skb)->first.set_ecn_ce = false;
1192 list_add_tail(&frag->lru_node, &stt_percpu->frag_lru);
1193 stt_percpu->frag_mem_used += skb->truesize;
1194 skb = NULL;
1195 goto unlock;
1196 }
1197
1198 /* Optimize for the common case where fragments are received in-order
1199 * and not overlapping.
1200 */
1201 last_skb = FRAG_CB(frag->skbs)->first.last_skb;
1202 if (likely(FRAG_CB(last_skb)->offset + last_skb->len ==
1203 FRAG_CB(skb)->offset)) {
1204
1205 if (!__copy_skb(frag->skbs, skb, &delta, &headstolen)) {
1206 copied_skb = skb;
1207 } else {
1208 last_skb->next = skb;
1209 FRAG_CB(frag->skbs)->first.last_skb = skb;
1210 }
1211 } else {
1212 struct sk_buff *prev = NULL, *next;
1213
1214 for (next = frag->skbs; next; next = next->next) {
1215 if (FRAG_CB(next)->offset >= FRAG_CB(skb)->offset)
1216 break;
1217 prev = next;
1218 }
1219
1220 /* Overlapping fragments aren't allowed. We shouldn't start
1221 * before the end of the previous fragment.
1222 */
1223 if (prev &&
1224 FRAG_CB(prev)->offset + prev->len > FRAG_CB(skb)->offset)
1225 goto unlock_free;
1226
1227 /* We also shouldn't end after the beginning of the next
1228 * fragment.
1229 */
1230 if (next &&
1231 FRAG_CB(skb)->offset + skb->len > FRAG_CB(next)->offset)
1232 goto unlock_free;
1233
1234 if (prev) {
1235 prev->next = skb;
1236 } else {
1237 FRAG_CB(skb)->first = FRAG_CB(frag->skbs)->first;
1238 frag->skbs = skb;
1239 }
1240
1241 if (next)
1242 skb->next = next;
1243 else
1244 FRAG_CB(frag->skbs)->first.last_skb = skb;
1245 }
1246
1247 FRAG_CB(frag->skbs)->first.set_ecn_ce |= INET_ECN_is_ce(iph->tos);
1248 FRAG_CB(frag->skbs)->first.rcvd_len += skb->len;
1249 stt_percpu->frag_mem_used += delta;
1250 FRAG_CB(frag->skbs)->first.mem_used += delta;
1251
1252 if (FRAG_CB(frag->skbs)->first.tot_len ==
1253 FRAG_CB(frag->skbs)->first.rcvd_len) {
1254 struct sk_buff *frag_head = frag->skbs;
1255
1256 frag_head->tstamp = skb->tstamp;
1257 if (FRAG_CB(frag_head)->first.set_ecn_ce)
1258 INET_ECN_set_ce(frag_head);
1259
1260 list_del(&frag->lru_node);
1261 stt_percpu->frag_mem_used -= FRAG_CB(frag_head)->first.mem_used;
1262 frag->skbs = NULL;
1263 skb = frag_head;
1264 } else {
1265 list_move_tail(&frag->lru_node, &stt_percpu->frag_lru);
1266 skb = NULL;
1267 }
1268
1269 if (copied_skb)
1270 kfree_skb_partial(copied_skb, headstolen);
1271 goto unlock;
1272
1273 unlock_free:
1274 kfree_skb(skb);
1275 skb = NULL;
1276 unlock:
1277 spin_unlock(&stt_percpu->lock);
1278 return skb;
1279 out_free:
1280 kfree_skb(skb);
1281 skb = NULL;
1282 out:
1283 return skb;
1284 }
1285
1286 static bool validate_checksum(struct sk_buff *skb)
1287 {
1288 struct iphdr *iph = ip_hdr(skb);
1289
1290 if (skb_csum_unnecessary(skb))
1291 return true;
1292
1293 if (skb->ip_summed == CHECKSUM_COMPLETE &&
1294 !tcp_v4_check(skb->len, iph->saddr, iph->daddr, skb->csum))
1295 return true;
1296
1297 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, skb->len,
1298 IPPROTO_TCP, 0);
1299
1300 return __skb_checksum_complete(skb) == 0;
1301 }
1302
1303 static bool set_offloads(struct sk_buff *skb)
1304 {
1305 struct stthdr *stth = stt_hdr(skb);
1306 unsigned int gso_type = 0;
1307 int l3_header_size;
1308 int l4_header_size;
1309 u16 csum_offset;
1310 u8 proto_type;
1311
1312 if (stth->vlan_tci)
1313 __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
1314 ntohs(stth->vlan_tci));
1315
1316 if (!(stth->flags & STT_CSUM_PARTIAL)) {
1317 if (stth->flags & STT_CSUM_VERIFIED)
1318 skb->ip_summed = CHECKSUM_UNNECESSARY;
1319 else
1320 skb->ip_summed = CHECKSUM_NONE;
1321
1322 return clear_gso(skb) == 0;
1323 }
1324
1325 proto_type = stth->flags & STT_PROTO_TYPES;
1326
1327 switch (proto_type) {
1328 case (STT_PROTO_IPV4 | STT_PROTO_TCP):
1329 /* TCP/IPv4 */
1330 csum_offset = offsetof(struct tcphdr, check);
1331 gso_type = SKB_GSO_TCPV4;
1332 l3_header_size = sizeof(struct iphdr);
1333 l4_header_size = sizeof(struct tcphdr);
1334 skb->protocol = htons(ETH_P_IP);
1335 break;
1336 case STT_PROTO_TCP:
1337 /* TCP/IPv6 */
1338 csum_offset = offsetof(struct tcphdr, check);
1339 gso_type = SKB_GSO_TCPV6;
1340 l3_header_size = sizeof(struct ipv6hdr);
1341 l4_header_size = sizeof(struct tcphdr);
1342 skb->protocol = htons(ETH_P_IPV6);
1343 break;
1344 case STT_PROTO_IPV4:
1345 /* UDP/IPv4 */
1346 csum_offset = offsetof(struct udphdr, check);
1347 #ifdef HAVE_SKB_GSO_UDP
1348 gso_type = SKB_GSO_UDP;
1349 #endif
1350 l3_header_size = sizeof(struct iphdr);
1351 l4_header_size = sizeof(struct udphdr);
1352 skb->protocol = htons(ETH_P_IP);
1353 break;
1354 default:
1355 /* UDP/IPv6 */
1356 csum_offset = offsetof(struct udphdr, check);
1357 #ifdef HAVE_SKB_GSO_UDP
1358 gso_type = SKB_GSO_UDP;
1359 #endif
1360 l3_header_size = sizeof(struct ipv6hdr);
1361 l4_header_size = sizeof(struct udphdr);
1362 skb->protocol = htons(ETH_P_IPV6);
1363 }
1364
1365 if (unlikely(stth->l4_offset < ETH_HLEN + l3_header_size))
1366 return false;
1367
1368 if (unlikely(!pskb_may_pull(skb, stth->l4_offset + l4_header_size)))
1369 return false;
1370
1371 stth = stt_hdr(skb);
1372
1373 skb->csum_start = skb_headroom(skb) + stth->l4_offset;
1374 skb->csum_offset = csum_offset;
1375 skb->ip_summed = CHECKSUM_PARTIAL;
1376
1377 if (stth->mss) {
1378 if (unlikely(skb_unclone(skb, GFP_ATOMIC)))
1379 return false;
1380
1381 skb_shinfo(skb)->gso_type = gso_type | SKB_GSO_DODGY;
1382 skb_shinfo(skb)->gso_size = ntohs(stth->mss);
1383 skb_shinfo(skb)->gso_segs = 0;
1384 } else {
1385 if (unlikely(clear_gso(skb)))
1386 return false;
1387 }
1388
1389 return true;
1390 }
1391
1392 static void rcv_list(struct net_device *dev, struct sk_buff *skb,
1393 struct metadata_dst *tun_dst)
1394 {
1395 struct sk_buff *next;
1396
1397 do {
1398 next = skb->next;
1399 skb->next = NULL;
1400 if (next) {
1401 ovs_dst_hold((struct dst_entry *)tun_dst);
1402 ovs_skb_dst_set(next, (struct dst_entry *)tun_dst);
1403 }
1404 ovs_ip_tunnel_rcv(dev, skb, tun_dst);
1405 } while ((skb = next));
1406 }
1407
1408 #ifndef USE_UPSTREAM_TUNNEL
1409 static int __stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
1410 {
1411 struct metadata_dst tun_dst;
1412
1413 ovs_ip_tun_rx_dst(&tun_dst, skb, TUNNEL_KEY | TUNNEL_CSUM,
1414 get_unaligned(&stt_hdr(skb)->key), 0);
1415 tun_dst.u.tun_info.key.tp_src = tcp_hdr(skb)->source;
1416 tun_dst.u.tun_info.key.tp_dst = tcp_hdr(skb)->dest;
1417
1418 rcv_list(stt_dev->dev, skb, &tun_dst);
1419 return 0;
1420 }
1421 #else
1422 static int __stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
1423 {
1424 struct metadata_dst *tun_dst;
1425 __be16 flags;
1426 __be64 tun_id;
1427
1428 flags = TUNNEL_KEY | TUNNEL_CSUM;
1429 tun_id = get_unaligned(&stt_hdr(skb)->key);
1430 tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
1431 if (!tun_dst)
1432 return -ENOMEM;
1433 tun_dst->u.tun_info.key.tp_src = tcp_hdr(skb)->source;
1434 tun_dst->u.tun_info.key.tp_dst = tcp_hdr(skb)->dest;
1435
1436 rcv_list(stt_dev->dev, skb, tun_dst);
1437 return 0;
1438 }
1439 #endif
1440
1441 static void stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
1442 {
1443 int err;
1444
1445 if (unlikely(!validate_checksum(skb)))
1446 goto drop;
1447
1448 __skb_pull(skb, sizeof(struct tcphdr));
1449 skb = reassemble(skb);
1450 if (!skb)
1451 return;
1452
1453 if (skb->next && coalesce_skb(&skb))
1454 goto drop;
1455
1456 err = iptunnel_pull_header(skb,
1457 sizeof(struct stthdr) + STT_ETH_PAD,
1458 htons(ETH_P_TEB),
1459 !net_eq(stt_dev->net, dev_net(stt_dev->dev)));
1460 if (unlikely(err))
1461 goto drop;
1462
1463 if (unlikely(stt_hdr(skb)->version != 0))
1464 goto drop;
1465
1466 if (unlikely(!set_offloads(skb)))
1467 goto drop;
1468
1469 if (skb_shinfo(skb)->frag_list && try_to_segment(skb))
1470 goto drop;
1471
1472 err = __stt_rcv(stt_dev, skb);
1473 if (err)
1474 goto drop;
1475 return;
1476 drop:
1477 /* Consume bad packet */
1478 kfree_skb_list(skb);
1479 stt_dev->dev->stats.rx_errors++;
1480 }
1481
1482 static void tcp_sock_release(struct socket *sock)
1483 {
1484 kernel_sock_shutdown(sock, SHUT_RDWR);
1485 sock_release(sock);
1486 }
1487
1488 static int tcp_sock_create4(struct net *net, __be16 port,
1489 struct socket **sockp)
1490 {
1491 struct sockaddr_in tcp_addr;
1492 struct socket *sock = NULL;
1493 int err;
1494
1495 err = sock_create_kern(net, AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
1496 if (err < 0)
1497 goto error;
1498
1499 memset(&tcp_addr, 0, sizeof(tcp_addr));
1500 tcp_addr.sin_family = AF_INET;
1501 tcp_addr.sin_addr.s_addr = htonl(INADDR_ANY);
1502 tcp_addr.sin_port = port;
1503 err = kernel_bind(sock, (struct sockaddr *)&tcp_addr,
1504 sizeof(tcp_addr));
1505 if (err < 0)
1506 goto error;
1507
1508 *sockp = sock;
1509 return 0;
1510
1511 error:
1512 if (sock)
1513 tcp_sock_release(sock);
1514 *sockp = NULL;
1515 return err;
1516 }
1517
1518 static void schedule_clean_percpu(void)
1519 {
1520 schedule_delayed_work(&clean_percpu_wq, CLEAN_PERCPU_INTERVAL);
1521 }
1522
1523 static void clean_percpu(struct work_struct *work)
1524 {
1525 int i;
1526
1527 for_each_possible_cpu(i) {
1528 struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
1529 int j;
1530
1531 for (j = 0; j < FRAG_HASH_ENTRIES; j++) {
1532 struct pkt_frag *frag;
1533
1534 frag = &stt_percpu->frag_hash[j];
1535 if (!frag->skbs ||
1536 time_before(jiffies, frag->timestamp + FRAG_EXP_TIME))
1537 continue;
1538
1539 spin_lock_bh(&stt_percpu->lock);
1540
1541 if (frag->skbs &&
1542 time_after(jiffies, frag->timestamp + FRAG_EXP_TIME))
1543 free_frag(stt_percpu, frag);
1544
1545 spin_unlock_bh(&stt_percpu->lock);
1546 }
1547 }
1548 schedule_clean_percpu();
1549 }
1550
1551 #ifdef HAVE_NF_HOOKFN_ARG_OPS
1552 #define FIRST_PARAM const struct nf_hook_ops *ops
1553 #else
1554 #ifdef HAVE_NF_HOOKFN_ARG_PRIV
1555 #define FIRST_PARAM void *priv
1556 #else
1557 #define FIRST_PARAM unsigned int hooknum
1558 #endif
1559 #endif
1560
1561 #ifdef HAVE_NF_HOOK_STATE
1562 #if RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,0) && RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,0)
1563 /* RHEL nfhook hacks. */
1564 #ifndef __GENKSYMS__
1565 #define LAST_PARAM const struct net_device *in, const struct net_device *out, \
1566 const struct nf_hook_state *state
1567 #else
1568 #define LAST_PARAM const struct net_device *in, const struct net_device *out, \
1569 int (*okfn)(struct sk_buff *)
1570 #endif
1571 #else
1572 #define LAST_PARAM const struct nf_hook_state *state
1573 #endif
1574 #else
1575 #define LAST_PARAM const struct net_device *in, const struct net_device *out, \
1576 int (*okfn)(struct sk_buff *)
1577 #endif
1578
1579 static unsigned int nf_ip_hook(FIRST_PARAM, struct sk_buff *skb, LAST_PARAM)
1580 {
1581 struct stt_dev *stt_dev;
1582 int ip_hdr_len;
1583
1584 if (ip_hdr(skb)->protocol != IPPROTO_TCP)
1585 return NF_ACCEPT;
1586
1587 ip_hdr_len = ip_hdrlen(skb);
1588 if (unlikely(!pskb_may_pull(skb, ip_hdr_len + sizeof(struct tcphdr))))
1589 return NF_ACCEPT;
1590
1591 skb_set_transport_header(skb, ip_hdr_len);
1592
1593 stt_dev = stt_find_up_dev(dev_net(skb->dev), tcp_hdr(skb)->dest);
1594 if (!stt_dev)
1595 return NF_ACCEPT;
1596
1597 __skb_pull(skb, ip_hdr_len);
1598 stt_rcv(stt_dev, skb);
1599 return NF_STOLEN;
1600 }
1601
1602 static struct nf_hook_ops nf_hook_ops __read_mostly = {
1603 .hook = nf_ip_hook,
1604 #ifdef HAVE_NF_HOOKS_OPS_OWNER
1605 .owner = THIS_MODULE,
1606 #endif
1607 .pf = NFPROTO_IPV4,
1608 .hooknum = NF_INET_LOCAL_IN,
1609 .priority = INT_MAX,
1610 };
1611
1612 static int stt_start(struct net *net)
1613 {
1614 struct stt_net *sn = net_generic(net, stt_net_id);
1615 int err;
1616 int i;
1617
1618 if (n_tunnels) {
1619 n_tunnels++;
1620 return 0;
1621 }
1622 get_random_bytes(&frag_hash_seed, sizeof(u32));
1623
1624 stt_percpu_data = alloc_percpu(struct stt_percpu);
1625 if (!stt_percpu_data) {
1626 err = -ENOMEM;
1627 goto error;
1628 }
1629
1630 for_each_possible_cpu(i) {
1631 struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
1632 struct pkt_frag *frag_hash;
1633
1634 spin_lock_init(&stt_percpu->lock);
1635 INIT_LIST_HEAD(&stt_percpu->frag_lru);
1636 get_random_bytes(&per_cpu(pkt_seq_counter, i), sizeof(u32));
1637
1638 frag_hash = kvmalloc_array(sizeof(struct pkt_frag),
1639 FRAG_HASH_ENTRIES,
1640 GFP_KERNEL | __GFP_ZERO);
1641 if (!frag_hash) {
1642 err = -ENOMEM;
1643 goto free_percpu;
1644 }
1645 stt_percpu->frag_hash = frag_hash;
1646 }
1647 schedule_clean_percpu();
1648 n_tunnels++;
1649
1650 if (sn->n_tunnels) {
1651 sn->n_tunnels++;
1652 return 0;
1653 }
1654 #ifdef HAVE_NF_REGISTER_NET_HOOK
1655 /* On kernels which support per-net nf-hooks, nf_register_hook() takes
1656 * the rtnl lock, which results in a deadlock during STT device creation.
1657 * Therefore use this new API.
1658 */
1659
1660 if (sn->nf_hook_reg_done)
1661 goto out;
1662
1663 err = nf_register_net_hook(net, &nf_hook_ops);
1664 if (!err)
1665 sn->nf_hook_reg_done = true;
1666 #else
1667 /* Register the STT hook only on the very first STT device addition. */
1668 if (!list_empty(&nf_hook_ops.list))
1669 goto out;
1670
1671 err = nf_register_hook(&nf_hook_ops);
1672 #endif
1673 if (err)
1674 goto dec_n_tunnel;
1675 out:
1676 sn->n_tunnels++;
1677 return 0;
1678
1679 dec_n_tunnel:
1680 n_tunnels--;
1681 free_percpu:
1682 for_each_possible_cpu(i) {
1683 struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
1684
1685 if (stt_percpu->frag_hash)
1686 kvfree(stt_percpu->frag_hash);
1687 }
1688
1689 free_percpu(stt_percpu_data);
1690
1691 error:
1692 return err;
1693 }
1694
1695 static void stt_cleanup(struct net *net)
1696 {
1697 struct stt_net *sn = net_generic(net, stt_net_id);
1698 int i;
1699
1700 sn->n_tunnels--;
1701 n_tunnels--;
1702 if (n_tunnels)
1703 return;
1704
1705 cancel_delayed_work_sync(&clean_percpu_wq);
1706 for_each_possible_cpu(i) {
1707 struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
1708 int j;
1709
1710 for (j = 0; j < FRAG_HASH_ENTRIES; j++) {
1711 struct pkt_frag *frag;
1712
1713 frag = &stt_percpu->frag_hash[j];
1714 kfree_skb_list(frag->skbs);
1715 }
1716
1717 kvfree(stt_percpu->frag_hash);
1718 }
1719
1720 free_percpu(stt_percpu_data);
1721 }
1722
1723 static netdev_tx_t stt_dev_xmit(struct sk_buff *skb, struct net_device *dev)
1724 {
1725 #ifdef USE_UPSTREAM_TUNNEL
1726 return ovs_stt_xmit(skb);
1727 #else
1728 /* Drop all packets coming from the networking stack. The OVS CB is
1729 * not initialized for these packets.
1730 */
1731 dev_kfree_skb(skb);
1732 dev->stats.tx_dropped++;
1733 return NETDEV_TX_OK;
1734 #endif
1735 }
1736
1737 /* Set up stats when the device is created. */
1738 static int stt_init(struct net_device *dev)
1739 {
1740 dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1741 if (!dev->tstats)
1742 return -ENOMEM;
1743
1744 return 0;
1745 }
1746
1747 static void stt_uninit(struct net_device *dev)
1748 {
1749 free_percpu(dev->tstats);
1750 }
1751
1752 static int stt_open(struct net_device *dev)
1753 {
1754 struct stt_dev *stt = netdev_priv(dev);
1755 struct net *net = stt->net;
1756 struct stt_net *sn = net_generic(net, stt_net_id);
1757 int err;
1758
1759 err = stt_start(net);
1760 if (err)
1761 return err;
1762
1763 err = tcp_sock_create4(net, stt->dst_port, &stt->sock);
1764 if (err)
1765 return err;
1766 list_add_rcu(&stt->up_next, &sn->stt_up_list);
1767 return 0;
1768 }
1769
1770 static int stt_stop(struct net_device *dev)
1771 {
1772 struct stt_dev *stt_dev = netdev_priv(dev);
1773 struct net *net = stt_dev->net;
1774
1775 list_del_rcu(&stt_dev->up_next);
1776 synchronize_net();
1777 tcp_sock_release(stt_dev->sock);
1778 stt_dev->sock = NULL;
1779 stt_cleanup(net);
1780 return 0;
1781 }
1782
1783 static int __stt_change_mtu(struct net_device *dev, int new_mtu, bool strict)
1784 {
1785 int max_mtu = IP_MAX_MTU - STT_HEADER_LEN - sizeof(struct iphdr)
1786 - dev->hard_header_len;
1787
1788 if (new_mtu < 68)
1789 return -EINVAL;
1790
1791 if (new_mtu > max_mtu) {
1792 if (strict)
1793 return -EINVAL;
1794
1795 new_mtu = max_mtu;
1796 }
1797
1798 dev->mtu = new_mtu;
1799 return 0;
1800 }
1801
1802 static int stt_change_mtu(struct net_device *dev, int new_mtu)
1803 {
1804 return __stt_change_mtu(dev, new_mtu, true);
1805 }
1806
1807 int ovs_stt_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
1808 {
1809 struct ip_tunnel_info *info = skb_tunnel_info(skb);
1810 struct stt_dev *stt_dev = netdev_priv(dev);
1811 struct net *net = stt_dev->net;
1812 __be16 dport = stt_dev->dst_port;
1813 __be16 sport;
1814 struct flowi4 fl4;
1815 struct rtable *rt;
1816
1817 if (ip_tunnel_info_af(info) != AF_INET)
1818 return -EINVAL;
1819
1820 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
1821 rt = stt_get_rt(skb, dev, &fl4, &info->key, dport, sport);
1822 if (IS_ERR(rt))
1823 return PTR_ERR(rt);
1824
1825 ip_rt_put(rt);
1826
1827 info->key.u.ipv4.src = fl4.saddr;
1828 info->key.tp_src = sport;
1829 info->key.tp_dst = dport;
1830 return 0;
1831 }
1832 EXPORT_SYMBOL_GPL(ovs_stt_fill_metadata_dst);
1833
1834 static const struct net_device_ops stt_netdev_ops = {
1835 .ndo_init = stt_init,
1836 .ndo_uninit = stt_uninit,
1837 .ndo_open = stt_open,
1838 .ndo_stop = stt_stop,
1839 .ndo_start_xmit = stt_dev_xmit,
1840 .ndo_get_stats64 = ip_tunnel_get_stats64,
1841 #ifdef HAVE_RHEL7_MAX_MTU
1842 .ndo_size = sizeof(struct net_device_ops),
1843 .extended.ndo_change_mtu = stt_change_mtu,
1844 #else
1845 .ndo_change_mtu = stt_change_mtu,
1846 #endif
1847 .ndo_validate_addr = eth_validate_addr,
1848 .ndo_set_mac_address = eth_mac_addr,
1849 #ifdef USE_UPSTREAM_TUNNEL
1850 #ifdef HAVE_NDO_FILL_METADATA_DST
1851 .ndo_fill_metadata_dst = stt_fill_metadata_dst,
1852 #endif
1853 #endif
1854 };
1855
1856 static void stt_get_drvinfo(struct net_device *dev,
1857 struct ethtool_drvinfo *drvinfo)
1858 {
1859 strlcpy(drvinfo->version, STT_NETDEV_VER, sizeof(drvinfo->version));
1860 strlcpy(drvinfo->driver, "stt", sizeof(drvinfo->driver));
1861 }
1862
1863 static const struct ethtool_ops stt_ethtool_ops = {
1864 .get_drvinfo = stt_get_drvinfo,
1865 .get_link = ethtool_op_get_link,
1866 };
1867
1868 /* Info for udev that this is a virtual tunnel endpoint. */
1869 static struct device_type stt_type = {
1870 .name = "stt",
1871 };
1872
1873 /* Initialize the device structure. */
1874 static void stt_setup(struct net_device *dev)
1875 {
1876 ether_setup(dev);
1877
1878 dev->netdev_ops = &stt_netdev_ops;
1879 dev->ethtool_ops = &stt_ethtool_ops;
1880 #ifndef HAVE_NEEDS_FREE_NETDEV
1881 dev->destructor = free_netdev;
1882 #else
1883 dev->needs_free_netdev = true;
1884 #endif
1885
1886 SET_NETDEV_DEVTYPE(dev, &stt_type);
1887
1888 dev->features |= NETIF_F_LLTX | NETIF_F_NETNS_LOCAL;
1889 dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM;
1890 dev->features |= NETIF_F_RXCSUM;
1891 dev->features |= NETIF_F_GSO_SOFTWARE;
1892
1893 dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
1894 dev->hw_features |= NETIF_F_GSO_SOFTWARE;
1895
1896 #ifdef USE_UPSTREAM_TUNNEL
1897 netif_keep_dst(dev);
1898 #endif
1899 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
1900 eth_hw_addr_random(dev);
1901 }
1902
1903 static const struct nla_policy stt_policy[IFLA_STT_MAX + 1] = {
1904 [IFLA_STT_PORT] = { .type = NLA_U16 },
1905 };
1906
1907 #ifdef HAVE_RTNLOP_VALIDATE_WITH_EXTACK
1908 static int stt_validate(struct nlattr *tb[], struct nlattr *data[],
1909 struct netlink_ext_ack __always_unused *extack)
1910 #else
1911 static int stt_validate(struct nlattr *tb[], struct nlattr *data[])
1912 #endif
1913 {
1914 if (tb[IFLA_ADDRESS]) {
1915 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1916 return -EINVAL;
1917
1918 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1919 return -EADDRNOTAVAIL;
1920 }
1921
1922 return 0;
1923 }
1924
1925 static struct stt_dev *find_dev(struct net *net, __be16 dst_port)
1926 {
1927 struct stt_net *sn = net_generic(net, stt_net_id);
1928 struct stt_dev *dev;
1929
1930 list_for_each_entry(dev, &sn->stt_list, next) {
1931 if (dev->dst_port == dst_port)
1932 return dev;
1933 }
1934 return NULL;
1935 }
1936
1937 static int stt_configure(struct net *net, struct net_device *dev,
1938 __be16 dst_port)
1939 {
1940 struct stt_net *sn = net_generic(net, stt_net_id);
1941 struct stt_dev *stt = netdev_priv(dev);
1942 int err;
1943
1944 stt->net = net;
1945 stt->dev = dev;
1946
1947 stt->dst_port = dst_port;
1948
1949 if (find_dev(net, dst_port))
1950 return -EBUSY;
1951
1952 err = __stt_change_mtu(dev, IP_MAX_MTU, false);
1953 if (err)
1954 return err;
1955
1956 err = register_netdevice(dev);
1957 if (err)
1958 return err;
1959
1960 list_add(&stt->next, &sn->stt_list);
1961 return 0;
1962 }
1963
1964 #ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS
1965 static int stt_newlink(struct net *net, struct net_device *dev,
1966 struct nlattr *tb[], struct nlattr *data[],
1967 struct netlink_ext_ack __always_unused *extack)
1968 #else
1969 static int stt_newlink(struct net *net, struct net_device *dev,
1970 struct nlattr *tb[], struct nlattr *data[])
1971 #endif
1972 {
1973 __be16 dst_port = htons(STT_DST_PORT);
1974
1975 if (data[IFLA_STT_PORT])
1976 dst_port = nla_get_be16(data[IFLA_STT_PORT]);
1977
1978 return stt_configure(net, dev, dst_port);
1979 }
1980
1981 static void stt_dellink(struct net_device *dev, struct list_head *head)
1982 {
1983 struct stt_dev *stt = netdev_priv(dev);
1984
1985 list_del(&stt->next);
1986 unregister_netdevice_queue(dev, head);
1987 }
1988
1989 static size_t stt_get_size(const struct net_device *dev)
1990 {
1991 return nla_total_size(sizeof(__be32)); /* IFLA_STT_PORT */
1992 }
1993
1994 static int stt_fill_info(struct sk_buff *skb, const struct net_device *dev)
1995 {
1996 struct stt_dev *stt = netdev_priv(dev);
1997
1998 if (nla_put_be16(skb, IFLA_STT_PORT, stt->dst_port))
1999 goto nla_put_failure;
2000
2001 return 0;
2002
2003 nla_put_failure:
2004 return -EMSGSIZE;
2005 }
2006
2007 static struct rtnl_link_ops stt_link_ops __read_mostly = {
2008 .kind = "stt",
2009 .maxtype = IFLA_STT_MAX,
2010 .policy = stt_policy,
2011 .priv_size = sizeof(struct stt_dev),
2012 .setup = stt_setup,
2013 .validate = stt_validate,
2014 .newlink = stt_newlink,
2015 .dellink = stt_dellink,
2016 .get_size = stt_get_size,
2017 .fill_info = stt_fill_info,
2018 };
2019
2020 struct net_device *ovs_stt_dev_create_fb(struct net *net, const char *name,
2021 u8 name_assign_type, u16 dst_port)
2022 {
2023 struct nlattr *tb[IFLA_MAX + 1];
2024 struct net_device *dev;
2025 int err;
2026
2027 memset(tb, 0, sizeof(tb));
2028 dev = rtnl_create_link(net, (char *) name, name_assign_type,
2029 &stt_link_ops, tb);
2030 if (IS_ERR(dev))
2031 return dev;
2032
2033 err = stt_configure(net, dev, htons(dst_port));
2034 if (err) {
2035 free_netdev(dev);
2036 return ERR_PTR(err);
2037 }
2038 return dev;
2039 }
2040 EXPORT_SYMBOL_GPL(ovs_stt_dev_create_fb);
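
/* Minimal caller sketch for ovs_stt_dev_create_fb(), assuming rtnl_lock()
 * is already held (as it is for the other OVS fallback tunnel devices); the
 * device name and port below are only examples:
 *
 *	struct net_device *dev;
 *
 *	dev = ovs_stt_dev_create_fb(net, "stt_sys_7471", NET_NAME_USER, 7471);
 *	if (IS_ERR(dev))
 *		return PTR_ERR(dev);
 *
 * The caller is then responsible for bringing the device up and attaching
 * it to its vport.
 */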
2041
2042 static int stt_init_net(struct net *net)
2043 {
2044 struct stt_net *sn = net_generic(net, stt_net_id);
2045
2046 INIT_LIST_HEAD(&sn->stt_list);
2047 INIT_LIST_HEAD(&sn->stt_up_list);
2048 #ifdef HAVE_NF_REGISTER_NET_HOOK
2049 sn->nf_hook_reg_done = false;
2050 #endif
2051 return 0;
2052 }
2053
2054 static void stt_exit_net(struct net *net)
2055 {
2056 struct stt_net *sn = net_generic(net, stt_net_id);
2057 struct stt_dev *stt, *next;
2058 struct net_device *dev, *aux;
2059 LIST_HEAD(list);
2060
2061 #ifdef HAVE_NF_REGISTER_NET_HOOK
2062 /* Ideally this should be done from stt_stop(), but on some kernels
2063 * the nf-unregister operation needs the RTNL lock, which can cause a
2064 * deadlock, so it is done from here. */
2065 if (sn->nf_hook_reg_done)
2066 nf_unregister_net_hook(net, &nf_hook_ops);
2067 #endif
2068
2069 rtnl_lock();
2070
2071 /* gather any stt devices that were moved into this ns */
2072 for_each_netdev_safe(net, dev, aux)
2073 if (dev->rtnl_link_ops == &stt_link_ops)
2074 unregister_netdevice_queue(dev, &list);
2075
2076 list_for_each_entry_safe(stt, next, &sn->stt_list, next) {
2077 /* If stt->dev is in the same netns, it was already added
2078 * to the list by the previous loop.
2079 */
2080 if (!net_eq(dev_net(stt->dev), net))
2081 unregister_netdevice_queue(stt->dev, &list);
2082 }
2083
2084 /* unregister the devices gathered above */
2085 unregister_netdevice_many(&list);
2086 rtnl_unlock();
2087 }
2088
2089 static struct pernet_operations stt_net_ops = {
2090 .init = stt_init_net,
2091 .exit = stt_exit_net,
2092 .id = &stt_net_id,
2093 .size = sizeof(struct stt_net),
2094 };
2095
2096 int stt_init_module(void)
2097 {
2098 int rc;
2099
2100 rc = register_pernet_subsys(&stt_net_ops);
2101 if (rc)
2102 goto out1;
2103
2104 rc = rtnl_link_register(&stt_link_ops);
2105 if (rc)
2106 goto out2;
2107
2108 #ifdef HAVE_LIST_IN_NF_HOOK_OPS
2109 INIT_LIST_HEAD(&nf_hook_ops.list);
2110 #endif
2111 pr_info("STT tunneling driver\n");
2112 return 0;
2113 out2:
2114 unregister_pernet_subsys(&stt_net_ops);
2115 out1:
2116 pr_err("Error while initializing STT %d\n", rc);
2117 return rc;
2118 }
2119
2120 void stt_cleanup_module(void)
2121 {
2122 #ifndef HAVE_NF_REGISTER_NET_HOOK
2123 if (!list_empty(&nf_hook_ops.list))
2124 nf_unregister_hook(&nf_hook_ops);
2125 #endif
2126 rtnl_link_unregister(&stt_link_ops);
2127 unregister_pernet_subsys(&stt_net_ops);
2128 }
2129 #endif