datapath/linux/compat/stt.c

   1 /*
   2  * Stateless TCP Tunnel (STT) vport.
   3  *
   4  * Copyright (c) 2015 Nicira, Inc.
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License
   8  * as published by the Free Software Foundation; either version
   9  * 2 of the License, or (at your option) any later version.
  10  */
  11
  12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  13 #include <asm/unaligned.h>
  14
  15 #include <linux/delay.h>
  16 #include <linux/flex_array.h>
  17 #include <linux/if.h>
  18 #include <linux/if_vlan.h>
  19 #include <linux/ip.h>
  20 #include <linux/ipv6.h>
  21 #include <linux/jhash.h>
  22 #include <linux/list.h>
  23 #include <linux/log2.h>
  24 #include <linux/module.h>
  25 #include <linux/net.h>
  26 #include <linux/netfilter.h>
  27 #include <linux/percpu.h>
  28 #include <linux/skbuff.h>
  29 #include <linux/tcp.h>
  30 #include <linux/workqueue.h>
  31
  32 #include <net/dst_metadata.h>
  33 #include <net/icmp.h>
  34 #include <net/inet_ecn.h>
  35 #include <net/ip.h>
  36 #include <net/ip_tunnels.h>
  37 #include <net/ip6_checksum.h>
  38 #include <net/net_namespace.h>
  39 #include <net/netns/generic.h>
  40 #include <net/sock.h>
  41 #include <net/stt.h>
  42 #include <net/tcp.h>
  43 #include <net/udp.h>
  44
  45 #include "gso.h"
  46 #include "compat.h"
  47
  48 #define STT_NETDEV_VER  "0.1"
  49 #define STT_DST_PORT 7471
  50
  51 #ifdef OVS_STT
  52 #ifdef CONFIG_SLUB
  53 /*
  54  * We saw better performance with skipping zero copy in case of SLUB.
  55  * So skip zero copy for SLUB case.
  56  */
  57 #define SKIP_ZERO_COPY
  58 #endif
  59
  60 #define STT_VER 0
  61
  62 /* @list: Per-net list of STT ports.
  63  * @rcv: The callback is called on STT packet recv, STT reassembly can generate
  64  * multiple packets, in this case first packet has tunnel outer header, rest
  65  * of the packets are inner packet segments with no stt header.
  66  * @rcv_data: user data.
  67  * @sock: Fake TCP socket for the STT port.
  68  */
  69 struct stt_dev {
  70         struct net_device       *dev;
  71         struct net              *net;
  72         struct list_head        next;
  73         struct list_head        up_next;
  74         struct socket           *sock;
  75         __be16                  dst_port;
  76 };
  77
  78 #define STT_CSUM_VERIFIED       BIT(0)
  79 #define STT_CSUM_PARTIAL        BIT(1)
  80 #define STT_PROTO_IPV4          BIT(2)
  81 #define STT_PROTO_TCP           BIT(3)
  82 #define STT_PROTO_TYPES         (STT_PROTO_IPV4 | STT_PROTO_TCP)
  83
  84 #define SUPPORTED_GSO_TYPES (SKB_GSO_TCPV4 | SKB_GSO_UDP | SKB_GSO_DODGY | \
  85                              SKB_GSO_TCPV6)
  86
  87 /* The length and offset of a fragment are encoded in the sequence number.
  88  * STT_SEQ_LEN_SHIFT is the left shift needed to store the length.
  89  * STT_SEQ_OFFSET_MASK is the mask to extract the offset.
  90  */
  91 #define STT_SEQ_LEN_SHIFT 16
  92 #define STT_SEQ_OFFSET_MASK (BIT(STT_SEQ_LEN_SHIFT) - 1)
  93
  94 /* The maximum amount of memory used to store packets waiting to be reassembled
  95  * on a given CPU.  Once this threshold is exceeded we will begin freeing the
  96  * least recently used fragments.
  97  */
  98 #define REASM_HI_THRESH (4 * 1024 * 1024)
  99 /* The target for the high memory evictor.  Once we have exceeded
 100  * REASM_HI_THRESH, we will continue freeing fragments until we hit
 101  * this limit.
 102  */
 103 #define REASM_LO_THRESH (3 * 1024 * 1024)
 104 /* The length of time a given packet has to be reassembled from the time the
 105  * first fragment arrives.  Once this limit is exceeded it becomes available
 106  * for cleaning.
 107  */
 108 #define FRAG_EXP_TIME (30 * HZ)
 109 /* Number of hash entries.  Each entry has only a single slot to hold a packet
 110  * so if there are collisions, we will drop packets.  This is allocated
 111  * per-cpu and each entry consists of struct pkt_frag.
 112  */
 113 #define FRAG_HASH_SHIFT         8
 114 #define FRAG_HASH_ENTRIES       BIT(FRAG_HASH_SHIFT)
 115 #define FRAG_HASH_SEGS          ((sizeof(u32) * 8) / FRAG_HASH_SHIFT)
 116
 117 #define CLEAN_PERCPU_INTERVAL (30 * HZ)
 118
 119 struct pkt_key {
 120         __be32 saddr;
 121         __be32 daddr;
 122         __be32 pkt_seq;
 123         u32 mark;
 124 };
 125
 126 struct pkt_frag {
 127         struct sk_buff *skbs;
 128         unsigned long timestamp;
 129         struct list_head lru_node;
 130         struct pkt_key key;
 131 };
 132
 133 struct stt_percpu {
 134         struct flex_array *frag_hash;
 135         struct list_head frag_lru;
 136         unsigned int frag_mem_used;
 137
 138         /* Protect frags table. */
 139         spinlock_t lock;
 140 };
 141
 142 struct first_frag {
 143         struct sk_buff *last_skb;
 144         unsigned int mem_used;
 145         u16 tot_len;
 146         u16 rcvd_len;
 147         bool set_ecn_ce;
 148 };
 149
 150 struct frag_skb_cb {
 151         u16 offset;
 152
 153         /* Only valid for the first skb in the chain. */
 154         struct first_frag first;
 155 };
 156
 157 #define FRAG_CB(skb) ((struct frag_skb_cb *)(skb)->cb)
 158
 159 /* per-network namespace private data for this module */
 160 struct stt_net {
 161         struct list_head stt_list;
 162         struct list_head stt_up_list;   /* Devices which are in IFF_UP state. */
 163         int n_tunnels;
 164 #ifdef HAVE_NF_REGISTER_NET_HOOK
 165         bool nf_hook_reg_done;
 166 #endif
 167 };
 168
 169 static int stt_net_id;
 170
 171 static struct stt_percpu __percpu *stt_percpu_data __read_mostly;
 172 static u32 frag_hash_seed __read_mostly;
 173
 174 /* Protects sock-hash and refcounts. */
 175 static DEFINE_MUTEX(stt_mutex);
 176
 177 static int n_tunnels;
 178 static DEFINE_PER_CPU(u32, pkt_seq_counter);
 179
 180 static void clean_percpu(struct work_struct *work);
 181 static DECLARE_DELAYED_WORK(clean_percpu_wq, clean_percpu);
 182
 183 static struct stt_dev *stt_find_up_dev(struct net *net, __be16 port)
 184 {
 185         struct stt_net *sn = net_generic(net, stt_net_id);
 186         struct stt_dev *stt_dev;
 187
 188         list_for_each_entry_rcu(stt_dev, &sn->stt_up_list, up_next) {
 189                 if (stt_dev->dst_port == port)
 190                         return stt_dev;
 191         }
 192         return NULL;
 193 }
 194
 195 static __be32 ack_seq(void)
 196 {
 197 #if NR_CPUS <= 65536
 198         u32 pkt_seq, ack;
 199
 200         pkt_seq = this_cpu_read(pkt_seq_counter);
 201         ack = pkt_seq << ilog2(NR_CPUS) | smp_processor_id();
 202         this_cpu_inc(pkt_seq_counter);
 203
 204         return (__force __be32)ack;
 205 #else
 206 #error "Support for greater than 64k CPUs not implemented"
 207 #endif
 208 }
 209
 210 static int clear_gso(struct sk_buff *skb)
 211 {
 212         struct skb_shared_info *shinfo = skb_shinfo(skb);
 213         int err;
 214
 215         if (shinfo->gso_type == 0 && shinfo->gso_size == 0 &&
 216             shinfo->gso_segs == 0)
 217                 return 0;
 218
 219         err = skb_unclone(skb, GFP_ATOMIC);
 220         if (unlikely(err))
 221                 return err;
 222
 223         shinfo = skb_shinfo(skb);
 224         shinfo->gso_type = 0;
 225         shinfo->gso_size = 0;
 226         shinfo->gso_segs = 0;
 227         return 0;
 228 }
 229
 230 static void copy_skb_metadata(struct sk_buff *to, struct sk_buff *from)
 231 {
 232         to->protocol = from->protocol;
 233         to->tstamp = from->tstamp;
 234         to->priority = from->priority;
 235         to->mark = from->mark;
 236         to->vlan_tci = from->vlan_tci;
 237 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
 238         to->vlan_proto = from->vlan_proto;
 239 #endif
 240         skb_copy_secmark(to, from);
 241 }
 242
 243 static void update_headers(struct sk_buff *skb, bool head,
 244                                unsigned int l4_offset, unsigned int hdr_len,
 245                                bool ipv4, u32 tcp_seq)
 246 {
 247         u16 old_len, new_len;
 248         __be32 delta;
 249         struct tcphdr *tcph;
 250         int gso_size;
 251
 252         if (ipv4) {
 253                 struct iphdr *iph = (struct iphdr *)(skb->data + ETH_HLEN);
 254
 255                 old_len = ntohs(iph->tot_len);
 256                 new_len = skb->len - ETH_HLEN;
 257                 iph->tot_len = htons(new_len);
 258
 259                 ip_send_check(iph);
 260         } else {
 261                 struct ipv6hdr *ip6h = (struct ipv6hdr *)(skb->data + ETH_HLEN);
 262
 263                 old_len = ntohs(ip6h->payload_len);
 264                 new_len = skb->len - ETH_HLEN - sizeof(struct ipv6hdr);
 265                 ip6h->payload_len = htons(new_len);
 266         }
 267
 268         tcph = (struct tcphdr *)(skb->data + l4_offset);
 269         if (!head) {
 270                 tcph->seq = htonl(tcp_seq);
 271                 tcph->cwr = 0;
 272         }
 273
 274         if (skb->next) {
 275                 tcph->fin = 0;
 276                 tcph->psh = 0;
 277         }
 278
 279         delta = htonl(~old_len + new_len);
 280         tcph->check = ~csum_fold((__force __wsum)((__force u32)tcph->check +
 281                                  (__force u32)delta));
 282
 283         gso_size = skb_shinfo(skb)->gso_size;
 284         if (gso_size && skb->len - hdr_len <= gso_size)
 285                 BUG_ON(clear_gso(skb));
 286 }
 287
 288 static bool can_segment(struct sk_buff *head, bool ipv4, bool tcp, bool csum_partial)
 289 {
 290         /* If no offloading is in use then we don't have enough information
 291          * to process the headers.
 292          */
 293         if (!csum_partial)
 294                 goto linearize;
 295
 296         /* Handling UDP packets requires IP fragmentation, which means that
 297          * the L4 checksum can no longer be calculated by hardware (since the
 298          * fragments are in different packets.  If we have to compute the
 299          * checksum it's faster just to linearize and large UDP packets are
 300          * pretty uncommon anyways, so it's not worth dealing with for now.
 301          */
 302         if (!tcp)
 303                 goto linearize;
 304
 305         if (ipv4) {
 306                 struct iphdr *iph = (struct iphdr *)(head->data + ETH_HLEN);
 307
 308                 /* It's difficult to get the IP IDs exactly right here due to
 309                  * varying segment sizes and potentially multiple layers of
 310                  * segmentation.  IP ID isn't important when DF is set and DF
 311                  * is generally set for TCP packets, so just linearize if it's
 312                  * not.
 313                  */
 314                 if (!(iph->frag_off & htons(IP_DF)))
 315                         goto linearize;
 316         } else {
 317                 struct ipv6hdr *ip6h = (struct ipv6hdr *)(head->data + ETH_HLEN);
 318
 319                 /* Jumbograms require more processing to update and we'll
 320                  * probably never see them, so just linearize.
 321                  */
 322                 if (ip6h->payload_len == 0)
 323                         goto linearize;
 324         }
 325         return true;
 326
 327 linearize:
 328         return false;
 329 }
 330
 331 static int copy_headers(struct sk_buff *head, struct sk_buff *frag,
 332                             int hdr_len)
 333 {
 334         u16 csum_start;
 335
 336         if (skb_cloned(frag) || skb_headroom(frag) < hdr_len) {
 337                 int extra_head = hdr_len - skb_headroom(frag);
 338
 339                 extra_head = extra_head > 0 ? extra_head : 0;
 340                 if (unlikely(pskb_expand_head(frag, extra_head, 0,
 341                                               GFP_ATOMIC)))
 342                         return -ENOMEM;
 343         }
 344
 345         memcpy(__skb_push(frag, hdr_len), head->data, hdr_len);
 346
 347         csum_start = head->csum_start - skb_headroom(head);
 348         frag->csum_start = skb_headroom(frag) + csum_start;
 349         frag->csum_offset = head->csum_offset;
 350         frag->ip_summed = head->ip_summed;
 351
 352         skb_shinfo(frag)->gso_size = skb_shinfo(head)->gso_size;
 353         skb_shinfo(frag)->gso_type = skb_shinfo(head)->gso_type;
 354         skb_shinfo(frag)->gso_segs = 0;
 355
 356         copy_skb_metadata(frag, head);
 357         return 0;
 358 }
 359
 360 static int skb_list_segment(struct sk_buff *head, bool ipv4, int l4_offset)
 361 {
 362         struct sk_buff *skb;
 363         struct tcphdr *tcph;
 364         int seg_len;
 365         int hdr_len;
 366         int tcp_len;
 367         u32 seq;
 368
 369         if (unlikely(!pskb_may_pull(head, l4_offset + sizeof(*tcph))))
 370                 return -ENOMEM;
 371
 372         tcph = (struct tcphdr *)(head->data + l4_offset);
 373         tcp_len = tcph->doff * 4;
 374         hdr_len = l4_offset + tcp_len;
 375
 376         if (unlikely((tcp_len < sizeof(struct tcphdr)) ||
 377                      (head->len < hdr_len)))
 378                 return -EINVAL;
 379
 380         if (unlikely(!pskb_may_pull(head, hdr_len)))
 381                 return -ENOMEM;
 382
 383         tcph = (struct tcphdr *)(head->data + l4_offset);
 384         /* Update header of each segment. */
 385         seq = ntohl(tcph->seq);
 386         seg_len = skb_pagelen(head) - hdr_len;
 387
 388         skb = skb_shinfo(head)->frag_list;
 389         skb_shinfo(head)->frag_list = NULL;
 390         head->next = skb;
 391         for (; skb; skb = skb->next) {
 392                 int err;
 393
 394                 head->len -= skb->len;
 395                 head->data_len -= skb->len;
 396                 head->truesize -= skb->truesize;
 397
 398                 seq += seg_len;
 399                 seg_len = skb->len;
 400                 err = copy_headers(head, skb, hdr_len);
 401                 if (err)
 402                         return err;
 403                 update_headers(skb, false, l4_offset, hdr_len, ipv4, seq);
 404         }
 405         update_headers(head, true, l4_offset, hdr_len, ipv4, 0);
 406         return 0;
 407 }
 408
 409 #ifndef SKIP_ZERO_COPY
 410 static struct sk_buff *normalize_frag_list(struct sk_buff *head,
 411                                            struct sk_buff **skbp)
 412 {
 413         struct sk_buff *skb = *skbp;
 414         struct sk_buff *last;
 415
 416         do {
 417                 struct sk_buff *frags;
 418
 419                 if (skb_shared(skb)) {
 420                         struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
 421
 422                         if (unlikely(!nskb))
 423                                 return ERR_PTR(-ENOMEM);
 424
 425                         nskb->next = skb->next;
 426                         consume_skb(skb);
 427                         skb = nskb;
 428                         *skbp = skb;
 429                 }
 430
 431                 if (head) {
 432                         head->len -= skb->len;
 433                         head->data_len -= skb->len;
 434                         head->truesize -= skb->truesize;
 435                 }
 436
 437                 frags = skb_shinfo(skb)->frag_list;
 438                 if (frags) {
 439                         int err;
 440
 441                         err = skb_unclone(skb, GFP_ATOMIC);
 442                         if (unlikely(err))
 443                                 return ERR_PTR(err);
 444
 445                         last = normalize_frag_list(skb, &frags);
 446                         if (IS_ERR(last))
 447                                 return last;
 448
 449                         skb_shinfo(skb)->frag_list = NULL;
 450                         last->next = skb->next;
 451                         skb->next = frags;
 452                 } else {
 453                         last = skb;
 454                 }
 455
 456                 skbp = &skb->next;
 457         } while ((skb = skb->next));
 458
 459         return last;
 460 }
 461
 462 /* Takes a linked list of skbs, which potentially contain frag_list
 463  * (whose members in turn potentially contain frag_lists, etc.) and
 464  * converts them into a single linear linked list.
 465  */
 466 static int straighten_frag_list(struct sk_buff **skbp)
 467 {
 468         struct sk_buff *err_skb;
 469
 470         err_skb = normalize_frag_list(NULL, skbp);
 471         if (IS_ERR(err_skb))
 472                 return PTR_ERR(err_skb);
 473
 474         return 0;
 475 }
 476
 477 static int coalesce_skb(struct sk_buff **headp)
 478 {
 479         struct sk_buff *frag, *head, *prev;
 480         int err;
 481
 482         err = straighten_frag_list(headp);
 483         if (unlikely(err))
 484                 return err;
 485         head = *headp;
 486
 487         /* Coalesce frag list. */
 488         prev = head;
 489         for (frag = head->next; frag; frag = frag->next) {
 490                 bool headstolen;
 491                 int delta;
 492
 493                 if (unlikely(skb_unclone(prev, GFP_ATOMIC)))
 494                         return -ENOMEM;
 495
 496                 if (!skb_try_coalesce(prev, frag, &headstolen, &delta)) {
 497                         prev = frag;
 498                         continue;
 499                 }
 500
 501                 prev->next = frag->next;
 502                 frag->len = 0;
 503                 frag->data_len = 0;
 504                 frag->truesize -= delta;
 505                 kfree_skb_partial(frag, headstolen);
 506                 frag = prev;
 507         }
 508
 509         if (!head->next)
 510                 return 0;
 511
 512         for (frag = head->next; frag; frag = frag->next) {
 513                 head->len += frag->len;
 514                 head->data_len += frag->len;
 515                 head->truesize += frag->truesize;
 516         }
 517
 518         skb_shinfo(head)->frag_list = head->next;
 519         head->next = NULL;
 520         return 0;
 521 }
 522 #else
 523 static int coalesce_skb(struct sk_buff **headp)
 524 {
 525         struct sk_buff *frag, *head = *headp, *next;
 526         int delta = FRAG_CB(head)->first.tot_len - skb_headlen(head);
 527         int err;
 528
 529         if (unlikely(!head->next))
 530                 return 0;
 531
 532         err = pskb_expand_head(head, 0, delta, GFP_ATOMIC);
 533         if (unlikely(err))
 534                 return err;
 535
 536         if (unlikely(!__pskb_pull_tail(head, head->data_len)))
 537                 BUG();
 538
 539         for (frag = head->next; frag; frag = next) {
 540                 skb_copy_bits(frag, 0, skb_put(head, frag->len), frag->len);
 541                 next = frag->next;
 542                 kfree_skb(frag);
 543         }
 544
 545         head->next = NULL;
 546         head->truesize = SKB_TRUESIZE(head->len);
 547         return 0;
 548 }
 549 #endif
 550
 551 static int __try_to_segment(struct sk_buff *skb, bool csum_partial,
 552                             bool ipv4, bool tcp, int l4_offset)
 553 {
 554         if (can_segment(skb, ipv4, tcp, csum_partial))
 555                 return skb_list_segment(skb, ipv4, l4_offset);
 556         else
 557                 return skb_linearize(skb);
 558 }
 559
 560 static int try_to_segment(struct sk_buff *skb)
 561 {
 562 #ifdef SKIP_ZERO_COPY
 563         /* coalesce_skb() since does not generate frag-list no need to
 564          * linearize it here.
 565          */
 566         return 0;
 567 #else
 568         struct stthdr *stth = stt_hdr(skb);
 569         bool csum_partial = !!(stth->flags & STT_CSUM_PARTIAL);
 570         bool ipv4 = !!(stth->flags & STT_PROTO_IPV4);
 571         bool tcp = !!(stth->flags & STT_PROTO_TCP);
 572         int l4_offset = stth->l4_offset;
 573
 574         return __try_to_segment(skb, csum_partial, ipv4, tcp, l4_offset);
 575 #endif
 576 }
 577
 578 static int segment_skb(struct sk_buff **headp, bool csum_partial,
 579                        bool ipv4, bool tcp, int l4_offset)
 580 {
 581 #ifndef SKIP_ZERO_COPY
 582         int err;
 583
 584         err = coalesce_skb(headp);
 585         if (err)
 586                 return err;
 587 #endif
 588
 589         if (skb_shinfo(*headp)->frag_list)
 590                 return __try_to_segment(*headp, csum_partial,
 591                                         ipv4, tcp, l4_offset);
 592         return 0;
 593 }
 594
 595 static int __push_stt_header(struct sk_buff *skb, __be64 tun_id,
 596                              __be16 s_port, __be16 d_port,
 597                              __be32 saddr, __be32 dst,
 598                              __be16 l3_proto, u8 l4_proto,
 599                              int dst_mtu)
 600 {
 601         int data_len = skb->len + sizeof(struct stthdr) + STT_ETH_PAD;
 602         unsigned short encap_mss;
 603         struct tcphdr *tcph;
 604         struct stthdr *stth;
 605
 606         skb_push(skb, STT_HEADER_LEN);
 607         skb_reset_transport_header(skb);
 608         tcph = tcp_hdr(skb);
 609         memset(tcph, 0, STT_HEADER_LEN);
 610         stth = stt_hdr(skb);
 611
 612         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 613                 stth->flags |= STT_CSUM_PARTIAL;
 614
 615                 stth->l4_offset = skb->csum_start -
 616                                         (skb_headroom(skb) +
 617                                         STT_HEADER_LEN);
 618
 619                 if (l3_proto == htons(ETH_P_IP))
 620                         stth->flags |= STT_PROTO_IPV4;
 621
 622                 if (l4_proto == IPPROTO_TCP)
 623                         stth->flags |= STT_PROTO_TCP;
 624
 625                 stth->mss = htons(skb_shinfo(skb)->gso_size);
 626         } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
 627                 stth->flags |= STT_CSUM_VERIFIED;
 628         }
 629
 630         stth->vlan_tci = htons(skb->vlan_tci);
 631         skb->vlan_tci = 0;
 632         put_unaligned(tun_id, &stth->key);
 633
 634         tcph->source    = s_port;
 635         tcph->dest      = d_port;
 636         tcph->doff      = sizeof(struct tcphdr) / 4;
 637         tcph->ack       = 1;
 638         tcph->psh       = 1;
 639         tcph->window    = htons(USHRT_MAX);
 640         tcph->seq       = htonl(data_len << STT_SEQ_LEN_SHIFT);
 641         tcph->ack_seq   = ack_seq();
 642         tcph->check     = ~tcp_v4_check(skb->len, saddr, dst, 0);
 643
 644         skb->csum_start = skb_transport_header(skb) - skb->head;
 645         skb->csum_offset = offsetof(struct tcphdr, check);
 646         skb->ip_summed = CHECKSUM_PARTIAL;
 647
 648         encap_mss = dst_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
 649         if (data_len > encap_mss) {
 650                 if (unlikely(skb_unclone(skb, GFP_ATOMIC)))
 651                         return -EINVAL;
 652
 653                 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
 654                 skb_shinfo(skb)->gso_size = encap_mss;
 655                 skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(data_len, encap_mss);
 656         } else {
 657                 if (unlikely(clear_gso(skb)))
 658                         return -EINVAL;
 659         }
 660         return 0;
 661 }
 662
 663 static struct sk_buff *push_stt_header(struct sk_buff *head, __be64 tun_id,
 664                                        __be16 s_port, __be16 d_port,
 665                                        __be32 saddr, __be32 dst,
 666                                        __be16 l3_proto, u8 l4_proto,
 667                                        int dst_mtu)
 668 {
 669         struct sk_buff *skb;
 670
 671         if (skb_shinfo(head)->frag_list) {
 672                 bool ipv4 = (l3_proto == htons(ETH_P_IP));
 673                 bool tcp = (l4_proto == IPPROTO_TCP);
 674                 bool csum_partial = (head->ip_summed == CHECKSUM_PARTIAL);
 675                 int l4_offset = skb_transport_offset(head);
 676
 677                 /* Need to call skb_orphan() to report currect true-size.
 678                  * calling skb_orphan() in this layer is odd but SKB with
 679                  * frag-list should not be associated with any socket, so
 680                  * skb-orphan should be no-op. */
 681                 skb_orphan(head);
 682                 if (unlikely(segment_skb(&head, csum_partial,
 683                                          ipv4, tcp, l4_offset)))
 684                         goto error;
 685         }
 686
 687         for (skb = head; skb; skb = skb->next) {
 688                 if (__push_stt_header(skb, tun_id, s_port, d_port, saddr, dst,
 689                                       l3_proto, l4_proto, dst_mtu))
 690                         goto error;
 691         }
 692
 693         return head;
 694 error:
 695         kfree_skb_list(head);
 696         return NULL;
 697 }
 698
 699 static int stt_can_offload(struct sk_buff *skb, __be16 l3_proto, u8 l4_proto)
 700 {
 701         if (skb_is_gso(skb) && skb->ip_summed != CHECKSUM_PARTIAL) {
 702                 int csum_offset;
 703                 __sum16 *csum;
 704                 int len;
 705
 706                 if (l4_proto == IPPROTO_TCP)
 707                         csum_offset = offsetof(struct tcphdr, check);
 708                 else if (l4_proto == IPPROTO_UDP)
 709                         csum_offset = offsetof(struct udphdr, check);
 710                 else
 711                         return 0;
 712
 713                 len = skb->len - skb_transport_offset(skb);
 714                 csum = (__sum16 *)(skb_transport_header(skb) + csum_offset);
 715
 716                 if (unlikely(!pskb_may_pull(skb, skb_transport_offset(skb) +
 717                                                  csum_offset + sizeof(*csum))))
 718                         return -EINVAL;
 719
 720                 if (l3_proto == htons(ETH_P_IP)) {
 721                         struct iphdr *iph = ip_hdr(skb);
 722
 723                         *csum = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
 724                                                    len, l4_proto, 0);
 725                 } else if (l3_proto == htons(ETH_P_IPV6)) {
 726                         struct ipv6hdr *ip6h = ipv6_hdr(skb);
 727
 728                         *csum = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
 729                                                  len, l4_proto, 0);
 730                 } else {
 731                         return 0;
 732                 }
 733                 skb->csum_start = skb_transport_header(skb) - skb->head;
 734                 skb->csum_offset = csum_offset;
 735                 skb->ip_summed = CHECKSUM_PARTIAL;
 736         }
 737
 738         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 739                 /* Assume receiver can only offload TCP/UDP over IPv4/6,
 740                  * and require 802.1Q VLANs to be accelerated.
 741                  */
 742                 if (l3_proto != htons(ETH_P_IP) &&
 743                     l3_proto != htons(ETH_P_IPV6))
 744                         return 0;
 745
 746                 if (l4_proto != IPPROTO_TCP && l4_proto != IPPROTO_UDP)
 747                         return 0;
 748
 749                 /* L4 offset must fit in a 1-byte field. */
 750                 if (skb->csum_start - skb_headroom(skb) > 255)
 751                         return 0;
 752
 753                 if (skb_shinfo(skb)->gso_type & ~SUPPORTED_GSO_TYPES)
 754                         return 0;
 755         }
 756         /* Total size of encapsulated packet must fit in 16 bits. */
 757         if (skb->len + STT_HEADER_LEN + sizeof(struct iphdr) > 65535)
 758                 return 0;
 759
 760 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
 761         if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q))
 762                 return 0;
 763 #endif
 764         return 1;
 765 }
 766
 767 static bool need_linearize(const struct sk_buff *skb)
 768 {
 769         struct skb_shared_info *shinfo = skb_shinfo(skb);
 770         int i;
 771
 772         if (unlikely(shinfo->frag_list))
 773                 return true;
 774
 775         /* Generally speaking we should linearize if there are paged frags.
 776          * However, if all of the refcounts are 1 we know nobody else can
 777          * change them from underneath us and we can skip the linearization.
 778          */
 779         for (i = 0; i < shinfo->nr_frags; i++)
 780                 if (unlikely(page_count(skb_frag_page(&shinfo->frags[i])) > 1))
 781                         return true;
 782
 783         return false;
 784 }
 785
 786 static struct sk_buff *handle_offloads(struct sk_buff *skb, int min_headroom)
 787 {
 788         int err;
 789
 790 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
 791         if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q)) {
 792
 793                 min_headroom += VLAN_HLEN;
 794                 if (skb_headroom(skb) < min_headroom) {
 795                         int head_delta = SKB_DATA_ALIGN(min_headroom -
 796                                                         skb_headroom(skb) + 16);
 797
 798                         err = pskb_expand_head(skb, max_t(int, head_delta, 0),
 799                                                0, GFP_ATOMIC);
 800                         if (unlikely(err))
 801                                 goto error;
 802                 }
 803
 804                 skb = __vlan_hwaccel_push_inside(skb);
 805                 if (!skb) {
 806                         err = -ENOMEM;
 807                         goto error;
 808                 }
 809         }
 810 #endif
 811
 812         if (skb_is_gso(skb)) {
 813                 struct sk_buff *nskb;
 814                 char cb[sizeof(skb->cb)];
 815
 816                 memcpy(cb, skb->cb, sizeof(cb));
 817
 818                 nskb = __skb_gso_segment(skb, 0, false);
 819                 if (IS_ERR(nskb)) {
 820                         err = PTR_ERR(nskb);
 821                         goto error;
 822                 }
 823
 824                 consume_skb(skb);
 825                 skb = nskb;
 826                 while (nskb) {
 827                         memcpy(nskb->cb, cb, sizeof(cb));
 828                         nskb = nskb->next;
 829                 }
 830         } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
 831                 /* Pages aren't locked and could change at any time.
 832                  * If this happens after we compute the checksum, the
 833                  * checksum will be wrong.  We linearize now to avoid
 834                  * this problem.
 835                  */
 836                 if (unlikely(need_linearize(skb))) {
 837                         err = __skb_linearize(skb);
 838                         if (unlikely(err))
 839                                 goto error;
 840                 }
 841
 842                 err = skb_checksum_help(skb);
 843                 if (unlikely(err))
 844                         goto error;
 845         }
 846         skb->ip_summed = CHECKSUM_NONE;
 847
 848         return skb;
 849 error:
 850         kfree_skb(skb);
 851         return ERR_PTR(err);
 852 }
 853
 854 static void skb_list_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src,
 855                           __be32 dst, __u8 tos, __u8 ttl, __be16 df)
 856 {
 857         while (skb) {
 858                 struct sk_buff *next = skb->next;
 859
 860                 if (next)
 861                         dst_clone(&rt->dst);
 862
 863                 skb->next = NULL;
 864                 iptunnel_xmit(NULL, rt, skb, src, dst, IPPROTO_TCP,
 865                               tos, ttl, df, false);
 866
 867                 skb = next;
 868         }
 869 }
 870
 871 static u8 parse_ipv6_l4_proto(struct sk_buff *skb)
 872 {
 873         unsigned int nh_ofs = skb_network_offset(skb);
 874         int payload_ofs;
 875         struct ipv6hdr *nh;
 876         uint8_t nexthdr;
 877         __be16 frag_off;
 878
 879         if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct ipv6hdr))))
 880                 return 0;
 881
 882         nh = ipv6_hdr(skb);
 883         nexthdr = nh->nexthdr;
 884         payload_ofs = (u8 *)(nh + 1) - skb->data;
 885
 886         payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off);
 887         if (unlikely(payload_ofs < 0))
 888                 return 0;
 889
 890         return nexthdr;
 891 }
 892
 893 static u8 skb_get_l4_proto(struct sk_buff *skb, __be16 l3_proto)
 894 {
 895         if (l3_proto == htons(ETH_P_IP)) {
 896                 unsigned int nh_ofs = skb_network_offset(skb);
 897
 898                 if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct iphdr))))
 899                         return 0;
 900
 901                 return ip_hdr(skb)->protocol;
 902         } else if (l3_proto == htons(ETH_P_IPV6)) {
 903                 return parse_ipv6_l4_proto(skb);
 904         }
 905         return 0;
 906 }
 907
 908 static int stt_xmit_skb(struct sk_buff *skb, struct rtable *rt,
 909                         __be32 src, __be32 dst, __u8 tos,
 910                         __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
 911                         __be64 tun_id)
 912 {
 913         struct ethhdr *eh = eth_hdr(skb);
 914         int ret = 0, min_headroom;
 915         __be16 inner_l3_proto;
 916          u8 inner_l4_proto;
 917
 918         inner_l3_proto = eh->h_proto;
 919         inner_l4_proto = skb_get_l4_proto(skb, inner_l3_proto);
 920
 921         min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
 922                         + STT_HEADER_LEN + sizeof(struct iphdr);
 923
 924         if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
 925                 int head_delta = SKB_DATA_ALIGN(min_headroom -
 926                                                 skb_headroom(skb) +
 927                                                 16);
 928
 929                 ret = pskb_expand_head(skb, max_t(int, head_delta, 0),
 930                                        0, GFP_ATOMIC);
 931                 if (unlikely(ret))
 932                         goto err_free_rt;
 933         }
 934
 935         ret = stt_can_offload(skb, inner_l3_proto, inner_l4_proto);
 936         if (ret < 0)
 937                 goto err_free_rt;
 938         if (!ret) {
 939                 skb = handle_offloads(skb, min_headroom);
 940                 if (IS_ERR(skb)) {
 941                         ret = PTR_ERR(skb);
 942                         skb = NULL;
 943                         goto err_free_rt;
 944                 }
 945         }
 946
 947         ret = 0;
 948         while (skb) {
 949                 struct sk_buff *next_skb = skb->next;
 950
 951                 skb->next = NULL;
 952
 953                 if (next_skb)
 954                         dst_clone(&rt->dst);
 955
 956                 /* Push STT and TCP header. */
 957                 skb = push_stt_header(skb, tun_id, src_port, dst_port, src,
 958                                       dst, inner_l3_proto, inner_l4_proto,
 959                                       dst_mtu(&rt->dst));
 960                 if (unlikely(!skb)) {
 961                         ip_rt_put(rt);
 962                         goto next;
 963                 }
 964
 965                 /* Push IP header. */
 966                 skb_list_xmit(rt, skb, src, dst, tos, ttl, df);
 967
 968 next:
 969                 skb = next_skb;
 970         }
 971
 972         return 0;
 973
 974 err_free_rt:
 975         ip_rt_put(rt);
 976         kfree_skb(skb);
 977         return ret;
 978 }
 979
 980 static struct rtable *stt_get_rt(struct sk_buff *skb,
 981                                  struct net_device *dev,
 982                                  struct flowi4 *fl,
 983                                  const struct ip_tunnel_key *key)
 984 {
 985         struct net *net = dev_net(dev);
 986
 987         /* Route lookup */
 988         memset(fl, 0, sizeof(*fl));
 989         fl->daddr = key->u.ipv4.dst;
 990         fl->saddr = key->u.ipv4.src;
 991         fl->flowi4_tos = RT_TOS(key->tos);
 992         fl->flowi4_mark = skb->mark;
 993         fl->flowi4_proto = IPPROTO_TCP;
 994
 995         return ip_route_output_key(net, fl);
 996 }
 997
 998 netdev_tx_t ovs_stt_xmit(struct sk_buff *skb)
 999 {
1000         struct net_device *dev = skb->dev;
1001         struct stt_dev *stt_dev = netdev_priv(dev);
1002         struct net *net = stt_dev->net;
1003         __be16 dport = stt_dev->dst_port;
1004         struct ip_tunnel_key *tun_key;
1005         struct ip_tunnel_info *tun_info;
1006         struct rtable *rt;
1007         struct flowi4 fl;
1008         __be16 sport;
1009         __be16 df;
1010         int err;
1011
1012         tun_info = skb_tunnel_info(skb);
1013         if (unlikely(!tun_info)) {
1014                 err = -EINVAL;
1015                 goto error;
1016         }
1017
1018         tun_key = &tun_info->key;
1019
1020         rt = stt_get_rt(skb, dev, &fl, tun_key);
1021         if (IS_ERR(rt)) {
1022                 err = PTR_ERR(rt);
1023                 goto error;
1024         }
1025
1026         df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
1027         sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
1028         skb->ignore_df = 1;
1029
1030         stt_xmit_skb(skb, rt, fl.saddr, tun_key->u.ipv4.dst,
1031                     tun_key->tos, tun_key->ttl,
1032                     df, sport, dport, tun_key->tun_id);
1033         return NETDEV_TX_OK;
1034 error:
1035         kfree_skb(skb);
1036         dev->stats.tx_errors++;
1037         return NETDEV_TX_OK;
1038 }
1039 EXPORT_SYMBOL(ovs_stt_xmit);
1040
1041 static void free_frag(struct stt_percpu *stt_percpu,
1042                       struct pkt_frag *frag)
1043 {
1044         stt_percpu->frag_mem_used -= FRAG_CB(frag->skbs)->first.mem_used;
1045         kfree_skb_list(frag->skbs);
1046         list_del(&frag->lru_node);
1047         frag->skbs = NULL;
1048 }
1049
1050 static void evict_frags(struct stt_percpu *stt_percpu)
1051 {
1052         while (!list_empty(&stt_percpu->frag_lru) &&
1053                stt_percpu->frag_mem_used > REASM_LO_THRESH) {
1054                 struct pkt_frag *frag;
1055
1056                 frag = list_first_entry(&stt_percpu->frag_lru,
1057                                         struct pkt_frag,
1058                                         lru_node);
1059                 free_frag(stt_percpu, frag);
1060         }
1061 }
1062
1063 static bool pkt_key_match(struct net *net,
1064                           const struct pkt_frag *a, const struct pkt_key *b)
1065 {
1066         return a->key.saddr == b->saddr && a->key.daddr == b->daddr &&
1067                a->key.pkt_seq == b->pkt_seq && a->key.mark == b->mark &&
1068                net_eq(dev_net(a->skbs->dev), net);
1069 }
1070
1071 static u32 pkt_key_hash(const struct net *net, const struct pkt_key *key)
1072 {
1073         u32 initval = frag_hash_seed ^ (u32)(unsigned long)net ^ key->mark;
1074
1075         return jhash_3words((__force u32)key->saddr, (__force u32)key->daddr,
1076                             (__force u32)key->pkt_seq, initval);
1077 }
1078
1079 static struct pkt_frag *lookup_frag(struct net *net,
1080                                     struct stt_percpu *stt_percpu,
1081                                     const struct pkt_key *key, u32 hash)
1082 {
1083         struct pkt_frag *frag, *victim_frag = NULL;
1084         int i;
1085
1086         for (i = 0; i < FRAG_HASH_SEGS; i++) {
1087                 frag = flex_array_get(stt_percpu->frag_hash,
1088                                       hash & (FRAG_HASH_ENTRIES - 1));
1089
1090                 if (frag->skbs &&
1091                     time_before(jiffies, frag->timestamp + FRAG_EXP_TIME) &&
1092                     pkt_key_match(net, frag, key))
1093                         return frag;
1094
1095                 if (!victim_frag ||
1096                     (victim_frag->skbs &&
1097                      (!frag->skbs ||
1098                       time_before(frag->timestamp, victim_frag->timestamp))))
1099                         victim_frag = frag;
1100
1101                 hash >>= FRAG_HASH_SHIFT;
1102         }
1103
1104         if (victim_frag->skbs)
1105                 free_frag(stt_percpu, victim_frag);
1106
1107         return victim_frag;
1108 }
1109
1110 #ifdef SKIP_ZERO_COPY
1111 static int __copy_skb(struct sk_buff *to, struct sk_buff *from,
1112                       int *delta, bool *headstolen)
1113 {
1114         int err;
1115
1116         if (unlikely(to->next))
1117                 return -EINVAL;
1118
1119         if (unlikely(FRAG_CB(to)->offset))
1120                 return -EINVAL;
1121
1122         if (unlikely(skb_unclone(to, GFP_ATOMIC)))
1123                 return -ENOMEM;
1124
1125         if (skb_try_coalesce(to, from, headstolen, delta))
1126                 return 0;
1127
1128         *headstolen = false;
1129         err = pskb_expand_head(to, 0, to->data_len + from->len, GFP_ATOMIC);
1130         if (unlikely(err))
1131                 return err;
1132
1133         if (unlikely(!__pskb_pull_tail(to, to->data_len)))
1134                 BUG();
1135
1136         skb_copy_bits(from, 0, skb_put(to, from->len), from->len);
1137
1138         *delta = from->len;
1139         to->truesize += from->len;
1140         return 0;
1141 }
1142 #else
1143 static int __copy_skb(struct sk_buff *to, struct sk_buff *from,
1144                       int *delta, bool *headstolen)
1145 {
1146         *headstolen = false;
1147         return -EINVAL;
1148 }
1149 #endif
1150
1151 static struct sk_buff *reassemble(struct sk_buff *skb)
1152 {
1153         struct iphdr *iph = ip_hdr(skb);
1154         struct tcphdr *tcph = tcp_hdr(skb);
1155         u32 seq = ntohl(tcph->seq);
1156         struct stt_percpu *stt_percpu;
1157         struct sk_buff *last_skb, *copied_skb = NULL;
1158         struct pkt_frag *frag;
1159         struct pkt_key key;
1160         int tot_len, delta = skb->truesize;
1161         bool headstolen;
1162         u32 hash;
1163
1164         tot_len = seq >> STT_SEQ_LEN_SHIFT;
1165         FRAG_CB(skb)->offset = seq & STT_SEQ_OFFSET_MASK;
1166
1167         if (unlikely(skb->len == 0))
1168                 goto out_free;
1169
1170         if (unlikely(FRAG_CB(skb)->offset + skb->len > tot_len))
1171                 goto out_free;
1172
1173         if (tot_len == skb->len)
1174                 goto out;
1175
1176         key.saddr = iph->saddr;
1177         key.daddr = iph->daddr;
1178         key.pkt_seq = tcph->ack_seq;
1179         key.mark = skb->mark;
1180         hash = pkt_key_hash(dev_net(skb->dev), &key);
1181
1182         stt_percpu = per_cpu_ptr(stt_percpu_data, smp_processor_id());
1183
1184         spin_lock(&stt_percpu->lock);
1185
1186         if (unlikely(stt_percpu->frag_mem_used + skb->truesize > REASM_HI_THRESH))
1187                 evict_frags(stt_percpu);
1188
1189         frag = lookup_frag(dev_net(skb->dev), stt_percpu, &key, hash);
1190         if (!frag->skbs) {
1191                 frag->skbs = skb;
1192                 frag->key = key;
1193                 frag->timestamp = jiffies;
1194                 FRAG_CB(skb)->first.last_skb = skb;
1195                 FRAG_CB(skb)->first.mem_used = skb->truesize;
1196                 FRAG_CB(skb)->first.tot_len = tot_len;
1197                 FRAG_CB(skb)->first.rcvd_len = skb->len;
1198                 FRAG_CB(skb)->first.set_ecn_ce = false;
1199                 list_add_tail(&frag->lru_node, &stt_percpu->frag_lru);
1200                 stt_percpu->frag_mem_used += skb->truesize;
1201                 skb = NULL;
1202                 goto unlock;
1203         }
1204
1205         /* Optimize for the common case where fragments are received in-order
1206          * and not overlapping.
1207          */
1208         last_skb = FRAG_CB(frag->skbs)->first.last_skb;
1209         if (likely(FRAG_CB(last_skb)->offset + last_skb->len ==
1210                    FRAG_CB(skb)->offset)) {
1211
1212                 if (!__copy_skb(frag->skbs, skb, &delta, &headstolen)) {
1213                         copied_skb = skb;
1214                 } else {
1215                         last_skb->next = skb;
1216                         FRAG_CB(frag->skbs)->first.last_skb = skb;
1217                 }
1218         } else {
1219                 struct sk_buff *prev = NULL, *next;
1220
1221                 for (next = frag->skbs; next; next = next->next) {
1222                         if (FRAG_CB(next)->offset >= FRAG_CB(skb)->offset)
1223                                 break;
1224                         prev = next;
1225                 }
1226
1227                 /* Overlapping fragments aren't allowed.  We shouldn't start
1228                  * before the end of the previous fragment.
1229                  */
1230                 if (prev &&
1231                     FRAG_CB(prev)->offset + prev->len > FRAG_CB(skb)->offset)
1232                         goto unlock_free;
1233
1234                 /* We also shouldn't end after the beginning of the next
1235                  * fragment.
1236                  */
1237                 if (next &&
1238                     FRAG_CB(skb)->offset + skb->len > FRAG_CB(next)->offset)
1239                         goto unlock_free;
1240
1241                 if (prev) {
1242                         prev->next = skb;
1243                 } else {
1244                         FRAG_CB(skb)->first = FRAG_CB(frag->skbs)->first;
1245                         frag->skbs = skb;
1246                 }
1247
1248                 if (next)
1249                         skb->next = next;
1250                 else
1251                         FRAG_CB(frag->skbs)->first.last_skb = skb;
1252         }
1253
1254         FRAG_CB(frag->skbs)->first.set_ecn_ce |= INET_ECN_is_ce(iph->tos);
1255         FRAG_CB(frag->skbs)->first.rcvd_len += skb->len;
1256         stt_percpu->frag_mem_used += delta;
1257         FRAG_CB(frag->skbs)->first.mem_used += delta;
1258
1259         if (FRAG_CB(frag->skbs)->first.tot_len ==
1260             FRAG_CB(frag->skbs)->first.rcvd_len) {
1261                 struct sk_buff *frag_head = frag->skbs;
1262
1263                 frag_head->tstamp = skb->tstamp;
1264                 if (FRAG_CB(frag_head)->first.set_ecn_ce)
1265                         INET_ECN_set_ce(frag_head);
1266
1267                 list_del(&frag->lru_node);
1268                 stt_percpu->frag_mem_used -= FRAG_CB(frag_head)->first.mem_used;
1269                 frag->skbs = NULL;
1270                 skb = frag_head;
1271         } else {
1272                 list_move_tail(&frag->lru_node, &stt_percpu->frag_lru);
1273                 skb = NULL;
1274         }
1275
1276         if (copied_skb)
1277                 kfree_skb_partial(copied_skb, headstolen);
1278         goto unlock;
1279
1280 unlock_free:
1281         kfree_skb(skb);
1282         skb = NULL;
1283 unlock:
1284         spin_unlock(&stt_percpu->lock);
1285         return skb;
1286 out_free:
1287         kfree_skb(skb);
1288         skb = NULL;
1289 out:
1290         return skb;
1291 }
1292
1293 static bool validate_checksum(struct sk_buff *skb)
1294 {
1295         struct iphdr *iph = ip_hdr(skb);
1296
1297         if (skb_csum_unnecessary(skb))
1298                 return true;
1299
1300         if (skb->ip_summed == CHECKSUM_COMPLETE &&
1301             !tcp_v4_check(skb->len, iph->saddr, iph->daddr, skb->csum))
1302                 return true;
1303
1304         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, skb->len,
1305                                        IPPROTO_TCP, 0);
1306
1307         return __tcp_checksum_complete(skb) == 0;
1308 }
1309
1310 static bool set_offloads(struct sk_buff *skb)
1311 {
1312         struct stthdr *stth = stt_hdr(skb);
1313         unsigned short gso_type;
1314         int l3_header_size;
1315         int l4_header_size;
1316         u16 csum_offset;
1317         u8 proto_type;
1318
1319         if (stth->vlan_tci)
1320                 __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
1321                                        ntohs(stth->vlan_tci));
1322
1323         if (!(stth->flags & STT_CSUM_PARTIAL)) {
1324                 if (stth->flags & STT_CSUM_VERIFIED)
1325                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1326                 else
1327                         skb->ip_summed = CHECKSUM_NONE;
1328
1329                 return clear_gso(skb) == 0;
1330         }
1331
1332         proto_type = stth->flags & STT_PROTO_TYPES;
1333
1334         switch (proto_type) {
1335         case (STT_PROTO_IPV4 | STT_PROTO_TCP):
1336                 /* TCP/IPv4 */
1337                 csum_offset = offsetof(struct tcphdr, check);
1338                 gso_type = SKB_GSO_TCPV4;
1339                 l3_header_size = sizeof(struct iphdr);
1340                 l4_header_size = sizeof(struct tcphdr);
1341                 skb->protocol = htons(ETH_P_IP);
1342                 break;
1343         case STT_PROTO_TCP:
1344                 /* TCP/IPv6 */
1345                 csum_offset = offsetof(struct tcphdr, check);
1346                 gso_type = SKB_GSO_TCPV6;
1347                 l3_header_size = sizeof(struct ipv6hdr);
1348                 l4_header_size = sizeof(struct tcphdr);
1349                 skb->protocol = htons(ETH_P_IPV6);
1350                 break;
1351         case STT_PROTO_IPV4:
1352                 /* UDP/IPv4 */
1353                 csum_offset = offsetof(struct udphdr, check);
1354                 gso_type = SKB_GSO_UDP;
1355                 l3_header_size = sizeof(struct iphdr);
1356                 l4_header_size = sizeof(struct udphdr);
1357                 skb->protocol = htons(ETH_P_IP);
1358                 break;
1359         default:
1360                 /* UDP/IPv6 */
1361                 csum_offset = offsetof(struct udphdr, check);
1362                 gso_type = SKB_GSO_UDP;
1363                 l3_header_size = sizeof(struct ipv6hdr);
1364                 l4_header_size = sizeof(struct udphdr);
1365                 skb->protocol = htons(ETH_P_IPV6);
1366         }
1367
1368         if (unlikely(stth->l4_offset < ETH_HLEN + l3_header_size))
1369                 return false;
1370
1371         if (unlikely(!pskb_may_pull(skb, stth->l4_offset + l4_header_size)))
1372                 return false;
1373
1374         stth = stt_hdr(skb);
1375
1376         skb->csum_start = skb_headroom(skb) + stth->l4_offset;
1377         skb->csum_offset = csum_offset;
1378         skb->ip_summed = CHECKSUM_PARTIAL;
1379
1380         if (stth->mss) {
1381                 if (unlikely(skb_unclone(skb, GFP_ATOMIC)))
1382                         return false;
1383
1384                 skb_shinfo(skb)->gso_type = gso_type | SKB_GSO_DODGY;
1385                 skb_shinfo(skb)->gso_size = ntohs(stth->mss);
1386                 skb_shinfo(skb)->gso_segs = 0;
1387         } else {
1388                 if (unlikely(clear_gso(skb)))
1389                         return false;
1390         }
1391
1392         return true;
1393 }
1394
1395 static void rcv_list(struct net_device *dev, struct sk_buff *skb,
1396                      struct metadata_dst *tun_dst)
1397 {
1398         struct sk_buff *next;
1399
1400         do {
1401                 next = skb->next;
1402                 skb->next = NULL;
1403                 if (next) {
1404                         ovs_dst_hold((struct dst_entry *)tun_dst);
1405                         ovs_skb_dst_set(next, (struct dst_entry *)tun_dst);
1406                 }
1407                 ovs_ip_tunnel_rcv(dev, skb, tun_dst);
1408         } while ((skb = next));
1409 }
1410
1411 #ifndef USE_UPSTREAM_TUNNEL
1412 static int __stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
1413 {
1414         struct metadata_dst tun_dst;
1415
1416         ovs_ip_tun_rx_dst(&tun_dst, skb, TUNNEL_KEY | TUNNEL_CSUM,
1417                           get_unaligned(&stt_hdr(skb)->key), 0);
1418         tun_dst.u.tun_info.key.tp_src = tcp_hdr(skb)->source;
1419         tun_dst.u.tun_info.key.tp_dst = tcp_hdr(skb)->dest;
1420
1421         rcv_list(stt_dev->dev, skb, &tun_dst);
1422         return 0;
1423 }
1424 #else
1425 static int __stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
1426 {
1427         struct metadata_dst *tun_dst;
1428         __be16 flags;
1429         __be64 tun_id;
1430
1431         flags = TUNNEL_KEY | TUNNEL_CSUM;
1432         tun_id = get_unaligned(&stt_hdr(skb)->key);
1433         tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
1434         if (!tun_dst)
1435                 return -ENOMEM;
1436         tun_dst->u.tun_info.key.tp_src = tcp_hdr(skb)->source;
1437         tun_dst->u.tun_info.key.tp_dst = tcp_hdr(skb)->dest;
1438
1439         rcv_list(stt_dev->dev, skb, tun_dst);
1440         return 0;
1441 }
1442 #endif
1443
1444 static void stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
1445 {
1446         int err;
1447
1448         if (unlikely(!validate_checksum(skb)))
1449                 goto drop;
1450
1451         __skb_pull(skb, sizeof(struct tcphdr));
1452         skb = reassemble(skb);
1453         if (!skb)
1454                 return;
1455
1456         if (skb->next && coalesce_skb(&skb))
1457                 goto drop;
1458
1459         err = iptunnel_pull_header(skb,
1460                                    sizeof(struct stthdr) + STT_ETH_PAD,
1461                                    htons(ETH_P_TEB),
1462                                    !net_eq(stt_dev->net, dev_net(stt_dev->dev)));
1463         if (unlikely(err))
1464                 goto drop;
1465
1466         if (unlikely(stt_hdr(skb)->version != 0))
1467                 goto drop;
1468
1469         if (unlikely(!set_offloads(skb)))
1470                 goto drop;
1471
1472         if (skb_shinfo(skb)->frag_list && try_to_segment(skb))
1473                 goto drop;
1474
1475         err = __stt_rcv(stt_dev, skb);
1476         if (err)
1477                 goto drop;
1478         return;
1479 drop:
1480         /* Consume bad packet */
1481         kfree_skb_list(skb);
1482         stt_dev->dev->stats.rx_errors++;
1483 }
1484
1485 static void tcp_sock_release(struct socket *sock)
1486 {
1487         kernel_sock_shutdown(sock, SHUT_RDWR);
1488         sock_release(sock);
1489 }
1490
1491 static int tcp_sock_create4(struct net *net, __be16 port,
1492                             struct socket **sockp)
1493 {
1494         struct sockaddr_in tcp_addr;
1495         struct socket *sock = NULL;
1496         int err;
1497
1498         err = sock_create_kern(net, AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
1499         if (err < 0)
1500                 goto error;
1501
1502         memset(&tcp_addr, 0, sizeof(tcp_addr));
1503         tcp_addr.sin_family = AF_INET;
1504         tcp_addr.sin_addr.s_addr = htonl(INADDR_ANY);
1505         tcp_addr.sin_port = port;
1506         err = kernel_bind(sock, (struct sockaddr *)&tcp_addr,
1507                           sizeof(tcp_addr));
1508         if (err < 0)
1509                 goto error;
1510
1511         *sockp = sock;
1512         return 0;
1513
1514 error:
1515         if (sock)
1516                 tcp_sock_release(sock);
1517         *sockp = NULL;
1518         return err;
1519 }
1520
1521 static void schedule_clean_percpu(void)
1522 {
1523         schedule_delayed_work(&clean_percpu_wq, CLEAN_PERCPU_INTERVAL);
1524 }
1525
1526 static void clean_percpu(struct work_struct *work)
1527 {
1528         int i;
1529
1530         for_each_possible_cpu(i) {
1531                 struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
1532                 int j;
1533
1534                 for (j = 0; j < FRAG_HASH_ENTRIES; j++) {
1535                         struct pkt_frag *frag;
1536
1537                         frag = flex_array_get(stt_percpu->frag_hash, j);
1538                         if (!frag->skbs ||
1539                             time_before(jiffies, frag->timestamp + FRAG_EXP_TIME))
1540                                 continue;
1541
1542                         spin_lock_bh(&stt_percpu->lock);
1543
1544                         if (frag->skbs &&
1545                             time_after(jiffies, frag->timestamp + FRAG_EXP_TIME))
1546                                 free_frag(stt_percpu, frag);
1547
1548                         spin_unlock_bh(&stt_percpu->lock);
1549                 }
1550         }
1551         schedule_clean_percpu();
1552 }
1553
1554 #ifdef HAVE_NF_HOOKFN_ARG_OPS
1555 #define FIRST_PARAM const struct nf_hook_ops *ops
1556 #else
1557 #ifdef HAVE_NF_HOOKFN_ARG_PRIV
1558 #define FIRST_PARAM void *priv
1559 #else
1560 #define FIRST_PARAM unsigned int hooknum
1561 #endif
1562 #endif
1563
1564 #ifdef HAVE_NF_HOOK_STATE
1565 #if RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,0)
1566 /* RHEL nfhook hacks. */
1567 #ifndef __GENKSYMS__
1568 #define LAST_PARAM const struct net_device *in, const struct net_device *out, \
1569                    const struct nf_hook_state *state
1570 #else
1571 #define LAST_PARAM const struct net_device *in, const struct net_device *out, \
1572                    int (*okfn)(struct sk_buff *)
1573 #endif
1574 #else
1575 #define LAST_PARAM const struct nf_hook_state *state
1576 #endif
1577 #else
1578 #define LAST_PARAM const struct net_device *in, const struct net_device *out, \
1579                    int (*okfn)(struct sk_buff *)
1580 #endif
1581
1582 static unsigned int nf_ip_hook(FIRST_PARAM, struct sk_buff *skb, LAST_PARAM)
1583 {
1584         struct stt_dev *stt_dev;
1585         int ip_hdr_len;
1586
1587         if (ip_hdr(skb)->protocol != IPPROTO_TCP)
1588                 return NF_ACCEPT;
1589
1590         ip_hdr_len = ip_hdrlen(skb);
1591         if (unlikely(!pskb_may_pull(skb, ip_hdr_len + sizeof(struct tcphdr))))
1592                 return NF_ACCEPT;
1593
1594         skb_set_transport_header(skb, ip_hdr_len);
1595
1596         stt_dev = stt_find_up_dev(dev_net(skb->dev), tcp_hdr(skb)->dest);
1597         if (!stt_dev)
1598                 return NF_ACCEPT;
1599
1600         __skb_pull(skb, ip_hdr_len);
1601         stt_rcv(stt_dev, skb);
1602         return NF_STOLEN;
1603 }
1604
1605 static struct nf_hook_ops nf_hook_ops __read_mostly = {
1606         .hook           = nf_ip_hook,
1607 #ifdef HAVE_NF_HOOKS_OPS_OWNER
1608         .owner          = THIS_MODULE,
1609 #endif
1610         .pf             = NFPROTO_IPV4,
1611         .hooknum        = NF_INET_LOCAL_IN,
1612         .priority       = INT_MAX,
1613 };
1614
1615 static int stt_start(struct net *net)
1616 {
1617         struct stt_net *sn = net_generic(net, stt_net_id);
1618         int err;
1619         int i;
1620
1621         if (n_tunnels) {
1622                 n_tunnels++;
1623                 return 0;
1624         }
1625         get_random_bytes(&frag_hash_seed, sizeof(u32));
1626
1627         stt_percpu_data = alloc_percpu(struct stt_percpu);
1628         if (!stt_percpu_data) {
1629                 err = -ENOMEM;
1630                 goto error;
1631         }
1632
1633         for_each_possible_cpu(i) {
1634                 struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
1635                 struct flex_array *frag_hash;
1636
1637                 spin_lock_init(&stt_percpu->lock);
1638                 INIT_LIST_HEAD(&stt_percpu->frag_lru);
1639                 get_random_bytes(&per_cpu(pkt_seq_counter, i), sizeof(u32));
1640
1641                 frag_hash = flex_array_alloc(sizeof(struct pkt_frag),
1642                                              FRAG_HASH_ENTRIES,
1643                                              GFP_KERNEL | __GFP_ZERO);
1644                 if (!frag_hash) {
1645                         err = -ENOMEM;
1646                         goto free_percpu;
1647                 }
1648                 stt_percpu->frag_hash = frag_hash;
1649
1650                 err = flex_array_prealloc(stt_percpu->frag_hash, 0,
1651                                           FRAG_HASH_ENTRIES,
1652                                           GFP_KERNEL | __GFP_ZERO);
1653                 if (err)
1654                         goto free_percpu;
1655         }
1656         schedule_clean_percpu();
1657         n_tunnels++;
1658
1659         if (sn->n_tunnels) {
1660                 sn->n_tunnels++;
1661                 return 0;
1662         }
1663 #ifdef HAVE_NF_REGISTER_NET_HOOK
1664         /* On kernel which support per net nf-hook, nf_register_hook() takes
1665          * rtnl-lock, which results in dead lock in stt-dev-create. Therefore
1666          * use this new API.
1667          */
1668
1669         if (sn->nf_hook_reg_done)
1670                 goto out;
1671
1672         err = nf_register_net_hook(net, &nf_hook_ops);
1673         if (!err)
1674                 sn->nf_hook_reg_done = true;
1675 #else
1676         /* Register STT only on very first STT device addition. */
1677         if (!list_empty(&nf_hook_ops.list))
1678                 goto out;
1679
1680         err = nf_register_hook(&nf_hook_ops);
1681 #endif
1682         if (err)
1683                 goto dec_n_tunnel;
1684 out:
1685         sn->n_tunnels++;
1686         return 0;
1687
1688 dec_n_tunnel:
1689         n_tunnels--;
1690 free_percpu:
1691         for_each_possible_cpu(i) {
1692                 struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
1693
1694                 if (stt_percpu->frag_hash)
1695                         flex_array_free(stt_percpu->frag_hash);
1696         }
1697
1698         free_percpu(stt_percpu_data);
1699
1700 error:
1701         return err;
1702 }
1703
1704 static void stt_cleanup(struct net *net)
1705 {
1706         struct stt_net *sn = net_generic(net, stt_net_id);
1707         int i;
1708
1709         sn->n_tunnels--;
1710         if (sn->n_tunnels)
1711                 goto out;
1712 out:
1713         n_tunnels--;
1714         if (n_tunnels)
1715                 return;
1716
1717         cancel_delayed_work_sync(&clean_percpu_wq);
1718         for_each_possible_cpu(i) {
1719                 struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
1720                 int j;
1721
1722                 for (j = 0; j < FRAG_HASH_ENTRIES; j++) {
1723                         struct pkt_frag *frag;
1724
1725                         frag = flex_array_get(stt_percpu->frag_hash, j);
1726                         kfree_skb_list(frag->skbs);
1727                 }
1728
1729                 flex_array_free(stt_percpu->frag_hash);
1730         }
1731
1732         free_percpu(stt_percpu_data);
1733 }
1734
1735 static netdev_tx_t stt_dev_xmit(struct sk_buff *skb, struct net_device *dev)
1736 {
1737 #ifdef USE_UPSTREAM_TUNNEL
1738         return ovs_stt_xmit(skb);
1739 #else
1740         /* Drop All packets coming from networking stack. OVS-CB is
1741          * not initialized for these packets.
1742          */
1743         dev_kfree_skb(skb);
1744         dev->stats.tx_dropped++;
1745         return NETDEV_TX_OK;
1746 #endif
1747 }
1748
1749 /* Setup stats when device is created */
1750 static int stt_init(struct net_device *dev)
1751 {
1752         dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1753         if (!dev->tstats)
1754                 return -ENOMEM;
1755
1756         return 0;
1757 }
1758
1759 static void stt_uninit(struct net_device *dev)
1760 {
1761         free_percpu(dev->tstats);
1762 }
1763
1764 static int stt_open(struct net_device *dev)
1765 {
1766         struct stt_dev *stt = netdev_priv(dev);
1767         struct net *net = stt->net;
1768         struct stt_net *sn = net_generic(net, stt_net_id);
1769         int err;
1770
1771         err = stt_start(net);
1772         if (err)
1773                 return err;
1774
1775         err = tcp_sock_create4(net, stt->dst_port, &stt->sock);
1776         if (err)
1777                 return err;
1778         list_add_rcu(&stt->up_next, &sn->stt_up_list);
1779         return 0;
1780 }
1781
1782 static int stt_stop(struct net_device *dev)
1783 {
1784         struct stt_dev *stt_dev = netdev_priv(dev);
1785         struct net *net = stt_dev->net;
1786
1787         list_del_rcu(&stt_dev->up_next);
1788         synchronize_net();
1789         tcp_sock_release(stt_dev->sock);
1790         stt_dev->sock = NULL;
1791         stt_cleanup(net);
1792         return 0;
1793 }
1794
1795 static int __stt_change_mtu(struct net_device *dev, int new_mtu, bool strict)
1796 {
1797         int max_mtu = IP_MAX_MTU - STT_HEADER_LEN - sizeof(struct iphdr)
1798                       - dev->hard_header_len;
1799
1800         if (new_mtu < 68)
1801                 return -EINVAL;
1802
1803         if (new_mtu > max_mtu) {
1804                 if (strict)
1805                         return -EINVAL;
1806
1807                 new_mtu = max_mtu;
1808         }
1809
1810         dev->mtu = new_mtu;
1811         return 0;
1812 }
1813
1814 static int stt_change_mtu(struct net_device *dev, int new_mtu)
1815 {
1816         return __stt_change_mtu(dev, new_mtu, true);
1817 }
1818
1819 int ovs_stt_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
1820 {
1821         struct ip_tunnel_info *info = skb_tunnel_info(skb);
1822         struct stt_dev *stt_dev = netdev_priv(dev);
1823         struct net *net = stt_dev->net;
1824         __be16 dport = stt_dev->dst_port;
1825         struct flowi4 fl4;
1826         struct rtable *rt;
1827
1828         if (ip_tunnel_info_af(info) != AF_INET)
1829                 return -EINVAL;
1830
1831         rt = stt_get_rt(skb, dev, &fl4, &info->key);
1832         if (IS_ERR(rt))
1833                 return PTR_ERR(rt);
1834
1835         ip_rt_put(rt);
1836
1837         info->key.u.ipv4.src = fl4.saddr;
1838         info->key.tp_src = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
1839         info->key.tp_dst = dport;
1840         return 0;
1841 }
1842 EXPORT_SYMBOL_GPL(ovs_stt_fill_metadata_dst);
1843
1844 static const struct net_device_ops stt_netdev_ops = {
1845         .ndo_init               = stt_init,
1846         .ndo_uninit             = stt_uninit,
1847         .ndo_open               = stt_open,
1848         .ndo_stop               = stt_stop,
1849         .ndo_start_xmit         = stt_dev_xmit,
1850         .ndo_get_stats64        = ip_tunnel_get_stats64,
1851         .ndo_change_mtu         = stt_change_mtu,
1852         .ndo_validate_addr      = eth_validate_addr,
1853         .ndo_set_mac_address    = eth_mac_addr,
1854 #ifdef USE_UPSTREAM_TUNNEL
1855 #ifdef HAVE_NDO_FILL_METADATA_DST
1856         .ndo_fill_metadata_dst  = stt_fill_metadata_dst,
1857 #endif
1858 #endif
1859 };
1860
1861 static void stt_get_drvinfo(struct net_device *dev,
1862                 struct ethtool_drvinfo *drvinfo)
1863 {
1864         strlcpy(drvinfo->version, STT_NETDEV_VER, sizeof(drvinfo->version));
1865         strlcpy(drvinfo->driver, "stt", sizeof(drvinfo->driver));
1866 }
1867
1868 static const struct ethtool_ops stt_ethtool_ops = {
1869         .get_drvinfo    = stt_get_drvinfo,
1870         .get_link       = ethtool_op_get_link,
1871 };
1872
1873 /* Info for udev, that this is a virtual tunnel endpoint */
1874 static struct device_type stt_type = {
1875         .name = "stt",
1876 };
1877
1878 /* Initialize the device structure. */
1879 static void stt_setup(struct net_device *dev)
1880 {
1881         ether_setup(dev);
1882
1883         dev->netdev_ops = &stt_netdev_ops;
1884         dev->ethtool_ops = &stt_ethtool_ops;
1885         dev->destructor = free_netdev;
1886
1887         SET_NETDEV_DEVTYPE(dev, &stt_type);
1888
1889         dev->features    |= NETIF_F_LLTX | NETIF_F_NETNS_LOCAL;
1890         dev->features    |= NETIF_F_SG | NETIF_F_HW_CSUM;
1891         dev->features    |= NETIF_F_RXCSUM;
1892         dev->features    |= NETIF_F_GSO_SOFTWARE;
1893
1894         dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
1895         dev->hw_features |= NETIF_F_GSO_SOFTWARE;
1896
1897 #ifdef USE_UPSTREAM_TUNNEL
1898         netif_keep_dst(dev);
1899 #endif
1900         dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
1901         eth_hw_addr_random(dev);
1902 }
1903
1904 static const struct nla_policy stt_policy[IFLA_STT_MAX + 1] = {
1905         [IFLA_STT_PORT]              = { .type = NLA_U16 },
1906 };
1907
1908 static int stt_validate(struct nlattr *tb[], struct nlattr *data[])
1909 {
1910         if (tb[IFLA_ADDRESS]) {
1911                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1912                         return -EINVAL;
1913
1914                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1915                         return -EADDRNOTAVAIL;
1916         }
1917
1918         return 0;
1919 }
1920
1921 static struct stt_dev *find_dev(struct net *net, __be16 dst_port)
1922 {
1923         struct stt_net *sn = net_generic(net, stt_net_id);
1924         struct stt_dev *dev;
1925
1926         list_for_each_entry(dev, &sn->stt_list, next) {
1927                 if (dev->dst_port == dst_port)
1928                         return dev;
1929         }
1930         return NULL;
1931 }
1932
1933 static int stt_configure(struct net *net, struct net_device *dev,
1934                           __be16 dst_port)
1935 {
1936         struct stt_net *sn = net_generic(net, stt_net_id);
1937         struct stt_dev *stt = netdev_priv(dev);
1938         int err;
1939
1940         stt->net = net;
1941         stt->dev = dev;
1942
1943         stt->dst_port = dst_port;
1944
1945         if (find_dev(net, dst_port))
1946                 return -EBUSY;
1947
1948         err = __stt_change_mtu(dev, IP_MAX_MTU, false);
1949         if (err)
1950                 return err;
1951
1952         err = register_netdevice(dev);
1953         if (err)
1954                 return err;
1955
1956         list_add(&stt->next, &sn->stt_list);
1957         return 0;
1958 }
1959
1960 static int stt_newlink(struct net *net, struct net_device *dev,
1961                 struct nlattr *tb[], struct nlattr *data[])
1962 {
1963         __be16 dst_port = htons(STT_DST_PORT);
1964
1965         if (data[IFLA_STT_PORT])
1966                 dst_port = nla_get_be16(data[IFLA_STT_PORT]);
1967
1968         return stt_configure(net, dev, dst_port);
1969 }
1970
1971 static void stt_dellink(struct net_device *dev, struct list_head *head)
1972 {
1973         struct stt_dev *stt = netdev_priv(dev);
1974
1975         list_del(&stt->next);
1976         unregister_netdevice_queue(dev, head);
1977 }
1978
1979 static size_t stt_get_size(const struct net_device *dev)
1980 {
1981         return nla_total_size(sizeof(__be32));  /* IFLA_STT_PORT */
1982 }
1983
1984 static int stt_fill_info(struct sk_buff *skb, const struct net_device *dev)
1985 {
1986         struct stt_dev *stt = netdev_priv(dev);
1987
1988         if (nla_put_be16(skb, IFLA_STT_PORT, stt->dst_port))
1989                 goto nla_put_failure;
1990
1991         return 0;
1992
1993 nla_put_failure:
1994         return -EMSGSIZE;
1995 }
1996
1997 static struct rtnl_link_ops stt_link_ops __read_mostly = {
1998         .kind           = "stt",
1999         .maxtype        = IFLA_STT_MAX,
2000         .policy         = stt_policy,
2001         .priv_size      = sizeof(struct stt_dev),
2002         .setup          = stt_setup,
2003         .validate       = stt_validate,
2004         .newlink        = stt_newlink,
2005         .dellink        = stt_dellink,
2006         .get_size       = stt_get_size,
2007         .fill_info      = stt_fill_info,
2008 };
2009
2010 struct net_device *ovs_stt_dev_create_fb(struct net *net, const char *name,
2011                                       u8 name_assign_type, u16 dst_port)
2012 {
2013         struct nlattr *tb[IFLA_MAX + 1];
2014         struct net_device *dev;
2015         int err;
2016
2017         memset(tb, 0, sizeof(tb));
2018         dev = rtnl_create_link(net, (char *) name, name_assign_type,
2019                         &stt_link_ops, tb);
2020         if (IS_ERR(dev))
2021                 return dev;
2022
2023         err = stt_configure(net, dev, htons(dst_port));
2024         if (err) {
2025                 free_netdev(dev);
2026                 return ERR_PTR(err);
2027         }
2028         return dev;
2029 }
2030 EXPORT_SYMBOL_GPL(ovs_stt_dev_create_fb);
2031
2032 static int stt_init_net(struct net *net)
2033 {
2034         struct stt_net *sn = net_generic(net, stt_net_id);
2035
2036         INIT_LIST_HEAD(&sn->stt_list);
2037         INIT_LIST_HEAD(&sn->stt_up_list);
2038 #ifdef HAVE_NF_REGISTER_NET_HOOK
2039         sn->nf_hook_reg_done = false;
2040 #endif
2041         return 0;
2042 }
2043
2044 static void stt_exit_net(struct net *net)
2045 {
2046         struct stt_net *sn = net_generic(net, stt_net_id);
2047         struct stt_dev *stt, *next;
2048         struct net_device *dev, *aux;
2049         LIST_HEAD(list);
2050
2051 #ifdef HAVE_NF_REGISTER_NET_HOOK
2052         /* Ideally this should be done from stt_stop(), But on some kernels
2053          * nf-unreg operation needs RTNL-lock, which can cause deallock.
2054          * So it is done from here. */
2055         if (sn->nf_hook_reg_done)
2056                 nf_unregister_net_hook(net, &nf_hook_ops);
2057 #endif
2058
2059         rtnl_lock();
2060
2061         /* gather any stt devices that were moved into this ns */
2062         for_each_netdev_safe(net, dev, aux)
2063                 if (dev->rtnl_link_ops == &stt_link_ops)
2064                         unregister_netdevice_queue(dev, &list);
2065
2066         list_for_each_entry_safe(stt, next, &sn->stt_list, next) {
2067                 /* If stt->dev is in the same netns, it was already added
2068                  * to the stt by the previous loop.
2069                  */
2070                 if (!net_eq(dev_net(stt->dev), net))
2071                         unregister_netdevice_queue(stt->dev, &list);
2072         }
2073
2074         /* unregister the devices gathered above */
2075         unregister_netdevice_many(&list);
2076         rtnl_unlock();
2077 }
2078
2079 static struct pernet_operations stt_net_ops = {
2080         .init = stt_init_net,
2081         .exit = stt_exit_net,
2082         .id   = &stt_net_id,
2083         .size = sizeof(struct stt_net),
2084 };
2085
2086 int stt_init_module(void)
2087 {
2088         int rc;
2089
2090         rc = register_pernet_subsys(&stt_net_ops);
2091         if (rc)
2092                 goto out1;
2093
2094         rc = rtnl_link_register(&stt_link_ops);
2095         if (rc)
2096                 goto out2;
2097
2098         INIT_LIST_HEAD(&nf_hook_ops.list);
2099         pr_info("STT tunneling driver\n");
2100         return 0;
2101 out2:
2102         unregister_pernet_subsys(&stt_net_ops);
2103 out1:
2104         return rc;
2105 }
2106
2107 void stt_cleanup_module(void)
2108 {
2109 #ifndef HAVE_NF_REGISTER_NET_HOOK
2110         if (!list_empty(&nf_hook_ops.list))
2111                 nf_unregister_hook(&nf_hook_ops);
2112 #endif
2113         rtnl_link_unregister(&stt_link_ops);
2114         unregister_pernet_subsys(&stt_net_ops);
2115 }
2116 #endif